### Import Libraries

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical 
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from keras.optimizers import Adam

### Dataset information
* Product Name
* Rent Price
* Product Photo
* Renters
* Description

In [4]:
# Load the cleaned product CSV; the path is relative to the notebook's working directory.
dataset = pd.read_csv('../data/fathanah/cleaned_dataset.csv')
# Preview the first rows to sanity-check the columns.
dataset.head()

Unnamed: 0,product_name,rent_price,url_photo,link,renters,description,hiking,cosplay
0,Aksesoris Fashion Topeng Cosplay Pesta Hallowe...,3650.0,https://images.tokopedia.net/img/cache/200-squ...,https://ta.tokopedia.com/promo/v1/clicks/8a-xg...,LenkaWeddingShop,"𝐑𝐄𝐀𝐃𝐘 𝐒𝐓𝐎𝐊, 𝐋𝐀𝐍𝐆𝐒𝐔𝐍𝐆 𝐊𝐈𝐑𝐈𝐌 !! 𝐏𝐑𝐈𝐕𝐀𝐒𝐈 𝐃𝐈𝐉𝐀𝐌𝐈𝐍...",0,1
1,Topeng Pesta Unisex Cosplay Aksesoris Party Pu...,3650.0,https://images.tokopedia.net/img/cache/200-squ...,https://ta.tokopedia.com/promo/v1/clicks/8a-xg...,LenkaWeddingShop,"𝐑𝐄𝐀𝐃𝐘 𝐒𝐓𝐎𝐊, 𝐋𝐀𝐍𝐆𝐒𝐔𝐍𝐆 𝐊𝐈𝐑𝐈𝐌 !! 𝐏𝐑𝐈𝐕𝐀𝐒𝐈 𝐃𝐈𝐉𝐀𝐌𝐈𝐍...",0,1
2,Aksesoris Pesta Unisex Topeng Cosplay Hallowee...,3200.0,https://images.tokopedia.net/img/cache/200-squ...,https://ta.tokopedia.com/promo/v1/clicks/8a-xg...,LenkaWeddingShop,"𝐑𝐄𝐀𝐃𝐘 𝐒𝐓𝐎𝐊, 𝐋𝐀𝐍𝐆𝐒𝐔𝐍𝐆 𝐊𝐈𝐑𝐈𝐌 !! 𝐏𝐑𝐈𝐕𝐀𝐒𝐈 𝐃𝐈𝐉𝐀𝐌𝐈𝐍...",0,1
3,2pcs Telinga Elf Kuping Palsu Elf Ears Aksesor...,1900.0,https://images.tokopedia.net/img/cache/200-squ...,https://ta.tokopedia.com/promo/v1/clicks/8a-xg...,ITOKOTOO,"😄Selamat datang, Anda dapat menghubungi kami j...",0,1
4,Ekor Kucing Cosplay Cat Cosplay Kawaii Cute Ha...,3500.0,https://images.tokopedia.net/img/cache/200-squ...,https://ta.tokopedia.com/promo/v1/clicks/8a-xg...,Mireading,"HI, welcome to our store - We provide various ...",0,1


In [3]:
# dataset.shape is (rows, columns): report how many products and how many fields were loaded.
print("Number of records: ", dataset.shape[0])
print("Number of fields: ", dataset.shape[1])

Number of records:  5377
Number of fields:  8


### Display product names and preprocess them

In [4]:
# Inspect the product names as loaded, before any normalisation is applied.
dataset['product_name']

0       Aksesoris Fashion Topeng Cosplay Pesta Hallowe...
1       Topeng Pesta Unisex Cosplay Aksesoris Party Pu...
2       Aksesoris Pesta Unisex Topeng Cosplay Hallowee...
3       2pcs Telinga Elf Kuping Palsu Elf Ears Aksesor...
4       Ekor Kucing Cosplay Cat Cosplay Kawaii Cute Ha...
                              ...                        
5372    Dunlopillo Hooded Thermal Blanket ( Selimut To...
5373    TERMURAH KING RABBIT THERMAL BLANKET SELIMUT F...
5374    NEW PRODUK KING RABBIT THERMAL BLANKET FLEECE ...
5375    dunlopillo thermal & travel blanket black seli...
5376    TaffSPORT Selimut Darurat Emergency Blanket Th...
Name: product_name, Length: 5377, dtype: object

Removing unwanted characters from the product names (non-breaking spaces and hair spaces become regular spaces) and converting the names to lowercase

In [5]:
# Normalise product names with vectorised string methods:
#   * '\xa0' (non-breaking space) and '\u200a' (hair space) become ordinary spaces,
#   * everything is lower-cased.
# regex=False makes the replacements literal, matching str.replace semantics.
# Using the .str accessor instead of apply(lambda ...) keeps a missing name as NaN
# rather than raising AttributeError on a float inside the lambda.
dataset['product_name'] = (
    dataset['product_name']
    .str.replace('\xa0', ' ', regex=False)
    .str.replace('\u200a', ' ', regex=False)
    .str.lower()
)
# The normalised names are the training corpus for the tokenizer below.
corpus = dataset['product_name']

### Callbacks

In [6]:
class myCallback(tf.keras.callbacks.Callback):
    """Stop training as soon as the training accuracy exceeds a threshold.

    Args:
        threshold: Accuracy value that must be exceeded (strictly) for
            training to stop. Defaults to 0.91.
    """

    def __init__(self, threshold=0.91):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's 'accuracy' entry and request a stop when it is high enough.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Mapping of metric names to values for the epoch; may be None.
        """
        # `logs` may be None, and 'accuracy' is absent when the model was
        # compiled without that metric; skip instead of comparing None to a float.
        accuracy = (logs or {}).get('accuracy')
        if accuracy is not None and accuracy > self.threshold:
            print("\nDesired value is already achieved!")
            self.model.stop_training = True

### Tokenization

In [7]:
# Build the word index from the normalised product names.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dataset['product_name'])
# One more than the number of distinct words; used below as the Embedding input
# size and as the number of output classes.
# NOTE(review): the +1 presumably accounts for index 0, which Keras does not
# assign to any word — confirm against the Tokenizer documentation.
total_words = len (tokenizer.word_index)+1

# Expand every line of the corpus into its growing token prefixes
def n_gram_seqs(corpus, tokenizer):
    """Return all token prefixes of length >= 2 for every line in ``corpus``.

    Each line is converted to a list of token ids with
    ``tokenizer.texts_to_sequences``; a line with tokens ``[a, b, c]``
    contributes ``[a, b]`` and ``[a, b, c]``. Lines with fewer than two
    tokens contribute nothing.

    Args:
        corpus: Iterable of text lines.
        tokenizer: Object exposing ``texts_to_sequences(list_of_texts)``.

    Returns:
        A flat list of token-id lists, in corpus order.
    """
    prefixes = []
    for text in corpus:
        tokens = tokenizer.texts_to_sequences([text])[0]
        # Prefix end positions run from 2 up to the full length of the line.
        prefixes.extend(tokens[:end] for end in range(2, len(tokens) + 1))
    return prefixes

# Generate every n-gram prefix of the corpus, then record the longest one so
# all sequences can later be padded to a common length.
input_sequences = n_gram_seqs(corpus, tokenizer)
max_sequence_len = max(map(len, input_sequences))


### Padding

In [8]:
# Thin wrapper around Keras' pad_sequences
def pad_seqs(input_sequences, maxlen):
    """Pad ``input_sequences`` to a common length of ``maxlen``.

    Delegates to ``pad_sequences`` with only ``maxlen`` set, so every other
    padding option keeps its library default.

    Args:
        input_sequences: List of token-id lists.
        maxlen: Target length for every sequence.

    Returns:
        Whatever ``pad_sequences`` returns for these arguments.
    """
    return pad_sequences(input_sequences, maxlen=maxlen)

# Pad every n-gram prefix to max_sequence_len; from here on `input_sequences`
# is the padded result (its .shape is printed below), no longer a list of lists.
input_sequences = pad_seqs(input_sequences, max_sequence_len)
print(f"padded corpus has shape: {input_sequences.shape}")

padded corpus has shape: (47357, 39)


In [9]:
# Split padded sequences into model inputs and one-hot targets
def features_and_labels(input_sequences, total_words):
    """Separate each padded sequence into its input part and its target word.

    Args:
        input_sequences: 2-D array of padded token-id sequences.
        total_words: Number of classes for the one-hot encoding.

    Returns:
        Tuple ``(features, one_hot_labels)``: ``features`` holds every column
        except the last; ``one_hot_labels`` is the last column passed through
        ``to_categorical`` with ``num_classes=total_words``.
    """
    inputs, targets = input_sequences[:, :-1], input_sequences[:, -1]
    return inputs, to_categorical(targets, num_classes=total_words)
# Model inputs are all tokens but the last of each padded sequence; the targets
# are the last token, one-hot encoded over total_words classes.
features, labels = features_and_labels(input_sequences, total_words)

### Model

In [10]:
def create_model(total_words, max_sequence_len):
    """Build, compile and train the next-word prediction network.

    Architecture: ``Embedding(total_words, 100)`` ->
    ``Bidirectional(LSTM(128))`` -> ``Dense(total_words, softmax)``.

    Note:
        The training data is not a parameter: this function reads the
        module-level ``features`` and ``labels``, so both must exist before
        it is called. Training runs for up to 30 epochs with ``myCallback``
        attached.

    Args:
        total_words: Vocabulary size, used for the embedding input size and
            the width of the softmax output.
        max_sequence_len: Length of the padded sequences; the model input
            length is one less than this.

    Returns:
        Tuple ``(model, history)``: the fitted model and the object returned
        by ``model.fit``.
    """
    stack = [
        Embedding(total_words, 100, input_length=max_sequence_len - 1),
        Bidirectional(LSTM(128)),
        Dense(total_words, activation="softmax"),
    ]
    model = Sequential(stack)

    model.compile(
        optimizer='adam',
        loss="categorical_crossentropy",
        metrics=['accuracy'],
    )

    early_stop = myCallback()
    history = model.fit(
        features,
        labels,
        epochs=30,
        verbose=1,
        callbacks=[early_stop],
    )
    return model, history


In [None]:
# create_model builds, compiles AND fits the network, returning (model, history).
# Unpack both values: binding the whole tuple to `model` would break the later
# model.save / model.predict calls, and `history` is what plot_metrics consumes.
# No second model.fit here — the model has already been trained inside create_model.
model, history = create_model(total_words, max_sequence_len)

In [None]:
import time
# Name the export after the current Unix timestamp (whole seconds), in the current directory.
saved_model_path = "./{}.h5".format(int(time.time()))

# Persist the trained model to the .h5 path built above.
model.save(saved_model_path)

In [None]:
# Run tensorflowjs_converter on the saved Keras file, writing its output into the current directory.
!tensorflowjs_converter --input_format=keras {saved_model_path} ./

In [None]:
# Bundle every .bin file in the current directory together with model.json into submission.zip.
!zip submission.zip *.bin model.json

### Predicting Recommendation Arrays

In [None]:
# CONTENT_RECOMMENDATION

# Run each text through the model and collect the prediction vectors
def get_embeddings(model, tokenizer, texts, max_sequence_len):
    """Return ``model.predict`` output for every text in ``texts``.

    The texts are tokenised, padded to ``max_sequence_len - 1`` (the model's
    input length) and passed to ``model.predict`` in one call.

    NOTE(review): with the network built by ``create_model`` in this
    notebook, the final layer is a softmax over the vocabulary, so these
    vectors are next-word probability distributions rather than
    Embedding-layer weights.

    Args:
        model: Object exposing ``predict``.
        tokenizer: Object exposing ``texts_to_sequences``.
        texts: Iterable of strings to vectorise.
        max_sequence_len: Padded sequence length used during training.

    Returns:
        The array returned by ``model.predict``, one row per text.
    """
    token_ids = tokenizer.texts_to_sequences(texts)
    model_input = pad_sequences(token_ids, maxlen=max_sequence_len - 1)
    return model.predict(model_input)

# Vectorise every product name once, up front. The catalogue frame loaded in
# this notebook is `dataset`; no variable named `data` is ever defined.
product_embeddings = get_embeddings(model, tokenizer, dataset['product_name'], max_sequence_len)

# Rank the catalogue by cosine similarity to the user's query
def recommend_products(user_input, model, tokenizer, product_embeddings, data, max_sequence_len):
    """Order all products by similarity to ``user_input``, most similar first.

    The query is tokenised, padded to ``max_sequence_len - 1`` and passed
    through ``model.predict``; the resulting vector is compared with
    ``product_embeddings`` using cosine similarity.

    Args:
        user_input: Free-text query string.
        model: Object exposing ``predict``.
        tokenizer: Object exposing ``texts_to_sequences``.
        product_embeddings: Precomputed vectors, one row per row of ``data``.
        data: DataFrame with the columns ``product_name``, ``rent_price``,
            ``url_photo`` and ``link``.
        max_sequence_len: Padded sequence length used during training.

    Returns:
        DataFrame containing every row of ``data`` (those four columns only),
        reordered from highest to lowest similarity.
    """
    query_tokens = tokenizer.texts_to_sequences([user_input])
    query_vector = model.predict(pad_sequences(query_tokens, maxlen=max_sequence_len - 1))

    scores = cosine_similarity(query_vector, product_embeddings)
    ascending = scores.argsort()[0]
    # Reverse the ascending order so the best match comes first.
    best_first = ascending[::-1]

    wanted_columns = ['product_name', 'rent_price', 'url_photo', 'link']
    return data.iloc[best_first][wanted_columns]

In [None]:
# Ask the user for a keyword and rank the catalogue against it.
tulis = input("Masukkan Keyword: ")
# The catalogue frame loaded in this notebook is `dataset`; `data` is never defined.
recommendations = recommend_products(tulis, model, tokenizer, product_embeddings, dataset, max_sequence_len)
print(recommendations)

### Plotting model accuracy and loss

In [12]:
def plot_metrics(history):
    """Plot training accuracy and training loss side by side.

    Args:
        history: Object whose ``history`` attribute is a mapping containing
            the per-epoch lists ``'accuracy'`` and ``'loss'``.
    """
    metrics = history.history
    accuracy_values = metrics['accuracy']
    loss_values = metrics['loss']
    # Epochs are numbered from 1 on the x-axis.
    epochs = range(1, len(accuracy_values) + 1)

    # (series, title/legend label, y-axis label) for each panel, left to right.
    panels = (
        (accuracy_values, 'Training accuracy', 'Accuracy'),
        (loss_values, 'Training loss', 'Loss'),
    )

    plt.figure(figsize=(14, 5))
    for position, (values, caption, y_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, values, 'bo-', label=caption)
        plt.title(caption)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()

    plt.show()

# `history` must be the object returned by model.fit (create_model returns it as
# the second element of its result) and must be bound at module level before this cell runs.
plot_metrics(history)