In [1]:
import torch
import pandas as pd
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
import json
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import texthero as hero
from texthero import preprocessing
import plotly.express as px
from texthero import stopwords

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
text_data = pd.read_json(r"/home/shah/Desktop/FB-Marketplace-Recommendation-Ranking-System/data/products_table.json")

In [None]:
text_data

In [3]:
text_data['product_description'] = hero.clean(text_data['product_description'])

In [4]:
custom_pipeline = [preprocessing.fillna,
                   #preprocessing.lowercase,
                   preprocessing.remove_whitespace,
                   preprocessing.remove_diacritics
                   #preprocessing.remove_brackets
                  ]
text_data['product_description'] = hero.clean(text_data['product_description'], custom_pipeline)
text_data['product_description'] = [n.replace('{','') for n in text_data['product_description']]

In [5]:
new_data = text_data.drop(['product_name', 'price', 'location',
       'page_id', 'create_time', 'category'], axis=1, inplace=False)

In [6]:
new_data

Unnamed: 0,id,product_description
0,ac2140ae-f0d5-4fe7-ac08-df0f109fd734,n
1,243809c0-9cfc-4486-ad12-3b7a16605ba9,mirror wall art posted nisha dining living roo...
2,1c58d3f9-8b93-47ea-9415-204fcc2a22e6,morphy richard model stainless steel tier stac...
3,860673f1-57f6-47ba-8d2f-13f9e05b8f9a,collection drive PS ono
4,59948726-29be-4b35-ade5-bb2fd7331856,great reclaimed army ammunition box used coffe...
...,...,...
8086,564e3411-768d-4250-a624-b119d696f103,playstation vr v2 bundle355cash collection del...
8087,4bace77b-1c58-4ae5-b72f-10aead62f817,n
8088,2b0a652b-46a2-4297-b619-5efeeb222787,pick PS250comes two pistols stocks gun games p...
8089,719fd40a-870e-4144-b324-55dff2e66fb4,bought christmas currys retailing PS used alon...


In [None]:
######################################################################
def remove_n_a_rows(df, column: str):
    # Swap N/A for the pandas nan, so we can drop them
    temp_df = df[column].replace('N/A', np.nan)
    temp_df = temp_df.dropna()
    # Create a new df with only the records without the nans
    clean_df = pd.merge(temp_df, df,
                            left_index=True, right_index=True)
    # The merge creates a duplicate column. Remove it.
    clean_df.drop(column + '_x', inplace=True, axis=1)
    # Rename the remaining category column
    clean_df.rename(columns={column + '_y': column}, inplace=True)
    # Commit the cleansed data to the dataframe
    df = clean_df
    return df

new_text = remove_n_a_rows(new_text, 'product_description')
text_data = text_data.dropna()
text_data = text_data.reset_index(drop=True)
########################################################################

In [7]:
tw = hero.visualization.top_words(new_data['product_description']).head(10)

fig = px.bar(tw)
fig.show()
tw.head()

PS            4634
condition     2744
new           2549
collection    1814
x             1777
Name: product_description, dtype: int64

In [8]:
new_data['TFIDF'] = (hero.tfidf(new_data['product_description'], max_features=3000))

In [9]:
new_data

Unnamed: 0,id,product_description,TFIDF
0,ac2140ae-f0d5-4fe7-ac08-df0f109fd734,n,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,243809c0-9cfc-4486-ad12-3b7a16605ba9,mirror wall art posted nisha dining living roo...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1c58d3f9-8b93-47ea-9415-204fcc2a22e6,morphy richard model stainless steel tier stac...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,860673f1-57f6-47ba-8d2f-13f9e05b8f9a,collection drive PS ono,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,59948726-29be-4b35-ade5-bb2fd7331856,great reclaimed army ammunition box used coffe...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...
8086,564e3411-768d-4250-a624-b119d696f103,playstation vr v2 bundle355cash collection del...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8087,4bace77b-1c58-4ae5-b72f-10aead62f817,n,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8088,2b0a652b-46a2-4297-b619-5efeeb222787,pick PS250comes two pistols stocks gun games p...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8089,719fd40a-870e-4144-b324-55dff2e66fb4,bought christmas currys retailing PS used alon...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [13]:
from gensim.models.word2vec import Word2Vec

model = Word2Vec(vector_size=64, min_count=1, epochs = 20)

card2vec = [model.infer_vector((new_data['product_description'][i].split(' ')))
            for i in range(0, len(new_data['product_description']))]

#create a list
w2v = np.array(card2vec).tolist()

#set list to dataframe column
new_data['product_description'] = w2v

TypeError: __init__() got an unexpected keyword argument 'vector_size'

In [10]:
def make_batch(sentences):
    input_batch = []
    target_batch = []

    for sen in sentences:
        word = sen.split() 
        input = [word2id[n] for n in word[:-1]]
        target = word2id[word[-1]] 

        input_batch.append(input)
        target_batch.append(target)

    print(input_batch)
    print(target_batch)

In [11]:
word_list = [(new_data['product_description'][i].split(' '))
            for i in range(0, len(new_data['product_description']))]
word_list = list(word_list)
word2id = {w: i for i, w in enumerate(word_list)}
id2word = {i: w for i, w in enumerate(word_list)}
n_class = len(word2id)

n_class

TypeError: unhashable type: 'list'

In [None]:
word2id

In [None]:
#building the model
class NNLM(nn.Module):
   def __init__(self):
       super(NNLM, self).__init__()
       self.embeddings = nn.Embedding(n_class, m) #embedding layer or look up table

       self.hidden1 = nn.Linear(n_step * m, n_hidden, bias=False)
       self.ones = nn.Parameter(torch.ones(n_hidden))
      
       self.hidden2 = nn.Linear(n_hidden, n_class, bias=False)
       self.hidden3 = nn.Linear(n_step * m, n_class, bias=False) #final layer
      
       self.bias = nn.Parameter(torch.ones(n_class))

   def forward(self, X):
       X = self.embeddings(X) # embeddings
       X = X.view(-1, n_step * m) # first layer
       tanh = torch.tanh(self.d + self.hidden1(X)) # tanh layer
       output = self.b + self.hidden3(X) + self.hidden2(tanh) # summing up all the layers with bias
       return output

In [None]:
n_step = 2
n_hidden = 2
m = 2

In [None]:
model = NNLM()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
input_batch, target_batch = make_batch(text_data['product_description'])
input_batch = torch.LongTensor(input_batch)
target_batch = torch.LongTensor(target_batch)

In [None]:
for epoch in range(5000):
    optimizer.zero_grad()
    embeddings, output = model(input_batch)

    # output : [batch_size, n_class], target_batch : [batch_size]
    loss = criterion(output, target_batch)
    if (epoch + 1) % 1000 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

    loss.backward()
    optimizer.step()

# Predict
predict = model(input_batch).data.max(1, keepdim=True)[1]

# Test
print([sen.split()[:2] for sen in new_text['product_description']], '->', [id2word[n.item()] for n in predict.squeeze()])

In [None]:
#train the model using word2vec