In [1]:
import pandas as pd
import numpy as np
# custom made functions (can be a pain to import sometimes)
import sys
sys.path.insert(0,os.path.abspath('../src/helper'))
from customPandas import *
#saving the model
import pickle
import random


## Loading the data

In [2]:
# https://stackoverflow.com/questions/39125532/file-does-not-exist-in-jupyter-notebook
dataPath = os.path.abspath('../data')
fileName = 'ramen-ratings.csv'
df = pd.read_csv(f'{dataPath}/{fileName}') 

In [17]:
df.head()

Unnamed: 0,Review #,Brand,Variety,Style,Country,Top Ten
0,2580,New Touch,T's Restaurant Tantanmen,Cup,Japan,
1,2579,Just Way,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...,Pack,Taiwan,
2,2578,Nissin,Cup Noodles Chicken Vegetable,Cup,USA,
3,2577,Wei Lih,GGE Ramen Snack Tomato Flavor,Pack,Taiwan,
4,2576,Ching's Secret,Singapore Curry,Pack,India,


## Separating the results

In [3]:
def cleanStars(value):
    if value == 'Unrated':
        return np.nan
    else:
        return value
df.Stars = df.Stars.apply(cleanStars)
# changing an incorrect dtype
df = df.astype({'Stars':'float64'}) 
df[df.Stars == 'Unrated'].Stars

Series([], Name: Stars, dtype: float64)

In [4]:
Y = df['Stars']
df = df.drop(['Stars'],axis=1)

## Learning more about custom pipeline

In [5]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

In [6]:
# Review # = 0, Top ten = 5
def dropUselessFeatures(col):
    return

In [7]:
cleanStyle = ColumnTransformer(transformers=[ 
('dropUselessFeatures', FunctionTransformer(dropUselessFeatures, validate=False),[1,2,3,4])], remainder='drop')

In [13]:
# you can view the effect of your custome pipeline using fit_transform
df2 = pd.DataFrame(cleanStyle.fit_transform(df))
totalPercentageNullData(df2)

## Word2Vec NN creation
[source](https://towardsdatascience.com/implementing-word2vec-in-pytorch-skip-gram-model-e6bae040d2fb)

In [33]:
def tokenize_corpus(corpus):
    tokens = [x.split() for x in corpus]
    return tokens

tokenized_corpus = tokenize_corpus(corpus)
tokenized_corpus[3:6]

[['GGE', 'Ramen', 'Snack', 'Tomato', 'Flavor'],
 ['Singapore', 'Curry'],
 ['Kimchi', 'song', 'Song', 'Ramen']]

In [65]:
vocabulary = []
for sentence in tokenized_corpus:
    for token in sentence:
        if token not in vocabulary:
            vocabulary.append(token)

word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
#to convert back from indices to words
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}

vocabulary_size = len(vocabulary)
vocabulary_size

1567

In [47]:
from itertools import islice
# https://stackoverflow.com/questions/7971618/return-first-n-keyvalue-pairs-from-dict
# I just want to see the first n items in a dict
def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))
n_items = take(10, idx2word.items())
n_items

[(0, "T's"),
 (1, 'Restaurant'),
 (2, 'Tantanmen'),
 (3, 'Noodles'),
 (4, 'Spicy'),
 (5, 'Hot'),
 (6, 'Sesame'),
 (7, 'Guan-miao'),
 (8, 'Cup'),
 (9, 'Chicken')]

In [66]:
window_size = 2
idx_pairs = []
# for each sentence
for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    # for each word, threated as center word
    for center_word_pos in range(len(indices)):
        # for each window position
        for w in range(-window_size, window_size + 1):
            context_word_pos = center_word_pos + w
            # make soure not jump out sentence
            if context_word_pos < 0 or context_word_pos >= len(indices) or center_word_pos == context_word_pos:
                continue
            context_word_idx = indices[context_word_pos]
            idx_pairs.append((indices[center_word_pos], context_word_idx))

idx_pairs = np.array(idx_pairs) # it will be useful to have this as numpy array
idx_pairs[:10] 

array([[0, 1],
       [0, 2],
       [1, 0],
       [1, 2],
       [2, 0],
       [2, 1],
       [3, 4],
       [3, 5],
       [4, 3],
       [4, 5]])

In [67]:
[idx2word[i] for item in idx_pairs for i in item][:10]

["T's",
 'Restaurant',
 "T's",
 'Tantanmen',
 'Restaurant',
 "T's",
 'Restaurant',
 'Tantanmen',
 'Tantanmen',
 "T's"]

## Neural Network
[source](https://gist.github.com/mbednarski/c24c683fa7d4f2148fc5fdfc51246b91)

In [91]:
import torch
from torch.autograd import Variable, profiler
import torch.functional as F
import torch.nn.functional as F
from tqdm import tqdm

In [87]:
def get_word_embedding(word):
    word_vec_one_hot = np.zeros(vocabulary_size)
    word_vec_one_hot[word2idx[word]] = 1
    return word_vec_one_hot

In [88]:
embedding_dims = 10
window_size = 2

In [89]:
def train_generator():
    for sentence in corpus:
        words = sentence.split()
        indices = [word2idx[w] for w in words]
        for i in range(len(indices)):
            # center word, context
            # i is center word index
            for w in range(-window_size, window_size + 1):
                context_idx = i + w
                if context_idx < 0 or context_idx >= len(indices) or i == context_idx:
                    continue
                center_vec_one_hot = np.zeros(vocabulary_size)
                center_vec_one_hot[indices[i]] = 1
                
                context_idx = indices[context_idx]
                yield center_vec_one_hot, context_idx

In [93]:
# Network definition
W1 = Variable(torch.randn(embedding_dims, vocabulary_size).float(), requires_grad=True)
W2 = Variable(torch.randn(vocabulary_size, embedding_dims).float(), requires_grad=True)


for epo in tqdm(range(501)):
    avg_loss = 0
    samples = 0
    for data, target in train_generator():
        x = Variable(torch.from_numpy(data)).float()
        y_true = Variable(torch.from_numpy(np.array([target])).long())
        samples += len(y_true)
        
        a1 = torch.matmul(W1, x)
        a2 = torch.matmul(W2, a1)

        log_softmax = F.log_softmax(a2, dim=0)

        network_pred_dist = F.softmax(log_softmax, dim=0)
        loss = F.nll_loss(log_softmax.view(1,-1), y_true)
        avg_loss += loss.item()
        loss.backward()

        W1.data -= 0.01 * W1.grad.data
        W2.data -= 0.01 * W2.grad.data

        W1.grad.data.zero_()
        W2.grad.data.zero_()
        
    # if epo % 50 == 0:
    #     print(avg_loss / samples)

100%|██████████| 501/501 [1:18:11<00:00,  9.36s/it]


In [114]:
W1.shape

torch.Size([10, 1567])

In [108]:
modelPath = os.path.abspath('../src/models')
modelPath

'/home/henri/Documents/Post Lighthouse-Lab work/KaggleML/ramen-ratings/src/models'

In [126]:
%matplotlib inline
from scikitplot.decomposition import plot_pca_2d_projection
from sklearn.decomposition import PCA

# pca = PCA(n_components=2)
# pca.fit(W1.data.numpy().T)
# proj = pca.transform(W1.data.numpy().T)
# ax = plot_pca_2d_projection(pca, W1.data.numpy().T, np.array(vocabulary), figsize=(12,12),feature_labels=None, text_fontsize=12) #feature_labels=vocabulary,
# ax.legend(None)
# for i, txt in enumerate(words):
#     ax.annotate(txt, (proj[i,0], proj[i,1]), size=16)

In [110]:
torch.save(W1, f'{modelPath}/W1.sav')
torch.save(W2, f'{modelPath}/W2.sav')

In [None]:
#load
#the_model = torch.load(PATH)

In [127]:
def get_word_vector_v(word):
    return W1[:, word2idx[word]].data.numpy()

def get_word_vector_u(word):
    return W2[word2idx[word],:].data.numpy()

In [None]:
# Poland to Warsaw is like Germany to ?

In [None]:
random.choice(df[i])

In [135]:
from scipy.spatial.distance import cosine
distances = [(v, cosine(yyy, 1 * get_word_vector_u(v) + 1 * get_word_vector_v(v))) for v in vocabulary]

In [137]:
distances[:5]

[("T's", 1.0273612160235643),
 ('Restaurant', 0.8840634450316429),
 ('Tantanmen', 0.9146503508090973),
 ('Noodles', 0.605650007724762),
 ('Spicy', 0.6084803342819214)]

In [153]:
randomIngredient = random.choice(vocabulary)
context_to_predict = get_word_vector_v(randomIngredient)
hidden = Variable(torch.from_numpy(context_to_predict)).float()
a = torch.matmul(W2, hidden)
probs = F.softmax(a, dim=0).data.numpy()
print(f'The random ingredient choosen is {randomIngredient}')
for context, prob in zip(words, probs):
    if prob >.05:
        print(f'{context}: {prob:.2f}')

The random ingredient choosen is Artificially
chicken: 0.47
spice: 0.08
mushroom: 0.05
delight: 0.13
