In [6]:
import random
import re
from io import StringIO

import nltk
import numpy as np
import pandas as pd
import requests
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from nltk.corpus import stopwords
from tqdm.notebook import tqdm_notebook as tqdm

# English stopword list from NLTK; read as a global by DataCleaner.remove_stopwords.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be installed already — confirm.
stop = stopwords.words('english')
# Enables the `progress_apply` method on pandas objects (used by DataCleaner).
tqdm.pandas()

# Classes
In this section, we build classes which are later used to load and clean data. We moved this to the top of the notebook to declutter the following section and to make it easier for you to follow our workflow.

## Data-downloading class
It has three methods: <br>
<ul>
    <li><b>__init__(url):</b> Class constructor, takes a Google Drive sharing URL;</li>
    <li><b>load_csv():</b> Downloads a .csv from the GDrive link and returns it as pandas dataframe;</li>
    <li><b>load_txt():</b> Downloads a .txt file from the GDrive link and returns it as a string.</li>
</ul>

In [7]:
class Downloader:
    """Download a shared Google Drive file given its sharing URL.

    The file id is read from the second-to-last path segment of the URL,
    i.e. URLs shaped like ``https://drive.google.com/file/d/<id>/view?...``.
    """

    # Seconds to wait on the server before giving up instead of hanging the kernel.
    REQUEST_TIMEOUT = 30

    def __init__(self, gdrive_url):
        """Store the sharing URL; nothing is downloaded until a load_* call."""
        self.path = gdrive_url

    def _download_text(self):
        """Fetch the file behind ``self.path`` and return its content decoded as UTF-8.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
        """
        file_id = self.path.split('/')[-2]
        dwn_url = 'https://drive.google.com/uc?export=download&id=' + file_id
        response = requests.get(dwn_url, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on HTTP errors rather than parsing an error page as data.
        response.raise_for_status()
        return response.content.decode('utf-8')

    def load_csv(self):
        """Download the file and parse it as CSV; returns a pandas DataFrame."""
        return pd.read_csv(StringIO(self._download_text()))

    def load_txt(self):
        """Download the file and return its content as a single string."""
        return self._download_text()

## Data-cleaner class
As suggested in McMahan (Natural Language Processing with PyTorch), we lower-case each sentence, remove punctuation and all other non-alphabetic characters, and strip the stopwords from the text. <br>
The strings are lower-cased so that a word at the beginning of a sentence is not treated as a different word from the same word elsewhere. <br>
Whitespace is used to tokenize the strings.<br>
This class has three methods and no constructor apart from the default one: <br>
<ul>
    <li><b>remove_nonalpha_chars():</b> Takes a pandas dataframe and the name of a column, and replaces every run of non-alphabetic characters in that column with a single space. Returns the cleaned dataframe;</li>
    <li><b>lower_casing():</b> Takes a pandas dataframe and the name of the column to operate on, and converts all uppercase characters to lowercase. Returns the lower-cased dataframe;</li>
    <li><b>remove_stopwords():</b> Takes a pandas dataframe and the name of a column, and removes all stopwords and single-character tokens from that column. Returns the cleaned dataframe.</li>
</ul>

In [4]:
class DataCleaner:
    """Text-cleaning steps for a string column of a pandas DataFrame.

    Every method works on a deep copy, so the frame passed in is never
    modified. The methods rely on ``progress_apply``, i.e. ``tqdm.pandas()``
    must have been called beforehand.
    """

    # Any run of non-letters (whitespace included) collapses to one space.
    _NON_ALPHA = re.compile(r"\s*[^A-Za-z]+\s*")

    def _map_column(self, df, column, func):
        """Return a deep copy of ``df`` with ``func`` applied to each value of ``column``."""
        df_out = df.copy(deep=True)
        df_out[column] = df_out[column].progress_apply(func)
        return df_out

    def remove_nonalpha_chars(self, df, column):
        """Replace every run of non-alphabetic characters in ``column`` with a single space."""
        return self._map_column(df, column, lambda text: self._NON_ALPHA.sub(" ", text))

    def lower_casing(self, df, column):
        """Lower-case every string in ``column``."""
        return self._map_column(df, column, lambda text: text.lower())

    def remove_stopwords(self, df, column):
        """Drop stopwords (global ``stop``) and one-character tokens from ``column``.

        The remaining whitespace-separated tokens are re-joined with single spaces.
        """
        # Build the set once: O(1) membership instead of scanning the list per token.
        stopword_set = set(stop)

        def keep_content_words(text):
            return ' '.join(
                word for word in text.split()
                if word not in stopword_set and len(word) != 1
            )

        return self._map_column(df, column, keep_content_words)

# CBOW_Vectorizers

In [8]:
class VectorizerCBOW:
    """Builds (context, target) training pairs with a fixed window of two words per side."""

    def vectorize(self, context_size, corpus):
        """Return a list of ``([w-2, w-1, w+1, w+2], w)`` pairs for ``corpus``.

        ``corpus`` is tokenized on whitespace. ``context_size`` is accepted but
        not used: the window is always two tokens to the left and two to the right.
        """
        tokens = corpus.split()
        pairs = []
        for centre in tqdm(range(2, len(tokens) - 2)):
            left = tokens[centre - 2:centre]
            right = tokens[centre + 1:centre + 3]
            pairs.append((left + right, tokens[centre]))
        return pairs

In [9]:
class VectorizerCBOWContextWindow:
    """Builds (context, target) training pairs with a configurable window width."""

    def __init__(self, context_width):
        """``context_width`` is the number of context words taken on EACH side of the target."""
        self.width = context_width

    def vectorize(self, context_size, corpus):
        """Return a list of ``(context_words, target_word)`` pairs for ``corpus``.

        ``corpus`` is tokenized on whitespace. The window comes from
        ``self.width``; ``context_size`` is accepted but not used.
        """
        tokens = corpus.split()
        width = self.width
        data = []
        for i in tqdm(range(width, len(tokens) - width)):
            # Drop the centre by position. Removing it by value (list.remove)
            # deletes the first equal word, which is the wrong one whenever the
            # target also occurs in the left half of the window.
            context = tokens[i - width:i] + tokens[i + 1:i + width + 1]
            data.append((context, tokens[i]))
        return data

# Word_embedder

In [11]:
class CBOW(nn.Module):
    """Continuous bag-of-words model: context word indices -> log-probabilities over the vocabulary.

    Architecture: embedding lookup -> flatten ->
    Linear(2 * context_size * embedding_dim, 128) -> ReLU ->
    Linear(128, vocab_size) -> log-softmax.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        """
        Args:
            vocab_size: number of rows of the embedding table / output classes.
            embedding_dim: size of each word vector.
            context_size: number of context words on EACH side of the target;
                the first linear layer therefore expects
                2 * context_size indices per sample.
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * 2 * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities of shape ``(n_samples, vocab_size)``.

        ``inputs`` is a long tensor of word indices. A single sample (any shape
        holding 2 * context_size indices) yields one row, exactly as before;
        a batch of shape ``(B, 2 * context_size)`` yields ``B`` rows.
        """
        # Flatten to one row per sample; the row width is fixed by linear1.
        flat = self.embeddings(inputs).view(-1, self.linear1.in_features)
        hidden = F.relu(self.linear1(flat))
        return F.log_softmax(self.linear2(hidden), dim=1)

    def predict(self, input, vocab=None):
        """Print the five most likely target words for the context words ``input``.

        Args:
            input: iterable of context words (2 * context_size of them).
            vocab: word -> index mapping; when omitted, the notebook-level
                ``word_to_ix`` global is used, as in the original version.

        One line is printed per candidate, most likely first, in the form
        ``[(word, index, log_probability)]``. Returns None.
        """
        if vocab is None:
            vocab = word_to_ix
        context_idxs = torch.tensor([vocab[w] for w in input], dtype=torch.long)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            log_probs = self(context_idxs)
        sorted_vals, sorted_ixs = log_probs.sort(descending=True)
        top_vals = sorted_vals[0][:5]
        top_ixs = sorted_ixs[0][:5]
        # Invert the vocabulary once instead of scanning it for every printed row.
        words_by_ix = {}
        for word, ix in vocab.items():
            words_by_ix.setdefault(ix, []).append(word)
        for log_prob, ix in zip(top_vals, top_ixs):
            ix_int = int(ix)
            print([(word, ix_int, log_prob) for word in words_by_ix.get(ix_int, [])])


# Dataset class pytorch

# Dataloader class pytorch

# Loading data
Here the two datasets get downloaded directly from Google Drive.

In [12]:
### URLS
# Google Drive sharing links of the two datasets (review CSV and sci-fi text).
# Downloader takes the file id from the second-to-last path segment of these URLs.
orig_url_ta = 'https://drive.google.com/file/d/1ihP1HZ8YHVGGIEp1RHxXdt3PPIi12xvL/view?usp=sharing'
orig_url_scifi = "https://drive.google.com/file/d/10ehW4jZND3QA29v9aNboYUett5-swuNe/view?usp=sharing"

In [14]:
### Downloaders (one per Google Drive file; constructing them only stores the URL)
TravAdvDataSetLoader = Downloader(orig_url_ta)
ScifiLoader = Downloader(orig_url_scifi)

In [15]:
### CSV and txts
# Review dataset, parsed from the downloaded CSV into a pandas DataFrame.
df_ta = TravAdvDataSetLoader.load_csv()
# Sci-fi text, downloaded as one plain string.
scifi_txt = ScifiLoader.load_txt()

# Exploratory Data Analysis

In [17]:
# Number of rows in the review DataFrame.
len(df_ta)

20491

In [26]:
# Rough split of the raw story on full stops, for inspection only. It is kept
# under its own name because the result is a list, not a dataframe, and the
# name `scifi_df` is used for the actual dataframe of the story.
scifi_sentences = scifi_txt.split(".")

The sci-fi story is wrapped in a single-row dataframe so that it can be cleaned with the same `DataCleaner` methods as the reviews.

In [27]:
# One column ("Text") with one row holding the complete story.
scifi_dict = dict(Text=[scifi_txt])
scifi_df = pd.DataFrame(data=scifi_dict)

**Observations**:


1.   Some reviews include the rating (e.g. 4*). This should be removed.
2.   The last line has a typo (and probably many other lines do too), which adds noise. Correcting all errors, however, is not realistic.

Now, we look for all characters used in the reviews to get an idea of how we need to preprocess the data. We can see that there are no foreign language characters in the data but a couple of symbols, special characters and emojis.

# Data Preprocessing
## Data Cleaning
For the data preprocessing, we first create a class that helps us to clean the data (following the OOP approach).

**Note**: We only perform operations on the complete data set (training + test set) that do not lead to information leakage. Removing certain characters from the test set is a valid operation that also occurs in real-world settings: data is usually preprocessed before predictions are made.
After cleaning the data in the step below, we compare the results of the second review to confirm that the cleaning was successful.<br>
We realize that most comments contain typos and that some typos like "did n't" result in single characters in the corpus. Given that we cannot correct every typo, we accept this noise in our data. 

In [22]:
# Run the three cleaning stages on the review column, each stage feeding the next.
DtCleaner = DataCleaner()

df_ta_cl = (
    df_ta
    .pipe(DtCleaner.remove_nonalpha_chars, 'Review')
    .pipe(DtCleaner.lower_casing, 'Review')
    .pipe(DtCleaner.remove_stopwords, 'Review')
)

HBox(children=(FloatProgress(value=0.0, max=20491.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20491.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20491.0), HTML(value='')))




In [23]:
# Same three cleaning stages as for the reviews, applied to the story text.
df_scifi_cl = (
    scifi_df
    .pipe(DtCleaner.remove_nonalpha_chars, "Text")
    .pipe(DtCleaner.lower_casing, "Text")
    .pipe(DtCleaner.remove_stopwords, "Text")
)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




## Training/Test Data Set Preparation

In [32]:
# splitting the data into a training and test set
# using set seed to allow replication
np.random.seed(123)
# Boolean mask over the review rows: True (uniform draw < 0.7) -> training set,
# False -> test set.
m = np.random.rand(len(df_ta_cl)) < 0.7
# Same kind of mask for the sci-fi frame.
# NOTE(review): `n` is not used within this cell — confirm it is needed further
# down. Keep this draw after `m`, otherwise the seeded values of `m` change.
n = np.random.rand(len(df_scifi_cl)) < 0.7
df_ta_train = df_ta_cl[m]
df_ta_test = df_ta_cl[~m]

In [33]:
# just checking if split was correct: fraction of all review rows that ended up
# in the training set (expected to be close to the 0.7 threshold of the mask)
len(df_ta_train) / len(df_ta)

0.7005026597042604

In [34]:
# Join the cleaned reviews with a plain space. A separator such as ', ' would
# glue punctuation back onto the last token of every review ("room," != "room")
# after the cleaning step has removed all non-alphabetic characters.
corpus_ta_train = df_ta_train['Review'].str.cat(sep=' ')
corpus_ta_test = df_ta_test['Review'].str.cat(sep=' ')

# Keep a space between the two halves so that the last training token and the
# first test token are not fused into a single word.
corpus_ta = corpus_ta_train + ' ' + corpus_ta_test
len(corpus_ta.split())

2006076

In [35]:
# The sci-fi frame is built with a single row, so its first "Text" entry is the
# entire cleaned story as one string.
corpus_scifi = df_scifi_cl["Text"].iloc[0]