In [2]:
# Import dependecies
import pandas as pd
import spacy
import torch 
import torchtext  ## for downloading the data from pytorch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data.dataloader import default_collate
from torch.utils.data.dataset import random_split
from torch.utils.data.dataset import ConcatDataset
from torch.utils.data.dataset import Subset

# Process
## Get the data
## Preprocess the data
## FastText embedding
## Create a Dataloader
## Create a model

In [None]:
# Download the data from pytorch
# torchtext.dataset.AmazonReviewFull(root='./data/', split=("train", "test"))

#### for demo purposes use a subset of the test set


In [3]:
df = pd.read_csv('.data/amazon_review_full_csv/train.csv', nrows= 4000, header=None)

In [4]:
df

Unnamed: 0,0,1,2
0,3,more like funchuck,Gave this to my dad for a gag gift after direc...
1,5,Inspiring,I hope a lot of people hear this cd. We need m...
2,5,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
3,4,Chrono Cross OST,The music of Yasunori Misuda is without questi...
4,5,Too good to be true,Probably the greatest soundtrack in history! U...
...,...,...,...
3995,2,Horrible.,I have had the first DVD since I bought my fir...
3996,2,Cheap!,I haven't been able to watch this much because...
3997,5,More Excellent Instruction from the Hoopnotica...,"As with their first dvd, the instruction in th..."
3998,4,Great book.,"As a former teenage girl, I decided this book ..."


In [5]:
df[0].unique()

array([3, 5, 4, 1, 2], dtype=int64)

In [6]:
df.rename({0: 'ratings', 1: 'review_title',2:"review"}, axis=1, inplace=True)

In [7]:
df.head()

Unnamed: 0,ratings,review_title,review
0,3,more like funchuck,Gave this to my dad for a gag gift after direc...
1,5,Inspiring,I hope a lot of people hear this cd. We need m...
2,5,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
3,4,Chrono Cross OST,The music of Yasunori Misuda is without questi...
4,5,Too good to be true,Probably the greatest soundtrack in history! U...


In [8]:
df["reviews"] = df["review_title"] + " " + df["review"]

In [9]:
df.head()

Unnamed: 0,ratings,review_title,review,reviews
0,3,more like funchuck,Gave this to my dad for a gag gift after direc...,more like funchuck Gave this to my dad for a g...
1,5,Inspiring,I hope a lot of people hear this cd. We need m...,Inspiring I hope a lot of people hear this cd....
2,5,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...,The best soundtrack ever to anything. I'm read...
3,4,Chrono Cross OST,The music of Yasunori Misuda is without questi...,Chrono Cross OST The music of Yasunori Misuda ...
4,5,Too good to be true,Probably the greatest soundtrack in history! U...,Too good to be true Probably the greatest soun...


In [10]:
df.drop(["review_title", "review"], axis=1, inplace=True)

In [11]:
df.ratings = df.ratings.apply(lambda x: int(x) - 1)

In [12]:
df.ratings.unique()

array([2, 4, 3, 0, 1], dtype=int64)

In [13]:
type(df.reviews.iloc[5])

str

In [14]:
nlp = spacy.load('en_core_web_sm')


# preprocessing


In [15]:
def preprocessing(sentences):
    """ perform : tokenization & lemmatization, removes stopwords & punctuations, lower cases"""

    doc = nlp(sentences)
    # tokenization, we return the tokens that aren't stopwords or punctuations
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return tokens

In [16]:
txt = df.reviews.iloc[0]
txt

'more like funchuck Gave this to my dad for a gag gift after directing "Nunsense," he got a reall kick out of it!'

In [None]:
preprocessing(txt)

# Encoder - Fastext 

In [17]:
from torchtext.vocab import FastText  # Glove, FastText, Word2Vec


In [18]:
fasttext = FastText("simple") 

In [19]:
fasttext.dim

300

In [20]:

def token_encoder(token, vec):

    """ encodes a single word"""
    if token == "<pad>":
        return 1
    else:
        try:
            return vec.stoi[token]  # if the token is in the vocabulary, string to index
        except:
            if type(token) != str:
                print("Error, we need a word which is in string format")
            else:
                return 0  # if the token is not in the vocabulary, return 0
                

In [None]:
token_encoder("Paramveer", fasttext)

In [None]:
fasttext.itos[2610], fasttext.stoi["hello"]

In [21]:
def encoder(tokens, voc):
    """ encodes a list of tokens"""
    return [token_encoder(token, voc) for token in tokens]


In [None]:
preprocessing(txt)

In [None]:
txt = "Python is kicking me and, so is Paramveer "
encoder(preprocessing(txt),fasttext)

In [None]:
fasttext.itos[66032]

In [None]:
fasttext.itos[0], fasttext.itos[1]

In [22]:
def padding(list_of_indexes, max_seq_len, padding_index = 1):
    output = list_of_indexes + (max_seq_len - len(list_of_indexes)) * [padding_index]
    return output[:max_seq_len]

In [None]:
list_of_indexes = encoder(preprocessing(txt),fasttext)
list_of_indexes

In [None]:
padding(list_of_indexes, 10)

In [None]:
ntxt = "Python is kicking me and, so is Paramveer Python is kicking me and, so is Paramveer  Python is kicking me and, so is Paramveer  Python is kicking me and, so is Paramveer  Python is kicking me and, so is Paramveer  Python is kicking me and, so is Paramveer  Python is kicking me and, so is Paramveer  Python is kicking me and, so is Paramveer  Python is kicking me and, so is Paramveer  Python is kicking me and, so is Paramveer Python is kicking me and, so is Paramveer  "
list_of_indexes = encoder(preprocessing(ntxt),fasttext)
len(list_of_indexes)

In [None]:
padding(list_of_indexes, 10)

# Dataloader

In [23]:
class TrainData(Dataset):
    def __init__(self, df, max_seq_len =32):
        self.max_seq_len = max_seq_len
        train_iter =iter(df.reviews.values)
        self.vec = FastText("simple")
        self.vec.vectors[1] = -torch.ones(self.vec.vectors[1].shape[0])
        self.vec.vectors[0] = torch.zeros(self.vec.vectors[0].shape[0])
        self.vectorize = lambda x: self.vec.vectors[x]
        self.labels = df.ratings.values
        self.sequences =[padding(encoder(preprocessing(sequence),fasttext),max_seq_len=32) for sequence in train_iter]

    def __len__(self):
        return len(self.sequences)
           
    def __getitem__(self, idx):
        assert len(self.sequences[idx]) == self.max_seq_len
        return self.sequences[idx], self.labels[idx]  #idx so that we dont flood the memory with tensors 

        
   

In [None]:
#[padding(encoder(preprocessing(sequence),fasttext),max_seq_len=32) for sequence in df.reviews.values[:10]]

In [None]:
fasttext.vectors[7856].shape

In [25]:
dataset = TrainData(df[:20])

In [26]:
dataset.vectorize(0).shape

torch.Size([300])

In [27]:
iter(df.reviews.values)

<iterator at 0x24f5c161040>

In [28]:
def collate(batch, vectorizer = dataset.vectorize):
    # batch of sentence
    # from these sentences we want the tokens, for each sentence
    # we want to get the tokens, and then we want to get the embeddings
    # we need to pass vectorizer
    inputs = torch.stack([torch.stack([vectorizer(token) for token in sentence[0]]) for sentence in batch])  
    targets = torch.LongTensor([item[1] for item in batch])

    return inputs, targets



In [37]:
batch_size = 16
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate)
train_iter = iter(train_loader)
sentence, target = next(train_iter)
sentence.shape

torch.Size([16, 32, 300])

# Model 

In [33]:
emb_dim = fasttext.dim 

class Classifier(nn.Module):
    def __init__(self, max_seq_len, emb_dim,hidden_dim1=16, hidden_dim2=16):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(max_seq_len * emb_dim, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, 1)
        self.out = nn.LogSoftmax(dim=1)

    def forward(self, inputs):
        x = F.relu(self.fc1(inputs.squeeze(1).float))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return self.out(x)
        


In [34]:
max_seq_len = 32
model = Classifier(max_seq_len, emb_dim)
model

Classifier(
  (fc1): Linear(in_features=9600, out_features=16, bias=True)
  (fc2): Linear(in_features=16, out_features=16, bias=True)
  (fc3): Linear(in_features=16, out_features=1, bias=True)
  (out): LogSoftmax(dim=1)
)

In [35]:
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [None]:
epochs = 10
print_every = 100

for e in range(epochs):
    running_loss = 0
    for i, (sentence, labels) in enumerate(train_loader): 
        sentence.resize_(sentence.size()[0], max_seq_len*emb_dim)
        optimizer.zero_grad()
        output = model.forward(sentence)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        if i % print_every == 0:
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Loss: {:.4f}".format(running_loss/print_every))
            running_loss = 0