In [143]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
import re
import contractions
import nltk
from sklearn.metrics import classification_report, accuracy_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle
import torch
import torch.nn.functional as F
import torch.nn as nn
from collections import defaultdict
nltk.download('wordnet',quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt',quiet=True)
 

True

In [5]:
reviews = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz"
data = pd.read_csv(reviews, sep='\t', on_bad_lines='skip', low_memory=False)
data.dropna(inplace=True)

In [6]:
warnings.filterwarnings("ignore")

review_data = data[['star_rating', 'review_body']]
review_data.dropna(inplace=True)
review_data['star_rating'] = review_data['star_rating'].astype('int32')

for index, row in review_data.iterrows():
    if row['star_rating'] in {1,2}:
        review_data.loc[index, 'star_rating'] = 1
    elif row['star_rating'] in {3}:
        review_data.loc[index, 'star_rating'] = 2
    elif row['star_rating'] in {4,5}:
        review_data.loc[index, 'star_rating'] = 3
        
# bal_data = review_data.groupby('star_rating').apply(lambda group: group.sample(20000)).reset_index(drop = True)
bal_data = review_data

In [7]:
bal_data.to_csv('data_full', sep='\t', header=False)

### 1. Dataset Generation

In [60]:
bal_data = pd.read_csv("data_full", sep='\t', names = ['star_rating', 'review_body'])

### Data Cleaning

In [8]:
stop_words = set(stopwords.words('english'))
bal_data = review_data
def remove_stopword(text):
    text_tokens = word_tokenize(text)
    tokens_without_sw =  " ".join([word for word in text_tokens if word not in stop_words])
    return tokens_without_sw

def lemmetize(text):
    lemmatizer = WordNetLemmatizer()
    text_tokens = word_tokenize(text)
    lemmatized_string = " ".join([lemmatizer.lemmatize(words) for words in text_tokens])

    return lemmatized_string

def clean_data(data):
    #covert to lower
    data= data.lower()
    #remove html and url
    data = re.sub(r'http\S+', '', data)
    urls = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    data = re.sub(urls, '', data)
    #remove non-alphabetical chars
    non_alpha = re.compile('[^a-zA-Z]')
    data = non_alpha.sub(' ', data)
    #remove extra spaces
    data = re.sub(' +', ' ', data)
    #perform contractions
    data = contractions.fix(data)  
    
    return data

bal_data['review_body'] = bal_data.apply(lambda row : clean_data(row['review_body']), axis = 1)
bal_data['review_body'] = bal_data.apply(lambda row : remove_stopword(row['review_body']), axis = 1)
bal_data['review_body'] = bal_data.apply(lambda row : lemmetize(row['review_body']), axis = 1)

In [9]:
bal_data.to_csv('data_cleaned', sep='\t', header=False)

In [8]:
bal_data = pd.read_csv("data_cleaned", sep='\t', names = ['star_rating', 'review_body'])
bal_data.dropna(inplace=True)
review_samples = []

for rating in [1,2,3]:
    rating_df = bal_data[ bal_data['star_rating'] == rating ]

    reviews = rating_df['review_body'].tolist()

    tfIdfVect = TfidfVectorizer(use_idf=True)
    # getting tf-idf vectors
    vect_review = tfIdfVect.fit_transform(rating_df['review_body'])

    # take sum of tf-idf vector values -> indicates "total" importance of the review in the pool of other reviews 
    # having the same rating
    vals = list(np.squeeze(np.asarray(np.sum(vect_review, axis = 1).astype(np.float32))))
    
    vals = [vals[i]/len(reviews[i]) for i in range(len(vals))]
    
    # sort by tf-idf sum value and take top 20000 reviews for each class
    rating_df = pd.DataFrame(list(zip(reviews, vals, rating_df['star_rating'].tolist())), 
                                    columns=['review_body','tfidf_score', 'star_rating'])

    rating_df = rating_df.sort_values(by=['tfidf_score'], ascending=False)
    review_samples.append(rating_df.head(20000))

In [24]:
bal_data = pd.concat(review_samples)

bal_data.drop(columns=['tfidf_score'], inplace=True)

bal_data = shuffle(bal_data)

In [25]:
bal_data.to_csv('balanced_data', sep='\t', header=False)

### 2. Word Embedding

#### (a)  Google news data

In [38]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [53]:
## generating word embedding
words = ["cat", "dog", "man", "woman"]
vectors = [wv[word] for word in words]

# print("Words and their Word embedding")
# print("------------------------------")
# for word, vector in zip(words, vectors):
#     print(f"Word: {word}  Vector: {vector}")
    

To learn the semantic similarity, I am considering the below three examples:
* Finding words similar to "happy" using its vector
* Performing "happy - smile + cry = sad" using word vector
* Finding cosine similarity between words "fight" and "battle"

In [8]:
print("3 similar words to happy:")
res = wv.similar_by_vector(wv["happy"], topn=5)
for word, score in res:
    print("\t{}: {:.4f}".format(word, score))
print()

cosine = wv.similarity("love", "like")
print("Cosine similarity between love and like:", cosine)
print()


big = wv['big']
large = wv['large']
small = wv['small']
result = big-large+small
similarity = wv.similar_by_vector(result)
print("Most similar words to 'big-large+small':")
for word, score in similarity:
    print("\t{}: {:.4f}".format(word, score))

3 similar words to happy:
	happy: 1.0000
	glad: 0.7409
	pleased: 0.6632
	ecstatic: 0.6627
	overjoyed: 0.6599

Cosine similarity between love and like: 0.36713877

Most similar words to 'big-large+small':
	big: 0.7968
	small: 0.6329
	bigger: 0.5330
	huge: 0.4986
	little_bitty: 0.4698
	biggest: 0.4613
	tiny: 0.4609
	Small: 0.4602
	nice: 0.4599
	abig: 0.4512


#### (b) Training Word2Vec on Review dataset

In [15]:
import gensim
from gensim.models import Word2Vec
r_data =bal_data['review_body']
sentences = []
for s in r_data:
    sentences.append(list(s.split(" ")))
    
my_model = Word2Vec(sentences, vector_size=300, window=13, min_count=9)
my_model.train(sentences, total_examples=my_model.corpus_count, epochs=my_model.epochs)

(10820456, 15495095)

In [21]:
print("3 similar words to happy:")
res = my_model.wv.similar_by_vector(wv["happy"], topn=5)
for word, score in res:
    print("\t{}: {:.4f}".format(word, score))
print()

cosine = my_model.wv.similarity("love", "like")
print("Cosine similarity between love and like:", cosine)
print()

3 similar words to happy:
	moved: 0.2399
	having: 0.2251
	owners: 0.2117
	being: 0.1966
	dropping: 0.1966

Cosine similarity between love and like: 0.3219804



### 3. Simple models

In [51]:
def findAccuracy(y_test, y_pred):
    report = classification_report(y_test, y_pred, output_dict=True, digits=4)
    df = pd.DataFrame(report).transpose()
    for i in range(3):
        print(f"Class {i+1} = {df['precision'][i-1]} , {df['recall'][i-1]} , {df['f1-score'][i-1]}")
    print(f"average = {df['precision'].mean()} , {df['recall'].mean()} , {df['f1-score'].mean()}")
    print("Accuracy = ", accuracy_score(y_test, y_pred))

#### Perceptron

In [39]:
from gensim.models import KeyedVectors
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

reviews = bal_data['review_body']
review_vectors = []
for review in reviews:
    words = review.split()
    review_vector = np.zeros(300)
    for word in words:
        if word in wv:
            review_vector += wv[word]
    review_vector /= len(words)
    review_vectors.append(review_vector)
review_vectors = np.array(review_vectors)

ratings = bal_data['star_rating']
train_data = review_vectors[:int(0.8 * len(review_vectors))]
train_labels = ratings[:int(0.8 * len(review_vectors))]
test_data = review_vectors[int(0.8 * len(review_vectors)):]
test_labels =  ratings[int(0.8 * len(review_vectors)):]
train_data=np.nan_to_num(train_data, copy=True, nan=0.0, posinf=None, neginf=None)
test_data=np.nan_to_num(test_data,copy=True, nan=0.0, posinf=None, neginf=None)

In [96]:
perceptron = Perceptron(max_iter=9000)
perceptron.fit(train_data, train_labels)
perceptron_predictions = perceptron.predict(test_data)
findAccuracy(test_labels, perceptron_predictions)


Class 1 = 0.6291793242757951 , 0.6139166666666667 , 0.6170392014900615
Class 2 = 0.6963562753036437 , 0.599601593625498 , 0.6443671394166444
Class 3 = 0.5105409705648369 , 0.6412690482138396 , 0.5684863248809655
average = 0.6266499972244798 , 0.6139110441353318 , 0.6165250133803749
Accuracy =  0.6139166666666667


In [29]:
svm = SVC(C=0.1)
svm.fit(train_data, train_labels)
svm_predictions = svm.predict(test_data)
findAccuracy(test_labels, svm_predictions)

Class 1 = 0.7115481186378178 , 0.7074166666666667 , 0.6977263581801021
Class 2 = 0.7105393369619001 , 0.7269045811187042 , 0.7186288002001752
Class 3 = 0.732420429311621 , 0.4978616352201258 , 0.5927811891568069
average = 0.7109654877052217 , 0.7064157974889214 , 0.6989149343114422


### 4. Feedforward Neural Networks

In [36]:
class Feedforward_MLP(torch.nn.Module):
        def __init__(self, input_size):
            super(Feedforward_MLP, self).__init__()
            self.input_size = input_size
            
            self.fc1 = torch.nn.Linear(self.input_size, 50)
            self.dropout = torch.nn.Dropout(0.2)
            
#             self.relu = torch.nn.ReLU()
            self.fc2 = torch.nn.Linear(50, 10)
            self.fc3 = torch.nn.Linear(10, 3)
#             self.sigmoid = torch.nn.Sigmoid()
            self.softmax = torch.nn.Softmax(dim=1)

        def forward(self, x):
            x = F.relu(self.fc1(x))
            x = self.dropout(x)
            x = F.relu(self.fc2(x))
            x = self.dropout(x)
            x = self.fc3(x)
            return x
        
mlp_model = Feedforward_MLP(300)
print(mlp_model)


Feedforward_MLP(
  (fc1): Linear(in_features=300, out_features=50, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=3, bias=True)
  (softmax): Softmax(dim=1)
)


In [88]:
test_only = pd.read_csv("data_cleaned", sep='\t', names = ['star_rating', 'review_body'])
test_only.dropna(inplace=True)
test_60 = test_only.groupby('star_rating').apply(lambda group: group.sample(20000)).reset_index(drop = True)
test_60 = shuffle(test_60)
vocab = defaultdict(int)
for l in test_60['review_body']:
    for w in l.split():
        vocab[w] += 1 
        
for key in list(vocab.keys()):
    if vocab[key] < 3:
        del vocab[key]

test_reviews = test_60['review_body']
rvs = []
for review in test_reviews:
    words = review.split()
    rv = np.zeros(300)
    for word in words:
        if word in wv and word in vocab:
            rv += wv[word]
    rv /= len(words)
    rvs.append(rv)
rvs = np.array(rvs)

test_ratings = test_60['star_rating']
tr_d = rvs[:int(0.8 * len(rvs))]
tr_l = test_ratings[:int(0.8 * len(rvs))]
ts_d = rvs[int(0.8 * len(rvs)):]
ts_l =  test_ratings[int(0.8 * len(rvs)):]
tr_d=np.nan_to_num(tr_d, copy=True, nan=0.0, posinf=None, neginf=None)
ts_d=np.nan_to_num(ts_d,copy=True, nan=0.0, posinf=None, neginf=None)

In [99]:
n_epochs = 100
batch_size= 15
n_classes = 3
input_size = 300
criterion = torch.nn.CrossEntropyLoss()

train_label_0 = [x-1 for x in tr_l]
test_label_0 = [x-1 for x in ts_l]


x_train = torch.Tensor(tr_d)
y_train = torch.Tensor(train_label_0).type(torch.LongTensor)
x_cv = torch.Tensor(ts_d)
y_cv = torch.Tensor(test_label_0).type(torch.LongTensor)

# Create Torch datasets
train = torch.utils.data.TensorDataset(x_train, y_train)
test = torch.utils.data.TensorDataset(x_cv, y_cv)

# Create Data Loaders
train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size,  sampler = sampler,shuffle=False)
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

optimizer = torch.optim.Adam(mlp_model.parameters(),lr=0.005)
test_loss_min = np.Inf # set initial "min" to infinity

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    test_loss = 0.0
    
    ###################
    # train the model #
    ###################
    mlp_model.train() # prep model for training
    for data, target in train_loader:
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = mlp_model(data)
        # calculate the loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
        
    ######################    
    # validate the model #
    ######################
    mlp_model.eval() # prep model for evaluation
    for data, target in test_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = mlp_model(data)
        # calculate the loss
        loss = criterion(output, target)
        # update running validation loss 
        test_loss += loss.item()*data.size(0)
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)
    test_loss = test_loss/len(test_loader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        test_loss
        ))
    
    # save model if validation loss has decreased
    if test_loss <= test_loss_min:
        print('Test loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        test_loss_min,
        test_loss))
        torch.save(mlp_model.state_dict(), 'mlp_model.pt')
        test_loss_min = test_loss

Epoch: 1 	Training Loss: 0.825234 	Test Loss: 1.462071
Test loss decreased (inf --> 1.462071).  Saving model ...
Epoch: 2 	Training Loss: 0.830298 	Test Loss: 1.334126
Test loss decreased (1.462071 --> 1.334126).  Saving model ...
Epoch: 3 	Training Loss: 0.821584 	Test Loss: 1.374535
Epoch: 4 	Training Loss: 0.822289 	Test Loss: 1.213783
Test loss decreased (1.334126 --> 1.213783).  Saving model ...
Epoch: 5 	Training Loss: 0.820358 	Test Loss: 1.153229
Test loss decreased (1.213783 --> 1.153229).  Saving model ...
Epoch: 6 	Training Loss: 0.819128 	Test Loss: 1.040245
Test loss decreased (1.153229 --> 1.040245).  Saving model ...
Epoch: 7 	Training Loss: 0.816706 	Test Loss: 1.105544
Epoch: 8 	Training Loss: 0.813357 	Test Loss: 1.209472
Epoch: 9 	Training Loss: 0.816107 	Test Loss: 1.178686
Epoch: 10 	Training Loss: 0.814381 	Test Loss: 1.147119
Epoch: 11 	Training Loss: 0.816718 	Test Loss: 1.093866
Epoch: 12 	Training Loss: 0.818749 	Test Loss: 1.155726
Epoch: 13 	Training Loss: 0

In [100]:
y_pred_test=torch.max(mlp_model(x_cv).data,1).indices
findAccuracy(y_cv.tolist(), y_pred_test.tolist())

Class 1 = 0.6226853177888603 , 0.6250833333333333 , 0.6173578675732784
Class 2 = 0.5997079682937004 , 0.7282168186423505 , 0.657744223289865
Class 3 = 0.5912954777286638 , 0.43044554455445544 , 0.49820942558372733
average = 0.6230522167545917 , 0.6257507976706318 , 0.6189733992899935
Accuracy =  0.6250833333333333


#### (b) Input feature as concatenated vectors 

In [113]:
class Concatenate_MLP(torch.nn.Module):
        def __init__(self, input_size):
            super(Concatenate_MLP, self).__init__()
            self.input_size = input_size            
            self.fc1 = torch.nn.Linear(self.input_size, 50)
            self.dropout = torch.nn.Dropout(0.2)
            self.fc2 = torch.nn.Linear(50, 10)
            self.fc3 = torch.nn.Linear(10, 3)
            self.softmax = torch.nn.Softmax(dim=1)

        def forward(self, x):
            x = F.relu(self.fc1(x))
            x = self.dropout(x)
            x = F.relu(self.fc2(x))
            x = self.dropout(x)
            x = self.fc3(x)
            x= self.softmax(x)
            return x
        
concatenated_model = Concatenate_MLP(3000)
print(concatenated_model)

Concatenate_MLP(
  (fc1): Linear(in_features=3000, out_features=50, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=3, bias=True)
  (softmax): Softmax(dim=1)
)


In [102]:
def generate_input_feature_vector(reviews):
    review_words = [review.split() for review in reviews]    
    vector_size = wv.vector_size    
    num_reviews = len(reviews)
    input_features = np.zeros((num_reviews, 10*vector_size))
    
    for i, words in enumerate(review_words):
        vectors = []
        for j in range(min(len(words), 10)):
            word = words[j]
            if word in wv:
                vectors.append(wv[word])
            else:
                pass
        
        num_missing_vectors = max(0, 10 - len(vectors))
        padded_vectors = vectors + [np.zeros(vector_size)]*num_missing_vectors        
        feature_vector = np.concatenate(padded_vectors)        
        input_features[i,:] = feature_vector
    
    return input_features

input_feature = generate_input_feature_vector(test_60['review_body'])
input_labels = test_60['star_rating']

train_data_concat = input_feature[:int(0.8 * len(input_feature))]
train_label_concat = input_labels[:int(0.8 * len(input_labels))]
test_data_concat = input_feature[int(0.8 * len(input_feature)):]
test_label_concat =  input_labels[int(0.8 * len(input_labels)):]

In [114]:
n_epochs = 100
batch_size= 15
n_classes = 3
input_size = 3000
criterion = torch.nn.CrossEntropyLoss()

train_label_0 = [x-1 for x in train_label_concat]
test_label_0 = [x-1 for x in test_label_concat]


x_train = torch.Tensor(train_data_concat)
y_train = torch.Tensor(train_label_0).type(torch.LongTensor)
x_cv = torch.Tensor(test_data_concat)
y_cv = torch.Tensor(test_label_0).type(torch.LongTensor)

# Create Torch datasets
train = torch.utils.data.TensorDataset(x_train, y_train)
test = torch.utils.data.TensorDataset(x_cv, y_cv)

# Create Data Loaders
train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size,  sampler = sampler,shuffle=False)
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

optimizer = torch.optim.Adam(concatenated_model.parameters(),lr=0.005,betas=(0.9,0.999),eps=1e-08,weight_decay=0,amsgrad=False)
test_loss_min = np.Inf # set initial "min" to infinity

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    test_loss = 0.0
    
    ###################
    # train the model #
    ###################
    concatenated_model.train() # prep model for training
    for data, target in train_loader:
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = concatenated_model(data)
        # calculate the loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
        
    ######################    
    # validate the model #
    ######################
    concatenated_model.eval() # prep model for evaluation
    for data, target in test_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = concatenated_model(data)
        # calculate the loss
        loss = criterion(output, target)
        # update running validation loss 
        test_loss += loss.item()*data.size(0)
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)
    test_loss = test_loss/len(test_loader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        test_loss
        ))
    
    # save model if validation loss has decreased
    if test_loss <= test_loss_min:
        print('Test loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        test_loss_min,
        test_loss))
        torch.save(concatenated_model.state_dict(), 'concatenated_model.pt')
        test_loss_min = test_loss


Epoch: 1 	Training Loss: 0.981438 	Test Loss: 0.969641
Test loss decreased (inf --> 0.969641).  Saving model ...
Epoch: 2 	Training Loss: 0.943694 	Test Loss: 0.966551
Test loss decreased (0.969641 --> 0.966551).  Saving model ...
Epoch: 3 	Training Loss: 0.933607 	Test Loss: 0.969582
Epoch: 4 	Training Loss: 0.919472 	Test Loss: 0.978174
Epoch: 5 	Training Loss: 0.909904 	Test Loss: 0.975660
Epoch: 6 	Training Loss: 0.902423 	Test Loss: 0.975728
Epoch: 7 	Training Loss: 0.894346 	Test Loss: 0.980218
Epoch: 8 	Training Loss: 0.894622 	Test Loss: 0.981943
Epoch: 9 	Training Loss: 0.891956 	Test Loss: 0.983494
Epoch: 10 	Training Loss: 0.884797 	Test Loss: 0.980240
Epoch: 11 	Training Loss: 0.880723 	Test Loss: 0.984680
Epoch: 12 	Training Loss: 0.880338 	Test Loss: 0.984363
Epoch: 13 	Training Loss: 0.876303 	Test Loss: 0.984842
Epoch: 14 	Training Loss: 0.875385 	Test Loss: 0.982481
Epoch: 15 	Training Loss: 0.874550 	Test Loss: 0.986569
Epoch: 16 	Training Loss: 0.872302 	Test Loss: 0

In [115]:
y_pred_test=torch.max(concatenated_model(x_cv).data,1).indices
findAccuracy(y_cv.tolist(), y_pred_test.tolist())

Class 1 = 0.5539093719354284 , 0.5538333333333333 , 0.5520550539586373
Class 2 = 0.5922244957614733 , 0.5131712259371833 , 0.5498710815578776
Class 3 = 0.4859067099027189 , 0.48217821782178216 , 0.48403528388619704
average = 0.5541379884363744 , 0.5538003195059803 , 0.5524491839003034
Accuracy =  0.5538333333333333


### 5. Recurrent Neural Networks

In [142]:
from torch.nn.utils.rnn import pad_sequence
def generate_input_vec_20(reviews):
    sequences = []
    for review in reviews:
        text_tokens = word_tokenize(review)        
        vectors=[]
        for word in text_tokens:
            if word in wv and word in vocab:
                vectors.append(torch.tensor(wv[word], dtype=torch.float32))
        
        if(len(vectors)>=20):
            sequences.append(vectors[:20])
        else:
            missing_nums = 20-len(vectors)
            for i in range(missing_nums):
                 vectors.append(np.zeros((1,300)))
            sequences.append(vectors[:])
    return sequences
            
input_feature = generate_input_vec_20(test_60['review_body'])
input_labels = test_60['star_rating']

train_data_rnn = input_feature[:int(0.8 * len(input_feature))]
train_label_rnn = input_labels[:int(0.8 * len(input_labels))]
test_data_rnn = input_feature[int(0.8 * len(input_feature)):]
test_label_rnn =  input_labels[int(0.8 * len(input_labels)):]

In [None]:
class RNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)
    
def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i
    
rnn_model = RNN(300,20,3)


In [140]:
vec= []
vec.append(wv['good'])
vec.append(wv['bad'])
vec[0].reshape((1,300))
vec

[array([ 0.04052734,  0.0625    , -0.01745605,  0.07861328,  0.03271484,
        -0.01263428,  0.00964355,  0.12353516, -0.02148438,  0.15234375,
        -0.05834961, -0.10644531,  0.02124023,  0.13574219, -0.13183594,
         0.17675781,  0.27148438,  0.13769531, -0.17382812, -0.14160156,
        -0.03076172,  0.19628906, -0.03295898,  0.125     ,  0.25390625,
         0.12695312, -0.15234375,  0.03198242,  0.01135254, -0.01361084,
        -0.12890625,  0.01019287,  0.23925781, -0.08447266,  0.140625  ,
         0.13085938, -0.04516602,  0.06494141,  0.02539062,  0.05615234,
         0.24609375, -0.20507812,  0.23632812, -0.00860596, -0.02294922,
         0.05078125,  0.10644531, -0.03564453,  0.08740234, -0.05712891,
         0.08496094,  0.23535156, -0.10107422, -0.03564453, -0.04736328,
         0.04736328, -0.14550781, -0.10986328,  0.14746094, -0.23242188,
        -0.07275391,  0.19628906, -0.37890625, -0.07226562,  0.04833984,
         0.11914062,  0.06103516, -0.12109375, -0.2