In [1]:
import pandas as pd
import numpy as np
### read the data 
fields = ['star_rating', 'review_body']

test = pd.read_csv("amazon_reviews_us_Kitchen_v1_00.tsv", sep = '\t',usecols=fields,error_bad_lines = False)

In [2]:
### keep the balanced dataset 
### sample 50000 per star_rating from the data
sample_size = 50000 ### here we use 10000 for testing 
df = test.groupby('star_rating').apply(lambda x: x.sample(sample_size))

In [3]:
df["label"] = df.star_rating.apply(lambda x: 1 if x > 3.0 else( 2 if x < 3.0 else 3))

In [5]:
df =df.rename(columns ={"review_body": "review"})

In [6]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,star_rating,review,label
star_rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,481816,1.0,The reservoir is tiny and doesn't grind well ...,2
1.0,3238056,1.0,"Great in theory, but it's disappointing when o...",2
1.0,1009774,1.0,Received the incorrect item TWICE. How does t...,2
1.0,4474590,1.0,I'm with M. Mohrman on this one. I just bough...,2
1.0,1060705,1.0,These are not worth the money I got 3! I used ...,2
...,...,...,...,...
5.0,996218,5.0,Love them!!! WOWSA are they GREAT!! We have us...,1
5.0,2148995,5.0,Very good product,1
5.0,3668613,5.0,A beautiful serving set for the little princes...,1
5.0,1330342,5.0,I've gone through 3 or 4 different types of sh...,1


In [8]:
import re

df["review"] = df["review"].astype(str)

### Data cleaning and preprocssing 
df["review"] = df["review"].str.lower()

## remove HTML 
def remove_tags(string):
    result = re.sub('<.*?>','',string)
    return result
df["review"] = df["review"].apply(lambda x :remove_tags(x))

## remove URL
df["review"] = df["review"].str.replace(r's*https?://S+(s+|$)', ' ').str.strip()

def remove_extraS(text):
    return re.sub(' +', ' ', text)

df["review"] = df["review"].apply(lambda x: remove_extraS(x))

def contractionfunction(phrase):
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)

    return phrase

df["review"] = df["review"].apply(lambda x: contractionfunction(x))

## remove non-alphabetical characters
df["review"] = df["review"].str.replace('[^a-zA-Z]', ' ')

In [9]:
### preprocessing
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

df['review'] = df['review'].apply(word_tokenize)
#df.head()

stop_words = set(stopwords.words('english'))
df['review'] = df['review'].apply(lambda x: [word for word in x if word not in stop_words])

In [11]:
from gensim.models import Word2Vec
model = Word2Vec.load("word2vec.model")

In [55]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,star_rating,review,label
star_rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,481816,1.0,"[reservoir, tiny, grind, well, returned, good,...",2
1.0,3238056,1.0,"[great, theory, disappointing, one, two, order...",2
1.0,1009774,1.0,"[received, incorrect, item, twice, happen]",2
1.0,4474590,1.0,"[mohrman, one, bought, toaster, yesterday, use...",2
1.0,1060705,1.0,"[worth, money, got, used, one, worked, like, h...",2
...,...,...,...,...
5.0,996218,5.0,"[love, wowsa, great, used, transfer, meat, shr...",1
5.0,2148995,5.0,"[good, product]",1
5.0,3668613,5.0,"[beautiful, serving, set, little, princess, fa...",1
5.0,1330342,5.0,"[gone, different, types, shakers, one, mixes, ...",1


In [14]:
### discard the class 3 to get a binary class dataset
binary_df = df.loc[df["label"] <3]

In [15]:
def own_model_avg_vec(x):
    temp = np.zeros(300)
    n = 0
    for i in x:
        if i in model.wv:
            temp+=model.wv[i]
        else:
            n+=1
    return temp/(len(x)-n)


In [16]:
binary_df["own_vec"] = binary_df["review"].apply(lambda x: own_model_avg_vec(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [57]:
binary_df["y"] = binary_df["label"].apply(lambda x: 0 if x==1 else 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [58]:
binary_df

Unnamed: 0_level_0,Unnamed: 1_level_0,star_rating,review,label,own_vec,y
star_rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,481816,1.0,"[reservoir, tiny, grind, well, returned, good,...",2,"[0.0499773443573051, 0.22326408699154854, -0.0...",1
1.0,3238056,1.0,"[great, theory, disappointing, one, two, order...",2,"[-0.05490862252190709, 0.1216887488650779, 0.0...",1
1.0,1009774,1.0,"[received, incorrect, item, twice, happen]",2,"[-0.16803593933582306, 0.21726865097880363, 0....",1
1.0,4474590,1.0,"[mohrman, one, bought, toaster, yesterday, use...",2,"[-0.07272139545895305, 0.1282805115413651, 0.0...",1
1.0,1060705,1.0,"[worth, money, got, used, one, worked, like, h...",2,"[-0.012075881848507683, 0.18668859323952347, 0...",1
...,...,...,...,...,...,...
5.0,996218,5.0,"[love, wowsa, great, used, transfer, meat, shr...",1,"[-0.01297765592303635, 0.10272096052596515, -0...",0
5.0,2148995,5.0,"[good, product]",1,"[0.14246436953544617, 0.18519844859838486, -0....",0
5.0,3668613,5.0,"[beautiful, serving, set, little, princess, fa...",1,"[0.05817469548700111, 0.121769632745002, -0.02...",0
5.0,1330342,5.0,"[gone, different, types, shakers, one, mixes, ...",1,"[0.012344609099355612, 0.23621410944245078, 0....",0


In [59]:
from sklearn.model_selection import train_test_split
X = binary_df['own_vec']
y = binary_df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2)

In [60]:
### convert a sequence to a array numpy
X_train = np.array(X_train.tolist())
X_test = np.array(X_test.tolist())
y_train = np.array(y_train.tolist())
y_test = np.array(y_test.tolist())
### convert any row with nan to num 
X_train = np.nan_to_num(X_train)
X_test = np.nan_to_num(X_test)
y_train = np.nan_to_num(y_train)
y_test = np.nan_to_num(y_test)

In [61]:
import torch.nn as nn
import torch.nn.functional as F

# define the NN architecture
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # number of hidden nodes in each layer 
        hidden_1 = 50
        hidden_2 = 10
        # linear layer (300 -> hidden_1)
        self.fc1 = nn.Linear(300, hidden_1)
        # linear layer (n_hidden -> hidden_2)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        # linear layer (n_hidden -> 10)
        self.fc3 = nn.Linear(hidden_2, 2)
        # dropout layer (p=0.2)
        # dropout prevents overfitting of data
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        # flatten image input
        x = x.view(-1, 300)
        # add hidden layer, with relu activation function
        x = F.relu(self.fc1(x))
        # add dropout layer
        x = self.dropout(x)
        # add hidden layer, with relu activation function
        x = F.relu(self.fc2(x))
        # add dropout layer
        x = self.dropout(x)
        # add output layer
        x = self.fc3(x)
        return x

# initialize the NN
model = Net()
print(model)

Net(
  (fc1): Linear(in_features=300, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [62]:
# import libraries
import torch
## Specify loss and optimization functions

# specify loss function
criterion = nn.CrossEntropyLoss()

# specify optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [50]:
a = np.array((0,1,2,3))


In [63]:
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import TensorDataset, DataLoader
#import torchvision
# number of subprocesses to use for data loading
num_workers = 0
# how many samples per batch to load
batch_size = 20
# percentage of training set to use as validation
#valid_size = 0.2

DatasetTrain = TensorDataset(torch.from_numpy(X_train),torch.from_numpy(y_train))
        
DatasetTest=TensorDataset(torch.from_numpy(X_test),torch.from_numpy(y_test))

trainloader=torch.utils.data.DataLoader(DatasetTrain,batch_size=batch_size,shuffle=True,drop_last=True, num_workers=0)

validationloader=torch.utils.data.DataLoader(DatasetTest, batch_size=batch_size, drop_last=True,num_workers=0)


In [65]:
# number of epochs to train the model
n_epochs = 10  # suggest training between 20-50 epochs

model.train() # prep model for training

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    
    ###################
    # train the model #
    ###################
    for data, target in trainloader:
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data.float())
        # calculate the loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
        
    # print training statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(trainloader.dataset)

    print('Epoch: {} \tTraining Loss: {:.6f}'.format(
        epoch+1, 
        train_loss
        ))

Epoch: 1 	Training Loss: 0.403504
Epoch: 2 	Training Loss: 0.386184
Epoch: 3 	Training Loss: 0.377329
Epoch: 4 	Training Loss: 0.370270
Epoch: 5 	Training Loss: 0.365754
Epoch: 6 	Training Loss: 0.361187
Epoch: 7 	Training Loss: 0.358048
Epoch: 8 	Training Loss: 0.354475
Epoch: 9 	Training Loss: 0.350900
Epoch: 10 	Training Loss: 0.348782


In [67]:
# initialize lists to monitor test loss and accuracy
test_loss = 0.0
class_correct = list(0. for i in range(2))
class_total = list(0. for i in range(2))

model.eval() # prep model for *evaluation*

for data, target in validationloader:
    # forward pass: compute predicted outputs by passing inputs to the model
    output = model(data.float())
    # calculate the loss
    loss = criterion(output, target)
    # update test loss 
    test_loss += loss.item()*data.size(0)
    # convert output probabilities to predicted class
    _, pred = torch.max(output, 1)
    # compare predictions to true label
    correct = np.squeeze(pred.eq(target.data.view_as(pred)))
    # calculate test accuracy for each object class
    for i in range(batch_size):
        label = target.data[i]
        class_correct[label] += correct[i].item()
        class_total[label] += 1

# calculate and print avg test loss
test_loss = test_loss/len(validationloader.dataset)
print('Test Loss: {:.6f}\n'.format(test_loss))

for i in range(10):
    if class_total[i] > 0:
        print('Test Accuracy of %5s: %2d%% (%2d/%2d)' % (
            str(i), 100 * class_correct[i] / class_total[i],
            np.sum(class_correct[i]), np.sum(class_total[i])))
    else:
        print('Test Accuracy of %5s: N/A (no training examples)' % (classes[i]))

print('\nTest Accuracy (Overall): %2d%% (%2d/%2d)' % (
    100. * np.sum(class_correct) / np.sum(class_total),
    np.sum(class_correct), np.sum(class_total)))

Test Loss: 0.335874

Test Accuracy of     0: 89% (17888/20007)
Test Accuracy of     1: 81% (16321/19993)


IndexError: list index out of range