# Introduction

In [1]:
import numpy as np                                  #for large and multi-dimensional arrays
import pandas as pd                                 #for data manipulation and analysis
import os
import nltk                                         #Natural language processing tool-kit

In [2]:
import warnings
warnings.filterwarnings("ignore")                     #Ignoring unnecessory warnings
from gensim.models import Word2Vec  

In [5]:
data_path = "C:/Users/Sagnik_laptop/Documents/ML/ML1010/ML1010-A1/data/Reviews.csv"
data = pd.read_csv(data_path)
data_sel = data.head(10000)                            #Considering only top 10000 rows

In [6]:
data_sel.columns                                     #dataset column names

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [7]:
data_score_removed = data_sel[data_sel['Score']!=3]        #removing neutral reviews having score = 3

# Converting Score values into class label either Positive or Negative.

In [8]:
def partition(x):
    if x < 3:
        return 'positive'
    return 'negative'

score_upd = data_score_removed['Score']
t = score_upd.map(partition)
data_score_removed['Score']=t

# 1.Basic Cleaning-removing duplicates

In [31]:
final_data = data_score_removed.drop_duplicates(subset={"UserId","ProfileName","Time","Text"})
final = final_data[final_data['HelpfulnessNumerator'] <= final_data['HelpfulnessDenominator']]
summary_text = final['Text']
# text data
corpus = np.array(summary_text)
final_y = final['Score']
# print current data set
corpus

array(['I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.',
       'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".',
       'This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis\' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.',
    

## Text pre-processing

In [27]:
from nltk.corpus import stopwords
stop_words = nltk.corpus.stopwords.words('english')

In [33]:
# from nltk.stem import PorterStemmer                 # Stemmer
# import re
# temp =[]
# snow = nltk.stem.SnowballStemmer('english')
# for sentence in final_X:
#     sentence = sentence.lower()                 # Converting to lowercase
#     cleanr = re.compile('<.*?>')
#     sentence = re.sub(cleanr, ' ', sentence)        #Removing HTML tags
#     sentence = re.sub(r'[?|!|\'|"|#]',r'',sentence)
#     sentence = re.sub(r'[.|,|)|(|\|/]',r' ',sentence)        #Removing Punctuations
    
#     words = [snow.stem(word) for word in sentence.split() if word not in stopwords.words('english')]   # Stemming and removing stopwords
#     temp.append(words)
    
# final_X = temp 
wpt = nltk.WordPunctTokenizer()
def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I)
    # covert to lower case
    doc = doc.lower()
    doc = doc.strip()
    cleanr = re.compile('<.*?>')
    # remove html tags
    doc = re.sub(cleanr, ' ', doc)
    #remove punctuations
    doc = re.sub(r'[?|!|\'|"|#]',r'', doc)
    doc = re.sub(r'[.|,|)|(|\|/]',r' ', doc) 
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc
normalize_corpus = np.vectorize(normalize_document)


In [34]:
norm_corpus = normalize_corpus(corpus)
norm_corpus

array(['bought several vitality canned dog food products found good quality product looks like stew processed meat smells better labrador finicky appreciates product better',
       'product arrived labeled jumbo salted peanuts peanuts actually small sized unsalted sure error vendor intended represent product jumbo',
       'confection around centuries light pillowy citrus gelatin nuts - case filberts cut tiny squares liberally coated powdered sugar tiny mouthful heaven chewy flavorful highly recommend yummy treat familiar story c lewis lion witch wardrobe - treat seduces edmund selling brother sisters witch',
       ...,
       'wanted solely breastfeed unable keep supplement formula chose similac reputable company ton hospital used powder ready made likes got little constipated beginning 5 months problems read reviews sucrose well sucrose sugar formula sugar companies label differently since ths one organic contain pure sugar far hexane goes research formulas except one babys use hex

In [25]:
# save the data
import dill
dill.dump_session('notebook_env.db')

# load the data
# dill.load_session('notebook_env.db')

# Bag of words

In [52]:
from sklearn.feature_extraction.text import CountVectorizer          #For Bag of words
cv = CountVectorizer()
cv_matrix = cv.fit_transform(norm_corpus)
cv_matrix = cv_matrix.toarray()
cv_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [58]:
vocab = cv.get_feature_names()
pd.DataFrame(cv_matrix, columns=vocab)

Unnamed: 0,00,000,0003,000kwh,002,008,0100,0174,02,03,...,zoo,zoom,zotz,zs,zucchini,zuke,zukes,zupas,zuppa,îts
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
# Bag of n-grams model
bv = CountVectorizer(ngram_range=(2,2))
bv_matrix = bv.fit_transform(norm_corpus)
bv_matrix = bv_matrix.toarray()
vocab = bv.get_feature_names()
pd.DataFrame(bv_matrix, columns=vocab)

Unnamed: 0,00 10lb,00 12,00 15,00 22,00 24,00 30,00 47,00 49,00 50,00 51,...,zukes soft,zukes thing,zukes think,zukes top,zukes treats,zukes well,zukes win,zupas pathetic,zuppa engelesia,îts real
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
#Term Frequency - Inverse Document Frequency(TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(norm_corpus)
tv_matrix = tv_matrix.toarray()
vocab = tv.get_feature_names()
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

Unnamed: 0,00,000,0003,000kwh,002,008,0100,0174,02,03,...,zoo,zoom,zotz,zs,zucchini,zuke,zukes,zupas,zuppa,îts
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
import gensim
w2v_num_features = 500
w2v_model = gensim.models.Word2Vec(norm_corpus, size=w2v_num_features, window=150,
                                   min_count=10, sample=1e-3)

In [66]:
def averaged_word2vec_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    
    def average_word_vectors(words, model, vocabulary, num_features):
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0.
        
        for word in words:
            if word in vocabulary: 
                nwords = nwords + 1.
                feature_vector = np.add(feature_vector, model[word])
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)

        return feature_vector

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

In [68]:
features = averaged_word2vec_vectorizer(corpus=corpus, model=w2v_model,
                                                     num_features=500)