In [None]:
# Packages to load
import pandas as pd # data processing, CSV file
import numpy as np # linear algebra



import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk import sent_tokenize, word_tokenize, pos_tag

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import roc_auc_score, accuracy_score

from sklearn.model_selection import train_test_split, GridSearchCV
import logging
from gensim.models import word2vec
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors



# Load the review dataset from CSV.
# Cap displayed cell text at 20 characters so long reviews stay compact in tables.
pd.set_option('display.max_colwidth', 20)

# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
csv_path = r'C:\Users\sam\Documents\P1\amazon_labeldata_drop_3.csv'
data = pd.read_csv(csv_path)
print(data)

        Unnamed: 0  Rating              Reviews  Unbalance Dataset Label
0                0       5  I feel so LUCKY ...                    1    
1                1       4  nice phone, nice...                    1    
2                2       5         Very pleased                    1    
3                3       4  It works good bu...                    1    
4                4       4  Great phone to r...                    1    
...            ...     ...                  ...                  ...    
382010      413832       4  good rugged phon...                    1    
382011      413834       1            used hard                    0    
382012      413835       5  another great de...                    1    
382013      413837       5  Passes every dro...                    1    
382014      413839       4  Only downside is...                    1    

[382015 rows x 4 columns]


In [None]:
# Remove special characters and numericals
def clean(text):
    """Collapse every run of non-alphabetic characters into a single space.

    Anything outside A-Z / a-z (digits, punctuation, whitespace, non-ASCII
    characters) is treated as a separator; letter case is left untouched.
    """
    return re.sub('[^A-Za-z]+', ' ', text)
# Coerce every review to str first: re.sub inside clean() rejects non-string values.
data['Reviews'] = data['Reviews'].map(str)
data['Reviews_clean'] = data['Reviews'].map(clean)
# Not the last expression in the cell, so this selection is evaluated but not displayed.
data[['Reviews','Reviews_clean']]
data.head(10)

Unnamed: 0.1,Unnamed: 0,Rating,Reviews,Unbalance Dataset Label,Reviews_clean
0,0,5,I feel so LUCKY ...,1,I feel so LUCKY ...
1,1,4,"nice phone, nice...",1,nice phone nice ...
2,2,5,Very pleased,1,Very pleased
3,3,4,It works good bu...,1,It works good bu...
4,4,4,Great phone to r...,1,Great phone to r...
5,5,1,I already had a ...,0,I already had a ...
6,6,2,The charging por...,0,The charging por...
7,7,2,Phone looks good...,0,Phone looks good...
8,8,5,I originally was...,1,I originally was...
9,11,5,This is a great ...,1,This is a great ...


In [None]:
## Remove Punctuation
import string
# Display the set of characters that remove_punctuation() filters out.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
def remove_punctuation(txt):
    """Return ``txt`` with every character found in ``string.punctuation`` removed."""
    kept_chars = filter(lambda ch: ch not in string.punctuation, txt)
    return "".join(kept_chars)

In [None]:
# Reviews_clean holds only letters and spaces after clean(), so this pass is
# expected to leave the text unchanged.
data['Reviews_punct'] = data['Reviews_clean'].apply(remove_punctuation)
data[['Reviews','Reviews_clean','Reviews_punct']]

Unnamed: 0,Reviews,Reviews_clean,Reviews_punct
0,I feel so LUCKY ...,I feel so LUCKY ...,I feel so LUCKY ...
1,"nice phone, nice...",nice phone nice ...,nice phone nice ...
2,Very pleased,Very pleased,Very pleased
3,It works good bu...,It works good bu...,It works good bu...
4,Great phone to r...,Great phone to r...,Great phone to r...
...,...,...,...
382010,good rugged phon...,good rugged phon...,good rugged phon...
382011,used hard,used hard,used hard
382012,another great de...,another great de...,another great de...
382013,Passes every dro...,Passes every dro...,Passes every dro...


In [None]:
#Tokenization
import re
def tokenize(txt):
    """Return the word tokens of ``txt`` in order of appearance.

    A token is a maximal run of regex word characters (letters, digits,
    underscore). Text that starts or ends with a separator, or that is
    empty, yields no empty-string tokens.

    Args:
        txt: The string to split.

    Returns:
        list[str]: The tokens; ``[]`` when ``txt`` contains no word characters.
    """
    # findall with a raw-string pattern replaces the previous split on '\W+',
    # which (a) produced '' tokens at the edges of the text and for empty
    # input, and (b) relied on an invalid string escape in a non-raw literal
    # that newer Python versions warn about.
    return re.findall(r'\w+', txt)
# Lower-case each review, then split it into word tokens.
data['clean_tokenizedlower'] = data['Reviews_punct'].str.lower().map(tokenize)
data[['Reviews','Reviews_punct', 'clean_tokenizedlower']]

Unnamed: 0,Reviews,Reviews_punct,clean_tokenizedlower
0,I feel so LUCKY ...,I feel so LUCKY ...,"[i, feel, so, lu..."
1,"nice phone, nice...",nice phone nice ...,"[nice, phone, ni..."
2,Very pleased,Very pleased,"[very, pleased]"
3,It works good bu...,It works good bu...,"[it, works, good..."
4,Great phone to r...,Great phone to r...,"[great, phone, t..."
...,...,...,...
382010,good rugged phon...,good rugged phon...,"[good, rugged, p..."
382011,used hard,used hard,"[used, hard]"
382012,another great de...,another great de...,"[another, great,..."
382013,Passes every dro...,Passes every dro...,"[passes, every, ..."


In [None]:
#Removing stop words
import nltk
# NOTE(review): this rebinds `stopwords`. The name was imported from nltk.corpus
# at the top of the notebook; from here on it refers to the return value of
# .words('english') instead, so `stopwords.words(...)` no longer works below.
stopwords = nltk.corpus.stopwords.words('english')

def remove_stopwords(txt_tokenized):
    """Return the tokens of ``txt_tokenized`` that are not stop words.

    Tokens are compared by exact (case-sensitive) equality against the
    module-level ``stopwords`` collection; order and duplicates of the
    remaining tokens are preserved.

    Args:
        txt_tokenized: Iterable of string tokens.

    Returns:
        list: The tokens not present in ``stopwords``.
    """
    # Snapshot the stop words into a frozenset so each membership test is a
    # hash lookup instead of a scan over the whole collection. The snapshot is
    # taken per call so later changes to `stopwords` are still honoured.
    stopword_lookup = frozenset(stopwords)
    return [word for word in txt_tokenized if word not in stopword_lookup]
# Filter the stop words out of every tokenised review.
data['Reviews_stopwords'] = data['clean_tokenizedlower'].apply(remove_stopwords)
data[['clean_tokenizedlower', 'Reviews_stopwords']]

Unnamed: 0,clean_tokenizedlower,Reviews_stopwords
0,"[i, feel, so, lu...","[feel, lucky, fo..."
1,"[nice, phone, ni...","[nice, phone, ni..."
2,"[very, pleased]",[pleased]
3,"[it, works, good...","[works, good, go..."
4,"[great, phone, t...","[great, phone, r..."
...,...,...
382010,"[good, rugged, p...","[good, rugged, p..."
382011,"[used, hard]","[used, hard]"
382012,"[another, great,...","[another, great,..."
382013,"[passes, every, ...","[passes, every, ..."


In [None]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Lemmatize each token and join the results into one space-separated string."""
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


# Turn each stop-word-filtered token list back into a single text column.
data['Reviews_lemmatizer'] = data['Reviews_stopwords'].apply(_lemmatize_tokens)
data[['Reviews','Reviews_lemmatizer']]

Unnamed: 0,Reviews,Reviews_lemmatizer
0,I feel so LUCKY ...,feel lucky found...
1,"nice phone, nice...",nice phone nice ...
2,Very pleased,pleased
3,It works good bu...,work good go slo...
4,Great phone to r...,great phone repl...
...,...,...
382010,good rugged phon...,good rugged phon...
382011,used hard,used hard
382012,another great de...,another great de...
382013,Passes every dro...,pass every drop ...


In [None]:
# Show the frame together with the intermediate preprocessing columns added above.
data

Unnamed: 0.1,Unnamed: 0,Rating,Reviews,Unbalance Dataset Label,Reviews_clean,Reviews_punct,clean_tokenizedlower,Reviews_stopwords,Reviews_lemmatizer
0,0,5,I feel so LUCKY ...,1,I feel so LUCKY ...,I feel so LUCKY ...,"[i, feel, so, lu...","[feel, lucky, fo...",feel lucky found...
1,1,4,"nice phone, nice...",1,nice phone nice ...,nice phone nice ...,"[nice, phone, ni...","[nice, phone, ni...",nice phone nice ...
2,2,5,Very pleased,1,Very pleased,Very pleased,"[very, pleased]",[pleased],pleased
3,3,4,It works good bu...,1,It works good bu...,It works good bu...,"[it, works, good...","[works, good, go...",work good go slo...
4,4,4,Great phone to r...,1,Great phone to r...,Great phone to r...,"[great, phone, t...","[great, phone, r...",great phone repl...
...,...,...,...,...,...,...,...,...,...
382010,413832,4,good rugged phon...,1,good rugged phon...,good rugged phon...,"[good, rugged, p...","[good, rugged, p...",good rugged phon...
382011,413834,1,used hard,0,used hard,used hard,"[used, hard]","[used, hard]",used hard
382012,413835,5,another great de...,1,another great de...,another great de...,"[another, great,...","[another, great,...",another great de...
382013,413837,5,Passes every dro...,1,Passes every dro...,Passes every dro...,"[passes, every, ...","[passes, every, ...",pass every drop ...


In [None]:
# Show one complete, untruncated record (row position 3) to inspect every
# preprocessing stage side by side.
# The width override is scoped with option_context so it cannot leak into later
# cells; a global set_option here would replace the 20-character cap configured
# when the data was loaded and change how every subsequent table is rendered.
with pd.option_context("display.max_colwidth", None):
    display(data.iloc[3])

Unnamed: 0                 3                                                                                             
Rating                     4                                                                                             
Reviews                    It works good but it goes slow sometimes but its a very good phone I love it                  
Unbalance Dataset Label    1                                                                                             
Reviews_clean              It works good but it goes slow sometimes but its a very good phone I love it                  
Reviews_punct              It works good but it goes slow sometimes but its a very good phone I love it                  
clean_tokenizedlower       [it, works, good, but, it, goes, slow, sometimes, but, its, a, very, good, phone, i, love, it]
Reviews_stopwords          [works, good, goes, slow, sometimes, good, phone, love]                                       
Reviews_lemmatizer      

In [None]:
# Keep the source columns plus the final lemmatized text; the intermediate
# preprocessing columns are no longer needed.
intermediate_columns = ['Reviews_punct','clean_tokenizedlower','Reviews_stopwords', 'Reviews_clean']
df = data.drop(columns=intermediate_columns)
df.head(10)

Unnamed: 0.1,Unnamed: 0,Rating,Reviews,Unbalance Dataset Label,Reviews_lemmatizer
0,0,5,I feel so LUCKY ...,1,feel lucky found...
1,1,4,"nice phone, nice...",1,nice phone nice ...
2,2,5,Very pleased,1,pleased
3,3,4,It works good bu...,1,work good go slo...
4,4,4,Great phone to r...,1,great phone repl...
5,5,1,I already had a ...,0,already phone pr...
6,6,2,The charging por...,0,charging port lo...
7,7,2,Phone looks good...,0,phone look good ...
8,8,5,I originally was...,1,originally using...
9,11,5,This is a great ...,1,great product ca...


In [None]:
# Number of rows in the reduced frame.
len(df)

382015

In [None]:
#df.to_csv('amazon_drop_3_label_prepro_lemmat.csv')