In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import warnings
warnings.filterwarnings("ignore")
import nltk
nltk.download('stopwords')
import nltk
nltk.download('punkt')
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sachi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sachi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sachi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

## Loading the data

In [16]:
df= pd.read_csv("review_data.csv")

In [17]:
df["ReviewText"]=df["ReviewText"].apply(lambda x: re.sub('<.*?>','',x))

In [18]:
df=df[df.Score!=3]

In [19]:
df.loc[df['Score']<3,'Score'] = 0
df.loc[df['Score']>3,'Score'] = 1

In [20]:
df.Score.value_counts()

1    443777
0     82037
Name: Score, dtype: int64

In [21]:
X= df[['ReviewText']]
y = df['Score']

In [22]:
X

Unnamed: 0,ReviewText
0,I have bought several of the Vitality canned d...
1,Product arrived labeled as Jumbo Salted Peanut...
2,This is a confection that has been around a fe...
3,If you are looking for the secret ingredient i...
4,Great taffy at a great price. There was a wide...
...,...
568449,Great for sesame chicken..this is a good if no...
568450,I'm disappointed with the flavor. The chocolat...
568451,"These stars are small, so you can give 10-15 o..."
568452,These are the BEST treats for training and rew...


### Data Preprocessing

In [23]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [24]:
stop_words = stopwords.words('english')

In [32]:
def preprocess(raw_text):
    text = str(raw_text).lower()
    tokens = word_tokenize(text)
    
    tokens = [word for word in tokens if word not in stop_words]
    clean_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    word_corpus = ' '.join(tokens)
    
    return word_corpus

In [33]:
from tqdm import tqdm, tqdm_notebook
tqdm.pandas()

In [36]:
temp_df = X['ReviewText'].progress_apply(lambda x: preprocess(x))

100%|█████████████████████████████████████████████████████████████████████████| 525814/525814 [19:59<00:00, 438.43it/s]


In [38]:
X=pd.DataFrame(temp_df)

### Splitting the data

In [39]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=100) 

In [40]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((420651, 1), (105163, 1), (420651,), (105163,))

### Converting Text to Numerical vectors- BOW Representation

In [41]:
from sklearn.feature_extraction.text import CountVectorizer

vocab = CountVectorizer()
vocab.fit(X_train['ReviewText'])

X_train_bow = vocab.transform(X_train['ReviewText'])

In [42]:
print("Total unique words:", len(vocab.vocabulary_))

print("Type of train features:", type(X_train_bow))

print("Shape of input data:", X_train_bow.shape)

Total unique words: 116101
Type of train features: <class 'scipy.sparse.csr.csr_matrix'>
Shape of input data: (420651, 116101)


In [43]:
X_test_bow = vocab.transform(X_test['ReviewText'])

## Logistic Regression

In [44]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train_bow, y_train)

LogisticRegression()

In [46]:
y_test_pred = classifier.predict(X_test_bow)

In [47]:
y_test_pred = classifier.predict(X_test_bow)
from sklearn.metrics import accuracy_score, classification_report

print(accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))

0.938742713692078
              precision    recall  f1-score   support

           0       0.84      0.74      0.79     16409
           1       0.95      0.97      0.96     88754

    accuracy                           0.94    105163
   macro avg       0.90      0.86      0.88    105163
weighted avg       0.94      0.94      0.94    105163



### Naive Bayes

In [49]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()

In [50]:
nb.fit(X_train_bow, y_train)

MultinomialNB()

In [53]:
y_test_pred = nb.predict(X_test_bow)

In [56]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_test_pred)

0.9125072506489925

### Support Vector Machines: Linear SVC

In [57]:
from sklearn import svm
svc = svm.LinearSVC()

In [58]:
svc.fit(X_train_bow, y_train)

LinearSVC()

In [60]:
y_test_pred = svc.predict(X_test_bow)

In [61]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_test_pred)

0.9385525327348972