# **SENTIMENT REVIEW**

Please import the Reviews.csv dataset into google colab and run the code cell by cell in order

# **# 1.Importing libraries and reading file into a dataframe**

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import nltk

import string
from nltk.corpus import stopwords



# read in the data
df = pd.read_csv('Reviews.csv')



df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# **2. Processing the data**

In [None]:
#cleaning the dataset
df.dropna(inplace=True)
#making all text small case and clearing punchuations
# get the list of stop words and punctuation
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

# combine stop words and punctuation into a single set
waste_words = stop_words.union(punctuation)

def tokenize_and_remove_punctuation(text):
    tokens = nltk.word_tokenize(text.lower())  # Tokenize and convert to lowercase
    tokens = [word for word in tokens if word not in waste_words]  # Remove punctuation and stop words
    return tokens

# apply the function to the 'review' column
df['review'] = df['review'].apply(tokenize_and_remove_punctuation)
df.head()

Unnamed: 0,review,sentiment
0,"[one, reviewers, mentioned, watching, 1, oz, e...",positive
1,"[wonderful, little, production, br, br, filmin...",positive
2,"[thought, wonderful, way, spend, time, hot, su...",positive
3,"[basically, 's, family, little, boy, jake, thi...",negative
4,"[petter, mattei, 's, ``, love, time, money, ''...",positive


In [None]:
# encoding postive as 1 and negative as zero
labelencoder = LabelEncoder()
df['sentiment'] = labelencoder.fit_transform(df['sentiment'])
df.head()


Unnamed: 0,review,sentiment
0,"[one, reviewers, mentioned, watching, 1, oz, e...",1
1,"[wonderful, little, production, br, br, filmin...",1
2,"[thought, wonderful, way, spend, time, hot, su...",1
3,"[basically, 's, family, little, boy, jake, thi...",0
4,"[petter, mattei, 's, ``, love, time, money, ''...",1


In [None]:
lemmatizer = nltk.WordNetLemmatizer()

# function to lemmatize a list of tokens
def lemmatize_tokens(tokens):
    return ' '.join([lemmatizer.lemmatize(token) for token in tokens])  # Lemmatize each token in the list

# Apply the function to the 'review' column (which already contains lists of tokens)
df['review'] = df['review'].apply(lemmatize_tokens)

df.head()

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching 1 oz episode '...,1
1,wonderful little production br br filming tech...,1
2,thought wonderful way spend time hot summer we...,1
3,basically 's family little boy jake think 's z...,0
4,petter mattei 's `` love time money '' visuall...,1


# **3. Splitting the data into training and testing set**

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], random_state=0)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# fit the CountVectorizer to the training data
vect = CountVectorizer().fit(X_train)



In [None]:
#finding length of volacbulary
len(vect.get_feature_names_out())

84346

In [None]:
#seeing the vocabulary
vect.get_feature_names_out()[100::1000]

array(['10th', '757', 'adhere', 'alisan', 'anny', 'artisan', 'axed',
       'basketball', 'besh', 'blossom', 'brainscan', 'bungalow',
       'captian', 'chaillot', 'chu', 'cognizant', 'congregates',
       'countermeasure', 'cured', 'decay', 'descibe', 'discharge',
       'dooku', 'dutiful', 'elucidate', 'eroticised', 'exposé',
       'feinstone', 'flica', 'freelance', 'garrone', 'gleefully',
       'greenthumb', 'haiti', 'hebetude', 'hoje', 'hurtles', 'incentive',
       'intentioned', 'jakoby', 'jungley', 'kiley', 'kyon', 'left',
       'llewelyn', 'm60', 'manèged', 'mcguffan', 'michio', 'mohamed',
       'muling', 'necessitates', 'nosedived', 'ompuri', 'overreact',
       'patet', 'pfennig', 'plugged', 'predjudice', 'psm', 'ragtag',
       'redeem', 'rereleased', 'rivio', 'rutledge', 'scalping', 'seize',
       'shifafa', 'sitcom', 'sobbingly', 'spittle', 'stillbirth',
       'sudetanland', 'sylvester', 'telekenisis', 'thyself', 'tradition',
       'tunny', 'underway', 'unwavering',

In [None]:
#converting the data into a usable matrix
#we can see that the number of coloums in the matrix is the number of vocabulary words found earlier
X_train_vectorized = vect.transform(X_train)

X_train_vectorized
# sparse matrix is the output

<37500x84346 sparse matrix of type '<class 'numpy.int64'>'
	with 3650519 stored elements in Compressed Sparse Row format>

# **4. Model Building**

# ***naive_bayes.MultinomialNB() model***

In [None]:
# model building and training
from sklearn import naive_bayes

model = naive_bayes.MultinomialNB()

model.fit(X_train_vectorized,y_train)

In [None]:
y_predict = model.predict(vect.transform(X_test))

y_predict



array([1, 0, 1, ..., 1, 0, 1])

In [None]:
# model evaluation
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score


print('AUC: ', roc_auc_score(y_test, y_predict))
print(f'F1 Score: ',f1_score(y_test, y_predict))

#not very suitable

AUC:  0.8547446483392989
F1 Score:  0.8509694380545514


# **Using TfidfVectorizer**
We are trying to reduced the data input by reducing not so frequent words

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# min_df represents minimum number of doucments in which words must be in order to be a part of the vocabulary
vect2 = TfidfVectorizer(min_df=5).fit(X_train)
len(vect2.get_feature_names_out())

29485

As we can see the features are less

In [None]:
from sklearn.linear_model import LogisticRegression
model2 = naive_bayes.MultinomialNB()
model2.fit(X_train_vectorized, y_train)

predictions = model2.predict_proba(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, y_predict))
print('F1 Score: ',f1_score(y_test, y_predict))

AUC:  0.8547446483392989
F1 Score:  0.8509694380545514


Same accuracy as before when we did not use tfidfVectorizer

# ***Logistic Regression Model***

In [None]:
model3 = LogisticRegression()
model3.fit(X_train_vectorized, y_train)

predictions = model3.predict_proba(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, y_predict))
print('F1 Score: ',f1_score(y_test, y_predict))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


AUC:  0.8547446483392989
F1 Score:  0.8509694380545514
