# Sentiment Analysis

In [1]:
import pandas as pd
# Full location of the labelled reviews CSV (directory + file name).
# NOTE(review): this is a machine-specific absolute Windows path; the cell
# only runs where that exact directory exists.
reviews_csv_path = (
    'C:\\Users\\The Dark Knight\\Desktop\\Applications\\Code\\Text_Mining\\User_Reviews_Data\\'
    'User_Movie_Reviews.csv'
)

# Load the reviews into a DataFrame using pandas' default CSV settings.
input_data = pd.read_csv(reviews_csv_path)

In [2]:
# Dimensions of the loaded frame as (rows, columns).
input_data.shape

(2000, 2)

In [3]:
# Column labels of the loaded frame.
input_data.columns

Index(['class', 'text'], dtype='object')

In [6]:
# Preview the first 10 rows of the loaded frame.
input_data.head(10)

Unnamed: 0,class,text
0,Pos,stuart little is one of the best family ...
1,Neg,a movie like mortal kombat annihilation wor...
2,Neg,and just when you thought joblo was getting a...
3,Pos,every now and then a movie comes along from a...
4,Neg,for about twenty minutes into mission impossi...
5,Neg,for better or worse the appearance of basic...
6,Neg,i have a great idea for a movie one that ca...
7,Pos,if he doesn=92t watch out mel gibson is in ...
8,Pos,if there s one thing in common about all of h...
9,Neg,if you haven t plunked down your hard earned ...


In [7]:
# Class frequency: number of rows for each distinct value in the 'class' column,
# used here to check how balanced the labels are.
input_data['class'].value_counts()

Pos    1000
Neg    1000
Name: class, dtype: int64

In [16]:
# Creating Document Term Matrix(DTM) without preprocessing
from sklearn.feature_extraction.text import CountVectorizer

# Fit a default CountVectorizer on the raw review text, then expand the
# resulting count matrix into a dense frame with one column per vocabulary term.
vectorizer_1 = CountVectorizer()
raw_counts = vectorizer_1.fit_transform(input_data['text'])
dtm_raw = pd.DataFrame(raw_counts.toarray(), columns=vectorizer_1.get_feature_names_out())

# Attach the label column next to the term counts.
# NOTE(review): if the vocabulary contains the term 'class', this assignment
# overwrites that term's count column with the labels - verify.
dtm_raw['class'] = input_data['class']
dtm_raw.head()

# The vocabulary contains many non-meaningful terms (e.g. bare numbers) because the text was not preprocessed

Unnamed: 0,00,000,0009f,007,00s,03,04,05,05425,10,...,zukovsky,zulu,zundel,zurg,zus,zweibel,zwick,zwigoff,zycie,zzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# Writing a custom Tokenizer for preprocessing

import re
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import nltk

stemmer = SnowballStemmer('english')
stop_words = stopwords.words('english')

# Frozen snapshot of the stop words for constant-time membership tests.
# It is built once from the list above, so later edits to `stop_words`
# are not reflected here.
_STOP_WORD_SET = frozenset(stop_words)


def tokenize(text):
    """Turn raw text into a list of stemmed, stop-word-free tokens.

    Steps:
      1. Replace every run of non-word characters, every run of digits and
         every underscore with a single space.
      2. Split the cleaned text with ``nltk.word_tokenize``.
      3. Drop tokens found in the stop-word set, then stem the survivors
         with the Snowball English stemmer.

    The stop-word comparison is made on the lower-cased token, so capitalised
    stop words are also removed when this function is called on text that has
    not been lower-cased beforehand (the stop-word list is expected to be
    lowercase - confirm if a custom list is substituted).

    Parameters
    ----------
    text : str
        The document to tokenize.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    # Strip punctuation, numbers and underscores before tokenizing
    cleaned = re.sub(r'\W+|\d+|_', ' ', text)

    return [
        stemmer.stem(token)
        for token in nltk.word_tokenize(cleaned)
        if token.lower() not in _STOP_WORD_SET
    ]

# Build the refined document-term matrix with the custom tokenizer.
#   min_df=5           -> keep only terms that occur in at least 5 documents
#   token_pattern=None -> the custom tokenizer replaces the default pattern;
#                         leaving the default set makes scikit-learn warn
#                         that token_pattern will not be used.
vectorizer_2 = CountVectorizer(min_df=5, tokenizer=tokenize, token_pattern=None)
refined_counts = vectorizer_2.fit_transform(input_data['text'])
dtm_refined = pd.DataFrame(refined_counts.toarray(), columns=vectorizer_2.get_feature_names_out())

# Attach the label column next to the term counts.
# NOTE(review): if the stem 'class' is part of the vocabulary, this assignment
# overwrites that term's count column with the labels - verify.
dtm_refined['class'] = input_data['class']
dtm_refined.head()

# The output now is much refined after preprocessing

Unnamed: 0,aaron,abandon,abbi,abduct,abel,abil,abl,abli,aboard,abod,...,zip,zipper,zoe,zombi,zone,zoo,zoom,zorro,zucker,zwick
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# (rows, columns) of the refined matrix; the column count includes the
# appended 'class' label column.
dtm_refined.shape

(2000, 9310)

In [54]:
# Building training and testing sets
# Positional split: rows before `split_at` are used for training, the rest for testing.
# NOTE(review): the split is not shuffled, and the vectorizer vocabulary was
# fitted on all rows before splitting - confirm this is intended.
split_at = 1300
train_dataset, test_dataset = dtm_refined.iloc[:split_at], dtm_refined.iloc[split_at:]

test_dataset.shape

(700, 9310)

In [55]:
# Training the model. Our model here is Naive Bayes classifier for multinomial models

from sklearn.naive_bayes import MultinomialNB
# Feature matrix: every column of the training frame except the label
X_train = train_dataset.drop(columns=['class'])

# Create the multinomial Naive Bayes classifier and fit it on the
# training features against the 'class' labels
clf = MultinomialNB()
clf.fit(X_train, train_dataset['class'])

In [56]:
# Testing the model

# Held-out features: the test frame without its label column
X_test = test_dataset.drop(columns=['class'])

# Score the fitted classifier on the held-out rows against their true labels
clf.score(X_test, test_dataset['class'])

0.7971428571428572

In [68]:
# Using the model for prediction

# Strip the label column so only term counts are passed to the classifier
test_features = test_dataset.drop(columns='class')

# Predicted label for every test row
predicted_sentiment = clf.predict(test_features)
print(predicted_sentiment)

['Neg' 'Pos' 'Pos' 'Neg' 'Pos' 'Neg' 'Pos' 'Pos' 'Pos' 'Pos' 'Neg' 'Pos'
 'Pos' 'Neg' 'Neg' 'Neg' 'Neg' 'Neg' 'Neg' 'Pos' 'Pos' 'Pos' 'Neg' 'Pos'
 'Pos' 'Pos' 'Neg' 'Neg' 'Pos' 'Pos' 'Neg' 'Pos' 'Pos' 'Pos' 'Pos' 'Neg'
 'Neg' 'Neg' 'Neg' 'Neg' 'Pos' 'Neg' 'Pos' 'Pos' 'Neg' 'Neg' 'Neg' 'Neg'
 'Pos' 'Pos' 'Pos' 'Pos' 'Pos' 'Pos' 'Neg' 'Pos' 'Pos' 'Pos' 'Neg' 'Neg'
 'Pos' 'Neg' 'Pos' 'Neg' 'Pos' 'Neg' 'Neg' 'Neg' 'Pos' 'Neg' 'Neg' 'Pos'
 'Pos' 'Pos' 'Pos' 'Pos' 'Pos' 'Pos' 'Pos' 'Neg' 'Pos' 'Neg' 'Neg' 'Pos'
 'Neg' 'Neg' 'Pos' 'Pos' 'Pos' 'Neg' 'Neg' 'Neg' 'Pos' 'Pos' 'Neg' 'Neg'
 'Neg' 'Neg' 'Pos' 'Pos' 'Pos' 'Pos' 'Pos' 'Neg' 'Pos' 'Pos' 'Neg' 'Pos'
 'Pos' 'Neg' 'Neg' 'Neg' 'Neg' 'Pos' 'Pos' 'Pos' 'Neg' 'Neg' 'Neg' 'Neg'
 'Neg' 'Pos' 'Pos' 'Pos' 'Pos' 'Neg' 'Neg' 'Neg' 'Neg' 'Pos' 'Pos' 'Neg'
 'Neg' 'Neg' 'Pos' 'Neg' 'Pos' 'Neg' 'Neg' 'Neg' 'Pos' 'Pos' 'Neg' 'Pos'
 'Neg' 'Neg' 'Neg' 'Neg' 'Pos' 'Neg' 'Pos' 'Neg' 'Neg' 'Neg' 'Pos' 'Pos'
 'Neg' 'Neg' 'Neg' 'Neg' 'Pos' 'Pos' 'Neg' 'Neg' 'P

In [64]:
# True labels of the test rows, converted from a pandas Series to a NumPy array
actual_labels = test_dataset['class']
actual_sentiment = actual_labels.to_numpy()
print(actual_sentiment)

['Neg' 'Neg' 'Pos' 'Neg' 'Pos' 'Pos' 'Neg' 'Pos' 'Pos' 'Pos' 'Neg' 'Pos'
 'Pos' 'Neg' 'Neg' 'Neg' 'Neg' 'Neg' 'Neg' 'Pos' 'Neg' 'Pos' 'Neg' 'Pos'
 'Pos' 'Neg' 'Neg' 'Neg' 'Pos' 'Pos' 'Neg' 'Pos' 'Pos' 'Neg' 'Pos' 'Neg'
 'Neg' 'Pos' 'Neg' 'Neg' 'Neg' 'Neg' 'Pos' 'Pos' 'Neg' 'Neg' 'Neg' 'Neg'
 'Pos' 'Pos' 'Pos' 'Pos' 'Pos' 'Neg' 'Neg' 'Pos' 'Pos' 'Neg' 'Neg' 'Neg'
 'Pos' 'Neg' 'Pos' 'Neg' 'Pos' 'Neg' 'Neg' 'Pos' 'Pos' 'Neg' 'Neg' 'Neg'
 'Pos' 'Pos' 'Pos' 'Pos' 'Pos' 'Pos' 'Pos' 'Neg' 'Pos' 'Neg' 'Neg' 'Pos'
 'Neg' 'Neg' 'Pos' 'Neg' 'Pos' 'Neg' 'Pos' 'Pos' 'Neg' 'Pos' 'Pos' 'Pos'
 'Neg' 'Neg' 'Pos' 'Pos' 'Pos' 'Pos' 'Neg' 'Neg' 'Pos' 'Pos' 'Neg' 'Pos'
 'Pos' 'Neg' 'Neg' 'Neg' 'Neg' 'Neg' 'Pos' 'Pos' 'Neg' 'Neg' 'Neg' 'Pos'
 'Pos' 'Pos' 'Pos' 'Pos' 'Pos' 'Neg' 'Neg' 'Neg' 'Neg' 'Pos' 'Neg' 'Neg'
 'Neg' 'Neg' 'Neg' 'Neg' 'Pos' 'Pos' 'Neg' 'Neg' 'Neg' 'Neg' 'Pos' 'Neg'
 'Neg' 'Pos' 'Neg' 'Neg' 'Pos' 'Neg' 'Pos' 'Pos' 'Neg' 'Neg' 'Pos' 'Pos'
 'Pos' 'Neg' 'Neg' 'Pos' 'Neg' 'Pos' 'Neg' 'Neg' 'P

In [79]:
# Recompute the accuracy by hand: the fraction of test rows whose predicted
# label equals the true label.
length = len(predicted_sentiment)

# One element-wise comparison of the two label arrays; summing the resulting
# booleans counts the matches. This replaces a loop that rebuilt both arrays
# as lists on every iteration.
correct = int((predicted_sentiment == actual_sentiment).sum())

print(correct/length)

0.7971428571428572


In [None]:
# The manually computed accuracy matches the test score returned by clf.score