<a href="https://colab.research.google.com/github/Raghvender1205/SentimentAnalysis_MajorProject/blob/master/Major_Project_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Major Project SmartKnower Internship

## Problem Statement
Sentiment analysis using an ML-based approach.

In [None]:
# Import Libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # vectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC # Supported Vector Machine
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score # metrics

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer, WordNetLemmatizer

from textblob import TextBlob, Word
from bs4 import BeautifulSoup
from wordcloud import WordCloud, STOPWORDS
import spacy
import re
import string
import unicodedata
import os
import warnings
warnings.filterwarnings('ignore')

# Sanity check: list the project folder so the dataset file can be seen before loading it.
# NOTE(review): absolute Google Drive path — presumably Drive must already be mounted
# at /content/drive; the mount step is not visible in this notebook.
print(os.listdir('/content/drive/MyDrive/SmartKnower/MajorProject'))

['Major Project Sentiment Analysis.ipynb', 'IMDB Dataset.csv']


## Data

I am using the IMDB 50K Movie Reviews dataset from Kaggle.

Link: https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [None]:
# Load the IMDB reviews CSV into a DataFrame (columns: review, sentiment).
# NOTE(review): hard-coded absolute Drive path — adjust when running outside this Colab setup.
data = pd.read_csv('/content/drive/MyDrive/SmartKnower/MajorProject/IMDB Dataset.csv')

In [None]:
# Number of (rows, columns) in the raw dataset
print(data.shape)

# First five rows, rendered as the cell output
data.head()

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### EDA on the Dataset

In [None]:
# Summary statistics; both columns are object dtype, so this reports
# count / unique / top / freq rather than numeric statistics.
data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [None]:
# Column dtypes, non-null counts and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [None]:
# Column labels of the DataFrame
data.columns

Index(['review', 'sentiment'], dtype='object')

### Sentiment Count
Count how many reviews carry each sentiment label (`positive` / `negative`) to check whether the dataset is balanced.

In [None]:
# Number of reviews per sentiment label
data['sentiment'].value_counts() # Balanced dataset

positive    25000
negative    25000
Name: sentiment, dtype: int64

### Split Dataset

Split the Dataset into train and test set

In [None]:
# Train: the first 40000 rows, in file order (no shuffling)
train_reviews = data.review[:40000]
train_sentiments = data.sentiment[:40000]

# Test: the remaining rows
test_reviews = data.review[40000:]
test_sentiments = data.sentiment[40000:]

# Show reviews and labels side by side for each split so a length mismatch is visible.
print(train_reviews.shape, train_sentiments.shape)
print(test_reviews.shape, test_sentiments.shape)

(40000,) (40000,)
(10000,) (10000,)


### Text Normalization

Text normalization is the process of transforming text into a single canonical form that it might not have had before.


Make sure to download the NLTK stopwords corpus first:
```python
>>> import nltk
>>> nltk.download('stopwords')
```

In [None]:
# Download the NLTK stop-word corpus from the notebook kernel itself.
# This replaces the %%bash cell that launched a nested `python` interpreter and
# relied on it reading the remaining cell lines from stdin.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Word tokenizer shared by the text-cleaning helpers
tokenizer = ToktokTokenizer()

# English stop words from NLTK (needs the 'stopwords' corpus to be downloaded)
stopword_list = stopwords.words('english')

### Removing html strips and noise text

Remove HTML markup (e.g. `<br />` tags) and any text enclosed in square brackets.

In [None]:
def strip_html(text):
  """Return ``text`` with HTML markup removed, keeping only its text content."""
  return BeautifulSoup(text, "html.parser").get_text()

# Removing the square brackets
def remove_brackets(text):
  """Remove every ``[...]`` span (the brackets and what is between them) from ``text``.

  Args:
    text: input string.

  Returns:
    ``text`` with each bracketed span replaced by the empty string.
  """
  # Raw string: in a plain literal '\[' is an invalid escape sequence, which
  # newer Python versions warn about.
  return re.sub(r'\[[^]]*\]', '', text)


# Removing the noisy text
def denoise_text(text):
  """Strip HTML markup from ``text``, then drop square-bracketed spans."""
  return remove_brackets(strip_html(text))

# Apply function on the dataset
# NOTE: overwrites data['review'] in place, so the raw reviews are no longer
# available after this cell and re-running it processes already-cleaned text.
data['review'] = data['review'].apply(denoise_text)

In [None]:
data.head(10) # See the difference from above

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


### Removing Some Special Characters

Remove special characters — everything except letters, digits, and whitespace — using a regular expression.

In [None]:
def remove_special_char(text, remove_digits=True):
  """Delete every character that is not an ASCII letter, a digit, or whitespace.

  Args:
    text: input string.
    remove_digits: accepted for interface compatibility but not used; digits
      are always kept, as they were before this fix.

  Returns:
    The cleaned string.
  """
  # 'A-Z', not 'A-z': the latter also covers the ASCII characters between 'Z'
  # and 'a' ([ \ ] ^ _ `), which would therefore survive the clean-up.
  pattern = r'[^a-zA-Z0-9\s]'
  return re.sub(pattern, '', text)

# Apply to every review (overwrites data['review'] in place)
data['review'] = data['review'].apply(remove_special_char)

In [None]:
data.head(10) # Difference...!!!

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production The filming tech...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically theres a family where a little boy J...,negative
4,Petter Matteis Love in the Time of Money is a ...,positive
5,Probably my alltime favorite movie a story of ...,positive
6,I sure would like to see a resurrection of a u...,positive
7,This show was an amazing fresh innovative ide...,negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


### Text Stemming

Stemming is the process of reducing inflected or derived words to their word stem (base or root form).

In [None]:
# Stemming
def SimpleStemmer(text):
  """Porter-stem each whitespace-separated word of ``text``; rejoin with single spaces."""
  stemmer = PorterStemmer()
  stemmed_words = map(stemmer.stem, text.split())
  return ' '.join(stemmed_words)

# Apply to every review (overwrites data['review'] in place)
data['review'] = data['review'].apply(SimpleStemmer)

### Removing stopwords

Stop words are words that are filtered out before or after processing natural-language data.

They are very common words that carry little meaning for the task; NLTK ships a list of them in its ```stopwords``` corpus.

In [None]:
# Set stopwords to English
# NOTE(review): `stop` is only printed here; remove_stopwords below filters
# against `stopword_list`, not this set.
stop = set(stopwords.words('english'))
print(stop)

def remove_stopwords(text, is_lower_case=False):
  """Remove stop words from ``text``.

  The text is tokenized with the module-level ``tokenizer`` and every token
  found in the module-level ``stopword_list`` is dropped.

  Args:
    text: input string.
    is_lower_case: if True, tokens are compared as-is; otherwise each token is
      lower-cased for the comparison only (kept tokens retain their casing).

  Returns:
    The remaining tokens joined by single spaces.
  """
  # Set membership is O(1); the stop-word list is otherwise scanned per token.
  stop_set = set(stopword_list)
  tokens = [token.strip() for token in tokenizer.tokenize(text)]
  if is_lower_case:
    filtered_tokens = [token for token in tokens if token not in stop_set]
  else:
    filtered_tokens = [token for token in tokens if token.lower() not in stop_set]
  # Join and return for both branches: these lines used to sit inside the
  # `else`, so is_lower_case=True returned None.
  return ' '.join(filtered_tokens)
  
# Apply Function to every review (overwrites data['review'] in place)
data['review'] = data['review'].apply(remove_stopwords)

{'a', 'shan', 'now', 'her', "wouldn't", 'had', 'his', 'me', 'do', 'but', "that'll", 'were', 'down', "you'd", 'theirs', 'before', 'been', 's', 'mustn', 'just', 'after', "she's", 'was', 'above', 'below', "don't", 'who', 'needn', 'are', 'your', 'such', 'itself', 'my', 'can', "didn't", 'them', 'then', 'hers', 'didn', "shan't", 'wasn', 'these', 'she', 'into', 'so', 'am', 'having', 'while', 'why', 'shouldn', "it's", 'themselves', 'by', 'between', 'both', 'doesn', 'too', 'did', "hasn't", 'isn', "you're", 'through', 'those', 'where', 'm', 'is', 'o', 'aren', "needn't", "haven't", 'when', "shouldn't", 'have', 'very', "weren't", 'most', 'not', 'i', 'we', 'ain', 'there', 'no', "should've", 'should', 'being', 'does', 'few', "aren't", 'for', 'won', 'hasn', 'herself', 'from', "hadn't", "mustn't", 'this', 'during', 'own', "you'll", 'y', 'of', 'than', 'that', "couldn't", 'd', 'has', 'yourselves', 'himself', 've', 'to', 'hadn', 'ma', 'mightn', 'all', 'he', 'again', 'any', 'only', 'because', 'out', 'they

### Normalized Train and Test Reviews

In [None]:
# Normalized Train Reviews: same first-40000-rows boundary as the earlier split,
# taken again because data['review'] has been cleaned since then.
norm_train_reviews = data.review[:40000] 
# Show the first cleaned review
norm_train_reviews[0]

'one review ha mention watch 1 Oz episod youll hook right thi exactli happen meth first thing struck Oz wa brutal unflinch scene violenc set right word GO trust thi show faint heart timid thi show pull punch regard drug sex violenc hardcor classic use wordit call OZ nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda Em citi home manyaryan muslim gangsta latino christian italian irish moreso scuffl death stare dodgi deal shadi agreement never far awayi would say main appeal show due fact goe show wouldnt dare forget pretti pictur paint mainstream audienc forget charm forget romanceoz doesnt mess around first episod ever saw struck nasti wa surreal couldnt say wa readi watch develop tast Oz got accustom high level graphic violenc violenc injustic crook guard wholl sold nickel inmat wholl kill order get away well manner middl class inmat turn prison bitch due lack street skill prison exp

In [None]:
# Normalized Test Reviews: rows from 40000 onward
norm_test_reviews = data.review[40000:]
# 45005 is an index label, not a position: the slice keeps the original row labels.
norm_test_reviews[45005]

'read review watch thi piec cinemat garbag took least 2 page find somebodi els didnt think thi appallingli unfunni montag wasnt acm humour 70 inde ani era thi isnt least funni set sketch comedi ive ever seen itll till come along half skit alreadi done infinit better act monti python woodi allen wa say nice piec anim last 90 second highlight thi film would still get close sum mindless drivelridden thi wast 75 minut semin comedi onli world semin realli doe mean semen scatolog humour onli world scat actual fece precursor joke onli mean thi handbook comedi tit bum odd beaver niceif pubesc boy least one hand free havent found playboy exist give break becaus wa earli 70 way sketch comedi go back least ten year prior onli way could even forgiv thi film even made wa gunpoint retro hardli sketch clown subtli pervert children may cut edg circl could actual funni come realli quit sad kept go throughout entir 75 minut sheer belief may save genuin funni skit end gave film 1 becaus wa lower scoreand

### Bag of Words Model / Vectorization

It is a way of extracting features from text for use in modeling.

A bag-of-words is a representation of text that describes the occurrence of words within a document. It involves two things:
1. A vocabulary of known words
2. A measure of the presence of known words

Here it is used to convert text documents to numerical vectors, i.e. a ```bag of words```.

In [None]:
# Count Vectorizer
# max_df must be the float 1.0 ("no upper limit on document frequency"). The
# integer 1 is an absolute count and keeps only terms that occur in a single
# document, which discards every term shared between reviews.
# min_df=1 keeps every term seen at least once; it has the same effect as 0,
# which newer scikit-learn releases may reject as an integer value.
cv = CountVectorizer(min_df=1, max_df=1.0, binary=False, ngram_range=(1, 3))

# Learn the vocabulary on the training reviews only, then reuse it for the test
# reviews. Calling fit_transform on the test set would build a different
# vocabulary, giving a different number of columns than the model is trained on.
cv_train_reviews = cv.fit_transform(norm_train_reviews)
cv_test_reviews = cv.transform(norm_test_reviews)

print('BOW_cv_train: ', cv_train_reviews.shape)
print('BOW_cv_test: ', cv_test_reviews.shape)

BOW_cv_train:  (40000, 6209089)
BOW_cv_test:  (10000, 1828357)


In [None]:
# Get Feature Names. get_feature_names() was removed from newer scikit-learn
# releases in favour of get_feature_names_out(); use whichever is available.
if hasattr(cv, 'get_feature_names_out'):
  vocab = list(cv.get_feature_names_out())
else:
  vocab = cv.get_feature_names()

# Show only a sample: the full vocabulary is far too long to display usefully.
vocab[:20]

['00 agent',
 '00 agent difficult',
 '00 come',
 '00 come back',
 '00 includ',
 '00 includ unusu',
 '00 product',
 '00 product compani',
 '00 schneider',
 '00 schneider murderervillain',
 '00 still',
 '00 still hold',
 '00 wife',
 '00 wife usual',
 '000',
 '000 produc',
 '000 produc thi',
 '00015',
 '00015 second',
 '00015 second prod',
 '001',
 '001 laughomet',
 '001 laughomet 1000',
 '002',
 '002 hope',
 '002 hope thi',
 '007 adventur',
 '007 adventur much',
 '007 appear',
 '007 appear script',
 '007 atmosphereon',
 '007 atmosphereon hand',
 '007 bruce',
 '007 bruce lee',
 '007 film',
 '007 film made',
 '007 follow',
 '007 follow ha',
 '007 franchis',
 '007 franchis star',
 '007 frwl',
 '007 frwl come',
 '007 gadget',
 '007 gadget equal',
 '007 game',
 '007 game producedfantast',
 '007 goldeney',
 '007 goldeney one',
 '007 look',
 '007 look like',
 '007 movi baron',
 '007 movi eas',
 '007 movi sean',
 '007 movi wooden',
 '007 movieswhat',
 '007 movieswhat put',
 '007 music',
 '007 mu

#### Term Frequency-Inverse Document Frequency Model (TF-IDF)

It is used to convert text documents to a matrix of TF-IDF features.

The ```tf-idf``` value increases in proportion to the number of times a word appears in the document but is often offset by the frequency of the word in the corpus

In [None]:
# TFIdf Vectorizer
# max_df must be the float 1.0 ("no upper limit on document frequency"). The
# integer 1 is an absolute count and keeps only terms that occur in a single
# document. min_df=1 keeps every term seen at least once (same effect as 0).
tfv = TfidfVectorizer(min_df=1, max_df=1.0, use_idf=True, ngram_range=(1, 3))

# Fit the vocabulary and IDF weights on the training reviews only, then apply
# them to the test reviews. Refitting on the test set would produce a matrix
# with a different set of columns than the one the model is trained on.
tfv_train_reviews = tfv.fit_transform(norm_train_reviews)
tfv_test_reviews = tfv.transform(norm_test_reviews)

print('Tfidf_train: ', tfv_train_reviews.shape)
print('Tfidf_test: ', tfv_test_reviews.shape)

Tfidf_train:  (40000, 6209089)
Tfidf_test:  (10000, 1828357)


### Labeling the Sentiment Text

In [None]:
# Encode the two string sentiment labels as 0/1 integers.
lb = LabelBinarizer()

# Transformed Sentiment Data: a single-column 2-D array, one row per review
sentiment_data = lb.fit_transform(data['sentiment'])
print(sentiment_data.shape)

(50000, 1)


### Split the Sentiment Data

In [None]:
# Splitting the data at the same 40000-row boundary used for the reviews.
# This rebinds train_sentiments / test_sentiments, which previously held the
# string labels, to the binarized arrays.
train_sentiments = sentiment_data[:40000]
test_sentiments = sentiment_data[40000:]

print(train_sentiments)
print(test_sentiments) ## Done!!!

[[1]
 [1]
 [1]
 ...
 [1]
 [0]
 [0]]
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


### Modeling the Dataset

Build a Logistic Regression Model for both ```bag of words``` and ```Tfidf``` features

In [None]:
# Hyper-parameters shared by both logistic-regression models
lr_params = dict(penalty='l2', max_iter=500, C=1, random_state=42)

# The labels are a single-column 2-D array; scikit-learn classifiers expect a
# 1-D target, so flatten it once.
y_train = np.ravel(train_sentiments)

# fit() returns the estimator it was called on. Fitting one shared instance
# twice would leave lr_bow and lr_tfidf pointing at the same object, trained
# on the TF-IDF features only, so each feature set gets its own estimator.

# Fitting the model for Bag of Words
lr_bow = LogisticRegression(**lr_params).fit(cv_train_reviews, y_train)
print(lr_bow)

# Fitting the model for Tfidf features
lr_tfidf = LogisticRegression(**lr_params).fit(tfv_train_reviews, y_train)
print(lr_tfidf)

# Keep the name `lr` bound to the TF-IDF-trained model, as it was before.
lr = lr_tfidf

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


#### <b>Logistic Regression Model Performance</b>

In [None]:
# Predicting for Bag of Words
# predict() needs the test matrix to have the same number of feature columns
# as the matrix the model was fitted on.
lr_bow_predict = lr_bow.predict(cv_test_reviews)
print(lr_bow_predict)

# Predicting for TFidf features
lr_tfidf_predict = lr_tfidf.predict(tfv_test_reviews)
print(lr_tfidf_predict)

ValueError: ignored