<a href="https://colab.research.google.com/github/VellummyilumVinoth/Sentiment_Analysis/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset description

IMDB dataset having 50K movie reviews for natural language processing or Text analytics.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training and 25,000 for testing. So, predict the number of positive and negative reviews using either classification or deep learning algorithms.

In [2]:
import numpy as np
import pandas as pd


In [11]:
data = pd.read_csv('/content/IMDB Dataset.csv')

In [12]:
# first 5 rows
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [13]:
# last 5 rows
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [14]:
# find columns in the dataset
data.columns

Index(['review', 'sentiment'], dtype='object')

In [15]:
# find the shape of the data(rows, columns)
data.shape

(50000, 2)

In [16]:
# description of the data
data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [17]:
# check any null values or outliers in the dataset
data.isnull().any()

review       False
sentiment    False
dtype: bool

In [18]:
# count the null values or outliers in the dataset
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [19]:
# calculate positive and negative feedback in the dataset
data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

# Text normalization

## Tokenization

In [20]:
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize


In [21]:
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from bs4 import BeautifulSoup



In [22]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [23]:
# Tokenization of text
tokenizers = ToktokTokenizer()

# Setting English stopwords
stopwords = nltk.corpus.stopwords.words('english')

In [24]:
def noiseremoval_text(text):
  soup = BeautifulSoup(text,"html.parser")
  text = soup.get_text()
  text = re.sub('\[[^]]*\]', '', text)
  return text

In [25]:
# Apply function on revirew column
data['review'] = data['review'].apply(noiseremoval_text)

In [26]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Stemming

In [27]:
# Stemming the text
def stemmer(text):
  ps = nltk.porter.PorterStemmer()
  text = ' '.join([ps.stem(word) for word in text.split()])
  return text

In [28]:
# Apply function on review column
data['review'] = data['review'].apply(stemmer)

In [37]:
data.head()

Unnamed: 0,review,sentiment
0,one review ha mention watch 1 oz episod ' hook...,positive
1,wonder littl production. film techniqu veri un...,positive
2,thought thi wa wonder way spend time hot summe...,positive
3,basic ' famili littl boy ( jake ) think ' zomb...,negative
4,"petter mattei ' "" love time money "" visual stu...",positive


## Removing stop words

In [30]:
# removing the stop words
def removing_stopwords(text, is_lower_case = False):
  # Tokenization of text
  tokenizers = ToktokTokenizer()
  # Setting English stopwords
  tokens = tokenizers.tokenize(text)
  tokens = [i.strip() for i in tokens]
  if is_lower_case:
    filtokens = [i for i in tokens if token not in stopwords]
  else:
    filtokens = [i for i in tokens if i.lower() not in stopwords]
  filtered_texts = ' '.join(filtokens)
  return filtered_texts

In [31]:
# Apply function on review column
data['review'] = data['review'].apply(removing_stopwords)

In [32]:
data.head()

Unnamed: 0,review,sentiment
0,one review ha mention watch 1 oz episod ' hook...,positive
1,wonder littl production. film techniqu veri un...,positive
2,thought thi wa wonder way spend time hot summe...,positive
3,basic ' famili littl boy ( jake ) think ' zomb...,negative
4,"petter mattei ' "" love time money "" visual stu...",positive


# Train test split

In [33]:
# split the dataset
# train dataset
train_reviews_data = data.review[:30000]

In [34]:
# test dataset
test_reviews_data = data.review[30000:]

# Bag of words

In [35]:
# Count vectorizer for bag of words
cv = CountVectorizer(min_df = 0, max_df = 1, binary = False, ngram_range = (1,3))
# transformed train reviews
cv_train = cv.fit_transform(train_reviews_data)
# transformed test reviews
cv_test = cv.transform(test_reviews_data)

print('BOW_cv_train: ', cv_train.shape)
print('BOW_cv_test: ',cv_test.shape)
# vocab = cv.get_feature_names()- toget feature names

BOW_cv_train:  (30000, 4954557)
BOW_cv_test:  (20000, 4954557)


# TF_IDF

In [36]:
# Tfidf vectorizer
tf = TfidfVectorizer(min_df = 0, max_df = 1, use_idf = True, ngram_range = (1,3))
# transformed train reviews
tf_train = tf.fit_transform(train_reviews_data)
# transformed test reviews
tf_test = tf.transform(test_reviews_data)
print('Tfidf_train:',tf_train.shape)
print('Tfidf_test:',tf_test.shape)

Tfidf_train: (30000, 4954557)
Tfidf_test: (20000, 4954557)


# Label encoding

In [38]:
# labeling the sentiment data
label = LabelBinarizer()
# transformed sentiment data
sentiment_data = label.fit_transform(data['sentiment'])
print(sentiment_data.shape)

(50000, 1)


In [41]:
sentiment_data

array([[1],
       [1],
       [1],
       ...,
       [0],
       [0],
       [0]])

In [42]:
train_data = data.sentiment[:30000]

In [40]:
test_data = data.sentiment[30000:]

In [44]:
# training the model
logistic = LogisticRegression(penalty = 'l2', max_iter = 500, C = 1, random_state = 42)
# Fitting the model for Bag of Words
lr_bow = logistic.fit(cv_train, train_data)
print(lr_bow)

LogisticRegression(C=1, max_iter=500, random_state=42)


In [45]:
# predicting the model for bag of words
lr_bow_predict = logistic.predict(cv_test)
print(lr_bow_predict)

['negative' 'negative' 'negative' ... 'negative' 'negative' 'negative']


In [46]:
# Accuracy score for bag of words
lr_bow_score = accuracy_score(test_data, lr_bow_predict)
print("lr_bow_score :", lr_bow_score)

lr_bow_score : 0.59215


In [48]:
# fitting the model for tfidf features
lr_tfidf = logistic.fit(tf_train, train_data)
print(lr_tfidf)

LogisticRegression(C=1, max_iter=500, random_state=42)


In [49]:
# predicting the model for tfidf features
lr_tfidf_predict = logistic.predict(tf_test)
print(lr_tfidf_predict)

['negative' 'negative' 'negative' ... 'negative' 'positive' 'positive']


In [50]:
# Accuracy score for tfidf features
lr_tfidf_score = accuracy_score(test_data, lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

lr_tfidf_score : 0.7426
