So we are performing an 'Sentiment Analysis' project using Bow and TF-IDF 

Importing Libraries

In [1]:
import pandas as pd 
import numpy as np  

In [2]:
##Loading the dataset  
df = pd.read_csv('IMDB Dataset(CSV)')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
##Finding basic info
df.shape

(50000, 2)

In [5]:
##Since we are using it for practise purpose , we can provide less number of rows  
df = df.iloc[:10000]  
df.shape

(10000, 2)

Basic EDA

In [6]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [7]:
df.duplicated().sum()

17

In [9]:
df.drop_duplicates(inplace = True)

In [None]:
df.duplicated().sum()   
##We have deleted all the duplicates

0

In [11]:
df.columns

Index(['review', 'sentiment'], dtype='object')

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9983 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     9983 non-null   object
 1   sentiment  9983 non-null   object
dtypes: object(2)
memory usage: 234.0+ KB


In [14]:
df['sentiment'].value_counts(normalize=True)

sentiment
positive    0.503155
negative    0.496845
Name: proportion, dtype: float64

In [15]:
##we can convert 'positive' -> 1
## and 'negative' -> 0  
df['sentiment'] = df['sentiment'].map({'positive' : 1,
                                       'negative' :0})

In [16]:
##We can also convert their datatype to 'int'  
df['sentiment'] = df['sentiment'].astype(int)

In [None]:
df['sentiment'].dtype  

dtype('int32')

In [18]:
df.head(3)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1


Text Preprocessing

In [20]:
##lower case all w
df['review'] =df['review'].str.lower()

In [21]:
##Data cleaning  
import nltk  
import re  
from nltk.corpus import stopwords  
from bs4 import BeautifulSoup

In [23]:
# Load stopwords only once
stop_words = set(stopwords.words('english'))

def clean_text(text):
    if not isinstance(text, str):
        return ""

    text = re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', text)  # remove URLs
    text = BeautifulSoup(text, 'html.parser').get_text()  # remove HTML
    text = re.sub('[^a-zA-Z0-9]', ' ', text)  # remove special characters
    text = " ".join([word for word in text.split() if word.lower() not in stop_words])  # remove stopwords
    text = " ".join(text.split())  # remove extra spaces
    return text

# Apply the function once
df['review'] = df['review'].apply(clean_text)

In [24]:
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically family little boy jake thinks zombie...,0
4,petter mattei love time money visually stunnin...,1


In [25]:
##Now we need to lemmatize the words, so that we can extract their base value 
from nltk.stem import WordNetLemmatizer  
lemma = WordNetLemmatizer()  

def lemmatize_words(text):
    return " ".join([lemma.lemmatize(word) for word in text.split()])

In [26]:
df['review'] = df['review'].apply(lambda x : lemmatize_words(x))

In [None]:
df.head()  
##Lemmatization completed

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching 1 oz episode h...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically family little boy jake think zombie ...,0
4,petter mattei love time money visually stunnin...,1


Performing Train-Test Split

In [29]:
from sklearn.model_selection import train_test_split 
X_train , X_test , y_train , y_test = train_test_split(df['review'],df['sentiment'],test_size=0.25 ,random_state=42)

In [30]:
X_train.shape , X_test.shape ,y_train.shape ,y_test.shape

((7487,), (2496,), (7487,), (2496,))

Model training with BOW and N-grams

In [31]:
from sklearn.feature_extraction.text import CountVectorizer  
bow  = CountVectorizer(binary=True ,ngram_range=(1,2))

In [32]:
X_train_bow = bow.fit_transform(X_train)
X_test_bow = bow.transform(X_test)

In [35]:
X_train_bow.toarray

<bound method _cs_matrix.toarray of <7487x648543 sparse matrix of type '<class 'numpy.int64'>'
	with 1578179 stored elements in Compressed Sparse Row format>>

In [36]:
X_test_bow.toarray

<bound method _cs_matrix.toarray of <2496x648543 sparse matrix of type '<class 'numpy.int64'>'
	with 345884 stored elements in Compressed Sparse Row format>>

Model Training

In [37]:
from sklearn.naive_bayes import MultinomialNB  
nb_bow = MultinomialNB()

In [38]:
nb_bow.fit(X_train_bow,y_train)

In [39]:

y_pred_bow = nb_bow.predict(X_test_bow)

In [40]:
##Performance metrics
from sklearn.metrics import accuracy_score ,classification_report,confusion_matrix  

print(accuracy_score(y_test , y_pred_bow))
print(confusion_matrix(y_test , y_pred_bow))
print(classification_report(y_test , y_pred_bow))


0.8689903846153846
[[1100  136]
 [ 191 1069]]
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1236
           1       0.89      0.85      0.87      1260

    accuracy                           0.87      2496
   macro avg       0.87      0.87      0.87      2496
weighted avg       0.87      0.87      0.87      2496



Model Training with TF-IDF and N-grams

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer  
tfidf = TfidfVectorizer(binary=True,ngram_range=(1,2))

In [43]:
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [44]:
X_train_tfidf

<7487x648543 sparse matrix of type '<class 'numpy.float64'>'
	with 1578179 stored elements in Compressed Sparse Row format>

Model Training

In [46]:
from sklearn.naive_bayes import MultinomialNB  
nb_tfidf = MultinomialNB()

In [47]:
nb_tfidf.fit(X_train_tfidf,y_train)

In [48]:
y_pred_tfidf = nb_tfidf.predict(X_test_tfidf)

In [49]:
##Performance metrics  
print(accuracy_score(y_test , y_pred_tfidf))
print(confusion_matrix(y_test , y_pred_tfidf))
print(classification_report(y_test , y_pred_tfidf))

0.8794070512820513
[[1103  133]
 [ 168 1092]]
              precision    recall  f1-score   support

           0       0.87      0.89      0.88      1236
           1       0.89      0.87      0.88      1260

    accuracy                           0.88      2496
   macro avg       0.88      0.88      0.88      2496
weighted avg       0.88      0.88      0.88      2496



Overall the model has performed well in both the cases