In [225]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import openpyxl
import nltk
from bs4 import BeautifulSoup
import sklearn


import warnings
warnings.filterwarnings('ignore')

In [226]:
os.listdir("../Datasets")

['1.py', 'amazon_reviews.csv', 'IMDB_Dataset.xlsx']

In [227]:
imdb_df = pd.read_excel("../Datasets/IMDB_Dataset.xlsx")
imdb_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## 1. Exploratory Data Analysis 

In [228]:
imdb_df.describe()

Unnamed: 0,review,sentiment
count,1114,1114
unique,1114,2
top,One of the other reviewers has mentioned that ...,positive
freq,1,558


In [229]:
imdb_df['sentiment'].value_counts()

sentiment
positive    558
negative    556
Name: count, dtype: int64

In [230]:
# Text Normalization 
# 1. Removing Stop words

stop_words_list = nltk.corpus.stopwords.words('english')
stop_words_list[:3]


['i', 'me', 'my']

In [231]:
# 2. Removing html strips  and punctuations
import string

exclude_list = string.punctuation

def strip_html(text):
    bsp = BeautifulSoup(text,"html.parser")
    return bsp.get_text()

def remove_punc(text):
    for char in exclude_list:
        txt = text.replace(char,"")
    return txt


def remove_noise_data(text):
    text = strip_html(text)
    text = remove_punc(text)
    return text


In [232]:
imdb_df['review_clean'] = imdb_df['review'].apply(remove_noise_data)

In [233]:
imdb_df.head(10)

Unnamed: 0,review,sentiment,review_clean
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production. The filming tec...
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,Basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"Petter Mattei's ""Love in the Time of Money"" is..."
5,"Probably my all-time favorite movie, a story o...",positive,"Probably my all-time favorite movie, a story o..."
6,I sure would like to see a resurrection of a u...,positive,I sure would like to see a resurrection of a u...
7,"This show was an amazing, fresh & innovative i...",negative,"This show was an amazing, fresh & innovative i..."
8,Encouraged by the positive comments about this...,negative,Encouraged by the positive comments about this...
9,If you like original gut wrenching laughter yo...,positive,If you like original gut wrenching laughter yo...


# Stemming 
    * Its process of reducing word to its word stem
    * eg: we have review as "eating,eat,eaten ",all these words refers to same so we no need to have all these words instead of that just have one word.


In [234]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def stem_review(text):
    # Initialize the Lemmatizer
    lt = WordNetLemmatizer()

    # Tokenize and lemmatize each word
    tokens = word_tokenize(text)
    lemmatized_tokens = [lt.lemmatize(word,pos='v') for word in tokens]

    text = ' '.join(lemmatized_tokens)
    return text



In [235]:
imdb_df['review_clean_v1'] = imdb_df['review_clean'].apply(stem_review)

In [236]:
imdb_df.head(10)

Unnamed: 0,review,sentiment,review_clean,review_clean_v1
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...,One of the other reviewers have mention that a...
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production. The filming tec...,A wonderful little production . The film techn...
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,I think this be a wonderful way to spend time ...
3,Basically there's a family where a little boy ...,negative,Basically there's a family where a little boy ...,Basically there 's a family where a little boy...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"Petter Mattei's ""Love in the Time of Money"" is...",Petter Mattei 's `` Love in the Time of Money ...
5,"Probably my all-time favorite movie, a story o...",positive,"Probably my all-time favorite movie, a story o...","Probably my all-time favorite movie , a story ..."
6,I sure would like to see a resurrection of a u...,positive,I sure would like to see a resurrection of a u...,I sure would like to see a resurrection of a u...
7,"This show was an amazing, fresh & innovative i...",negative,"This show was an amazing, fresh & innovative i...","This show be an amaze , fresh & innovative ide..."
8,Encouraged by the positive comments about this...,negative,Encouraged by the positive comments about this...,Encouraged by the positive comment about this ...
9,If you like original gut wrenching laughter yo...,positive,If you like original gut wrenching laughter yo...,If you like original gut wrench laughter you w...


## Removing Stop words

In [237]:
from transformers import BertTokenizer

# Load the pre-trained BERT tokenizer once
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def remove_stop_words(text):
    # Tokenize into sub words
    tokens = tokenizer.tokenize(text)

    # Remove stop words and punctuation and numbers in one go
    tokens =[token.strip() for token in tokens if token not in stop_words_list and token not in string.punctuation and not token.isdigit()]

    # Reassemble the tokens into a single string
    text = ' '.join(tokens)
    
    return text

In [238]:
imdb_df['review_clean_v2'] = imdb_df['review_clean_v1'].apply(remove_stop_words)

In [239]:
imdb_df.head()

Unnamed: 0,review,sentiment,review_clean,review_clean_v1,review_clean_v2
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...,One of the other reviewers have mention that a...,one reviewers mention watch oz episode hook ri...
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production. The filming tec...,A wonderful little production . The film techn...,wonderful little production film technique una...
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,I think this be a wonderful way to spend time ...,think wonderful way spend time hot summer week...
3,Basically there's a family where a little boy ...,negative,Basically there's a family where a little boy ...,Basically there 's a family where a little boy...,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"Petter Mattei's ""Love in the Time of Money"" is...",Petter Mattei 's `` Love in the Time of Money ...,pet ##ter matt ##ei love time money visually s...


In [240]:

final_imdb = imdb_df[['review_clean_v2','sentiment']]
final_imdb.columns = ['review','sentiment']

final_imdb['sentiment'] = final_imdb['sentiment'].map({'positive':1,'negative':0,'neutral':2})
print(final_imdb['sentiment'].value_counts())
final_imdb.head(4)


sentiment
1    558
0    556
Name: count, dtype: int64


Unnamed: 0,review,sentiment
0,one reviewers mention watch oz episode hook ri...,1
1,wonderful little production film technique una...,1
2,think wonderful way spend time hot summer week...,1
3,basically family little boy jake think zombie ...,0


In [241]:
final_imdb.dtypes

review       object
sentiment     int64
dtype: object

# Divide Train and Test data

In [242]:

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(final_imdb['review'],final_imdb['sentiment'],test_size=0.2,random_state=42)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)


(891,) (223,) (891,) (223,)


# Lets start Embedding Models
## 1. Bag of words 

In [243]:
from sklearn.feature_extraction.text import CountVectorizer

def bow(text):
    cv = CountVectorizer()

    #  Fit the model and transform the documents to a Bag of Words representation
    bow = cv.fit_transform([text]) #Input should be iterable so we making it as list

    # Convert the sparse matrix to an array and return it
    return dict(zip(cv.get_feature_names_out(),  bow.toarray()[0]))


imdb_df['bow'] = imdb_df['review_clean_v2'].apply(bow)
imdb_df['bow'].head(3)

0    {'acc': 1, 'agenda': 1, 'agreements': 1, 'appe...
1    {'actors': 1, 'bbc': 1, 'chosen': 1, 'come': 1...
2    {'ada': 1, 'addiction': 1, 'air': 1, 'allen': ...
Name: bow, dtype: object

In [244]:
y_train

381     1
327     0
998     0
959     0
582     1
       ..
466     1
121     1
1044    1
1095    0
860     1
Name: sentiment, Length: 891, dtype: int64

In [245]:
X_test

879     largely forget ##table tale mercenary kerman e...
101     okay last night august 18th distinct displeasu...
1111    saw movie ##s years ago literally sweep away c...
726     tv series one ones love kid even though see pi...
291     reservations movie figure would usual bill far...
                              ...                        
184     ordinary made tv product tyson attempt serious...
875     let first say believer ghost indeed know exist...
507     animation simple straightforward good vs evil ...
722     hear driving lessons itv ad ##vert honest n kn...
583     art film either make national film preservatio...
Name: review, Length: 223, dtype: object

In [246]:
from sklearn.feature_extraction.text import CountVectorizer

# max_features=500: Limits the number of features to the top 500 most frequent words.
# stop_words='english': This removes common English stop words like "the", "and", "is", etc.

cv = CountVectorizer(stop_words='english', max_features=500)
x_train_bow = cv.fit_transform(X_train) # Learn the vocabulary and transform training data into vectors

x_train_bow.toarray()

x_test_bow = cv.transform(X_test) #  Transform the test data using the same vocabulary (without fitting again)


In [247]:
print(x_train_bow.toarray())
print(cv.get_feature_names_out())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]]
['ab' 'able' 'absolutely' 'act' 'action' 'actor' 'actors' 'actress'
 'actual' 'actually' 'ad' 'add' 'age' 'al' 'alien' 'allow' 'ama'
 'american' 'animation' 'ann' 'apparently' 'appear' 'art' 'ask' 'ate'
 'attempt' 'audience' 'away' 'awful' 'bad' 'base' 'beautiful' 'begin'
 'bel' 'believe' 'best' 'better' 'big' 'bite' 'black' 'ble' 'blood' 'bo'
 'body' 'book' 'bore' 'box' 'boy' 'brain' 'break' 'brilliant' 'bring'
 'budget' 'buy' 'ca' 'camera' 'car' 'care' 'case' 'cast' 'catch' 'cause'
 'certainly' 'chance' 'change' 'character' 'che' 'child' 'children'
 'cinema' 'city' 'cl' 'class' 'classic' 'close' 'com' 'come' 'comedy'
 'comment' 'complete' 'completely' 'confuse' 'consider' 'cool' 'cop'
 'country' 'couple' 'course' 'cr' 'create' 'credit' 'cut' 'da' 'dan'
 'dark' 'day' 'days' 'dead' 'deal' 'death' 'decent' 'decide' 'definitely'
 'deliver' 'deserve' 'despite' 'di' 'dial

In [248]:
x_train_bow.shape # as we have taken max_features =500 so it had created with 500 dimension vector using BOW technique

(891, 500)

In [249]:
x_test_bow.shape

(223, 500)

## 1. Apply Naive Bayes Model
    * Naive Bayes is good for text classification like sentiment analysis when we are working on BOW and TF-DIF

In [250]:
# Naive Bayes Model for prediction
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report

# Train a classifier (Naive Bayes)
nb_model = MultinomialNB()
nb_model.fit(x_train_bow,y_train)

# Prediction
y_pred = nb_model.predict(x_test_bow)

#Evaluate the Model
print(f"accuracy_score is {accuracy_score(y_test,y_pred):.4f}")
print("classification_report ....")
print(classification_report(y_test,y_pred))

accuracy_score is 0.8610
classification_report ....
              precision    recall  f1-score   support

           0       0.83      0.91      0.87       111
           1       0.90      0.81      0.85       112

    accuracy                           0.86       223
   macro avg       0.86      0.86      0.86       223
weighted avg       0.86      0.86      0.86       223



# 2. Apply Xgboost on BOW
    * Gradient Boosting methods (like XGBoost) can often outperform simpler models but are computationally more expensive

In [251]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier()
xgb_model.fit(x_train_bow,y_train)
y_pred = xgb_model.predict(x_test_bow)
#Evaluate the Model
print(f"accuracy_score is {accuracy_score(y_test,y_pred):.4f}")
print("classification_report ....")
print(classification_report(y_test,y_pred))

accuracy_score is 0.7982
classification_report ....
              precision    recall  f1-score   support

           0       0.81      0.78      0.79       111
           1       0.79      0.81      0.80       112

    accuracy                           0.80       223
   macro avg       0.80      0.80      0.80       223
weighted avg       0.80      0.80      0.80       223



# 3. Apply Support Vector Machine (SVM) on BOW
    * SVM is often a strong choice for text classification. It works well in high-dimensional spaces (such as those in text data) and can handle both linear and non-linear classification.

In [252]:
from sklearn.svm import SVC

svm_model = SVC(kernel='linear')  # Linear kernel is usually preferred for text classification
svm_model.fit(x_train_bow,y_train)
y_pred = svm_model.predict(x_test_bow)

#Evaluate the Model
print(f"accuracy_score is {accuracy_score(y_test,y_pred):.4f}")
print("classification_report ....")
print(classification_report(y_test,y_pred))

accuracy_score is 0.7758
classification_report ....
              precision    recall  f1-score   support

           0       0.75      0.83      0.79       111
           1       0.81      0.72      0.76       112

    accuracy                           0.78       223
   macro avg       0.78      0.78      0.78       223
weighted avg       0.78      0.78      0.78       223



In [253]:
def pre_process(txt):
   # Here we are doing all pre processing steps such as stop words, digits, stemming etc
   txt = remove_stop_words(txt)
   txt = stem_review(txt)
   return txt


In [254]:
cv

In [255]:
## Prediction with new data

def predict_analysis(txt):
    # Step 1. Do Pre Process
    txt = pre_process(txt)
    print(f"after process {txt}")

    # Step 2: Apply BOW Technique to create Vector/Embeddings
    X = cv.transform([txt])
    vector = X.toarray() # Converting vector to dense format
    print(f"vector for given sentence is {vector}")
    vocab = cv.get_feature_names_out() # it will return top 500 feature names which is trained on CountVectorizer
    print(f"vocab of BOW is {vocab}")

    # Step 3: Now its time to evaluate with unknown data and predict it.

    sentiment_dict = {1:'positive',0:'negative',2:'neutral'}
    

    pred = nb_model.predict(vector)
    print('prediction result is ...................')
    print(f"Pred is {sentiment_dict[pred[0]]}")



In [256]:

predict_analysis("let first say believer ghost indeed know exist")


after process let first say believer ghost indeed know exist
vector for given sentence is [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 

# Below case is to understand above example with small Corpus

In [257]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the CountVectorizer on training data (this should be done before prediction)
corpus = ["The cat sat on the mat", "The dog barked at the cat", "Cats and dogs are friends"]

cv_bow = CountVectorizer(max_features=5)
cv_bow.fit(corpus)  # Fit on the entire training corpus

def pre_process(txt):
    # Implement your pre-processing here (e.g., lowercasing, removing punctuation, etc.)
    txt = txt.lower()
    return txt

def predict_analysis(txt):
    # Step 1: Pre-process the input text
    txt = pre_process(txt)
    print(f"After processing: {txt}")

    # Step 2: Transform the new text into a vector using the fitted CountVectorizer
    X = cv_bow.transform([txt])
    vector = X.toarray()
    print(f"Vector: {vector}")

    # Step 3: Get the vocabulary (feature names)
    vocab = cv_bow.get_feature_names_out()
    print(f"Vocabulary: {vocab}")

    # If you have a prediction model (e.g., Naive Bayes)
    # here we can start using ML Models for predictions similar to above IMDB Dataset example


* As we had mentioned max_features =5 so vector dimension will be 5 which takes most repeating words of top 5 from given corpus while doing train

In [258]:
predict_analysis("Cat is a pet") # Note: in this small example, we did not apply much pre processing steps eg: stop words etc

After processing: cat is a pet
Vector: [[0 0 0 1 0]]
Vocabulary: ['and' 'are' 'at' 'cat' 'the']
