In [110]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import openpyxl
import nltk
from bs4 import BeautifulSoup
import sklearn


import warnings
warnings.filterwarnings('ignore')

In [92]:
os.listdir("../Datasets")

['1.py', 'IMDB_Dataset.xlsx']

In [93]:
imdb_df = pd.read_excel("../Datasets/IMDB_Dataset.xlsx")
imdb_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## 1. Exploratory Data Analysis 

In [94]:
imdb_df.describe()

Unnamed: 0,review,sentiment
count,1114,1114
unique,1114,2
top,One of the other reviewers has mentioned that ...,positive
freq,1,558


In [95]:
imdb_df['sentiment'].value_counts()

sentiment
positive    558
negative    556
Name: count, dtype: int64

In [96]:
# Text Normalization 
# 1. Removing Stop words

stop_words_list = nltk.corpus.stopwords.words('english')
stop_words_list[:3]


['i', 'me', 'my']

In [97]:
# 2. Removing html strips  and punctuations
import string

exclude_list = string.punctuation

def strip_html(text):
    bsp = BeautifulSoup(text,"html.parser")
    return bsp.get_text()

def remove_punc(text):
    for char in exclude_list:
        txt = text.replace(char,"")
    return txt


def remove_noise_data(text):
    text = strip_html(text)
    text = remove_punc(text)
    return text


In [98]:
imdb_df['review_clean'] = imdb_df['review'].apply(remove_noise_data)

In [99]:
imdb_df.head(10)

Unnamed: 0,review,sentiment,review_clean
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production. The filming tec...
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,Basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"Petter Mattei's ""Love in the Time of Money"" is..."
5,"Probably my all-time favorite movie, a story o...",positive,"Probably my all-time favorite movie, a story o..."
6,I sure would like to see a resurrection of a u...,positive,I sure would like to see a resurrection of a u...
7,"This show was an amazing, fresh & innovative i...",negative,"This show was an amazing, fresh & innovative i..."
8,Encouraged by the positive comments about this...,negative,Encouraged by the positive comments about this...
9,If you like original gut wrenching laughter yo...,positive,If you like original gut wrenching laughter yo...


# Stemming 
    * Its process of reducing word to its word stem
    * eg: we have review as "eating,eat,eaten ",all these words refers to same so we no need to have all these words instead of that just have one word.


In [100]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def stem_review(text):
    # Initialize the Lemmatizer
    lt = WordNetLemmatizer()

    # Tokenize and lemmatize each word
    tokens = word_tokenize(text)
    lemmatized_tokens = [lt.lemmatize(word,pos='v') for word in tokens]

    text = ' '.join(lemmatized_tokens)
    return text



In [101]:
imdb_df['review_clean_v1'] = imdb_df['review_clean'].apply(stem_review)

In [102]:
imdb_df.head(10)

Unnamed: 0,review,sentiment,review_clean,review_clean_v1
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...,One of the other reviewers have mention that a...
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production. The filming tec...,A wonderful little production . The film techn...
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,I think this be a wonderful way to spend time ...
3,Basically there's a family where a little boy ...,negative,Basically there's a family where a little boy ...,Basically there 's a family where a little boy...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"Petter Mattei's ""Love in the Time of Money"" is...",Petter Mattei 's `` Love in the Time of Money ...
5,"Probably my all-time favorite movie, a story o...",positive,"Probably my all-time favorite movie, a story o...","Probably my all-time favorite movie , a story ..."
6,I sure would like to see a resurrection of a u...,positive,I sure would like to see a resurrection of a u...,I sure would like to see a resurrection of a u...
7,"This show was an amazing, fresh & innovative i...",negative,"This show was an amazing, fresh & innovative i...","This show be an amaze , fresh & innovative ide..."
8,Encouraged by the positive comments about this...,negative,Encouraged by the positive comments about this...,Encouraged by the positive comment about this ...
9,If you like original gut wrenching laughter yo...,positive,If you like original gut wrenching laughter yo...,If you like original gut wrench laughter you w...


## Removing Stop words

In [103]:
from transformers import BertTokenizer

# Load the pre-trained BERT tokenizer once
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def remove_stop_words(text):
    # Tokenize into sub words
    tokens = tokenizer.tokenize(text)

    # Remove stop words and punctuation in one go
    tokens =[token.strip() for token in tokens if token not in stop_words_list and token not in string.punctuation]

    # Reassemble the tokens into a single string
    text = ' '.join(tokens)
    
    return text

In [107]:
imdb_df['review_clean_v2'] = imdb_df['review_clean_v1'].apply(remove_stop_words)

In [108]:
imdb_df.head()

Unnamed: 0,review,sentiment,review_clean,review_clean_v1,review_clean_v2
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...,One of the other reviewers have mention that a...,one reviewers mention watch 1 oz episode hook ...
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production. The filming tec...,A wonderful little production . The film techn...,wonderful little production film technique una...
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,I think this be a wonderful way to spend time ...,think wonderful way spend time hot summer week...
3,Basically there's a family where a little boy ...,negative,Basically there's a family where a little boy ...,Basically there 's a family where a little boy...,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"Petter Mattei's ""Love in the Time of Money"" is...",Petter Mattei 's `` Love in the Time of Money ...,pet ##ter matt ##ei love time money visually s...


# Lets start Embedding Models
## 1. Bag of words 

In [114]:
from sklearn.feature_extraction.text import CountVectorizer

def bow(text):
    cv = CountVectorizer()

    #  Fit the model and transform the documents to a Bag of Words representation
    bow = cv.fit_transform([text]) #Input should be iterable so we making it as list

    # Convert the sparse matrix to an array and return it
    return bow.toarray()