In [1]:
import numpy as np 
import pandas as pd

In [186]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [187]:
print(df['review'][0])

One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fac

# Text Cleaning
### 1. Sample 10000 words
### 2. Remove html tags
### 3. Remove Special characters
### 4. Convereting every thing to lower case
### 5. Removing Stop words(words like and,or,the,from,...)
### 6. Stemming (basic version of same words like play,playing,played to 'play')

In [188]:
df = df.sample(10000)

In [189]:
df.head()

Unnamed: 0,review,sentiment
26341,Gordon goes over the top in typical Full Moon ...,negative
8482,I have been reading the reviews for this movie...,negative
24071,Don't get the impression from other reviewers ...,negative
27013,Jamie Foxx absolutely IS Ray Charles. His perf...,positive
31700,This movie serves up every imaginable Greek st...,negative


In [190]:
df.shape

(10000, 2)

In [191]:
  
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 26341 to 44892
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  object
dtypes: object(2)
memory usage: 234.4+ KB


In [192]:
df['sentiment'].replace({'positive':1,'negative':0},inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sentiment'].replace({'positive':1,'negative':0},inplace = True)
  df['sentiment'].replace({'positive':1,'negative':0},inplace = True)


In [193]:
df.head()

Unnamed: 0,review,sentiment
26341,Gordon goes over the top in typical Full Moon ...,0
8482,I have been reading the reviews for this movie...,0
24071,Don't get the impression from other reviewers ...,0
27013,Jamie Foxx absolutely IS Ray Charles. His perf...,1
31700,This movie serves up every imaginable Greek st...,0


In [194]:
import re
clean = re.compile('<.*?>')
re.sub(clean,'',df.iloc[0].review)

"Gordon goes over the top in typical Full Moon fashion, but that's to be expected. Combs is surprisingly low-key, keeping his performance at a more realistic level than we are used to seeing. Also gone is the usual Stuart Gordon 'tongue-in-cheek' black humor.The film is quite effective in showing Combs' break down and his final heroic act to save his wife & daughter. You actually feel sympathy for his character, despite his short-comings.Personally, I was more surprised at the nudity and borderline porno sex scene, than I was shocked by the graphic violence & gore.Not classic Gordon, but certainly something you might enjoy if you've seen his more famous films."

In [195]:
# Function to clean html tags
def clean_html(text):
  clean = re.compile('<.*?>')
  return re.sub(clean,'',text)

In [196]:
df['review'] = df['review'].apply(clean_html)

In [197]:
# Converting everything to lowercase
def convert_lower(text):
  return text.lower()

In [198]:
df['review'] = df['review'].apply(convert_lower)

In [199]:
# Function to remove special characters

def remove_special(text):
  x= ''

  for i in text:
    if i.isalnum():
      x = x+i
    else:
      x = x + ' '
  return x

In [200]:
remove_special('hii i am om%mishra studying in * bsc.*i*t second year+_()@')

'hii i am om mishra studying in   bsc  i t second year     '

In [201]:
df['review'] = df['review'].apply(remove_special)

In [202]:
df.head(10)

Unnamed: 0,review,sentiment
26341,gordon goes over the top in typical full moon ...,0
8482,i have been reading the reviews for this movie...,0
24071,don t get the impression from other reviewers ...,0
27013,jamie foxx absolutely is ray charles his perf...,1
31700,this movie serves up every imaginable greek st...,0
8325,after reading more than my fair share of revie...,1
44100,i was one of about 200 people that was lucky e...,1
24211,i simply love this movie it is a perfect exam...,1
21972,the other day i showed my boyfriend a great mo...,0
44523,the cast alone tells you this will be a notch ...,1


In [203]:
# Removing the stop words
import nltk

In [None]:
nltk.download('stopwords') 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\omm47\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [208]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()
stops = set(stopwords.words('english'))

def preprocess(text):
    # Lowercase
    text = text.lower()
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize into words
    words = word_tokenize(text)
    # Remove stopwords and short words
    words = [ps.stem(w) for w in words if w not in stops and len(w) > 1]
    # Join back into a string
    return ' '.join(words)

# Apply to your DataFrame
df['review'] = df['review'].apply(preprocess)


In [204]:
from nltk.corpus import stopwords

In [205]:
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [206]:
df.head()

Unnamed: 0,review,sentiment
26341,gordon goes over the top in typical full moon ...,0
8482,i have been reading the reviews for this movie...,0
24071,don t get the impression from other reviewers ...,0
27013,jamie foxx absolutely is ray charles his perf...,1
31700,this movie serves up every imaginable greek st...,0


In [207]:
# Function to remove stop words
stops = set(stopwords.words('english'))
def remove_stopwords(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation and non-alphabetic characters
    text = re.sub(r'[^a-z\s]', '', text)
    # Split into tokens
    words = text.split()
    # Remove stopwords
    filtered_words = [word for word in words if word not in stops]
    # Join back into a string
    return ' '.join(filtered_words)

In [158]:
df['review'] = df['review'].apply(remove_stopwords)

In [159]:
df.head(10)

Unnamed: 0,review,sentiment
23559,cousins watched movie ever since little know e...,1
24131,stan laurel oliver hardy famous comedy duo his...,0
16358,film stinks limburger cheese find garage sale ...,0
23580,admit thought movie going good soon changed mi...,1
39756,love watching jerry much rest world poor excus...,0
12340,remember seeing trailer movie first released l...,1
38498,usual leader leo gorcey slip mahoney bowery bo...,0
29735,question family go new house get stalked demon...,0
7082,saw movie aired lifetime back never seen since...,1
13568,years madonna tried prove public eye act unfor...,0


In [160]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()

def stem_words(word_list):
    return [ps.stem(word) for word in word_list]
                        # or ' '.join(stemmed) if you want a string


In [161]:
df['review'] = df['review'].apply(stem_words)

In [162]:
df.head(10)

Unnamed: 0,review,sentiment
23559,"[c, o, u, s, i, n, s, , w, a, t, c, h, e, d, ...",1
24131,"[s, t, a, n, , l, a, u, r, e, l, , o, l, i, ...",0
16358,"[f, i, l, m, , s, t, i, n, k, s, , l, i, m, ...",0
23580,"[a, d, m, i, t, , t, h, o, u, g, h, t, , m, ...",1
39756,"[l, o, v, e, , w, a, t, c, h, i, n, g, , j, ...",0
12340,"[r, e, m, e, m, b, e, r, , s, e, e, i, n, g, ...",1
38498,"[u, s, u, a, l, , l, e, a, d, e, r, , l, e, ...",0
29735,"[q, u, e, s, t, i, o, n, , f, a, m, i, l, y, ...",0
7082,"[s, a, w, , m, o, v, i, e, , a, i, r, e, d, ...",1
13568,"[y, e, a, r, s, , m, a, d, o, n, n, a, , t, ...",0


In [163]:
# Join Back
def join_back(list_input):
    return ' '.join(list_input)

In [164]:
df['review'] = df['review'].apply(join_back)

In [165]:
df.head(10)

Unnamed: 0,review,sentiment
23559,c o u s i n s w a t c h e d m o v i e e ...,1
24131,s t a n l a u r e l o l i v e r h a r d ...,0
16358,f i l m s t i n k s l i m b u r g e r c ...,0
23580,a d m i t t h o u g h t m o v i e g o i ...,1
39756,l o v e w a t c h i n g j e r r y m u c ...,0
12340,r e m e m b e r s e e i n g t r a i l e r ...,1
38498,u s u a l l e a d e r l e o g o r c e y ...,0
29735,q u e s t i o n f a m i l y g o n e w ...,0
7082,s a w m o v i e a i r e d l i f e t i m ...,1
13568,y e a r s m a d o n n a t r i e d p r o ...,0


In [77]:
X = df.iloc[:,0:1].values

In [78]:
X.shape

(10000, 1)

In [209]:
df.head(20)

Unnamed: 0,review,sentiment
26341,gordon goe top typic full moon fashion expect ...,0
8482,read review movi wan na kill self wan na live ...,0
24071,get impress review film stink co ambival japan...,0
27013,jami foxx absolut ray charl perform simpli gen...,1
31700,movi serv everi imagin greek stereotyp one par...,0
8325,read fair share review vast number differ movi...,1
44100,one peopl lucki enough see earli sneak film st...,1
24211,simpli love movi perfect exampl well round sur...,1
21972,day show boyfriend great movi stand movi shown...,0
44523,cast alon tell notch usual italian western vet...,1


In [210]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 500)

In [211]:
cv = CountVectorizer(analyzer='char')  # treats each character as a token
X = cv.fit_transform(df['review']).toarray()

In [212]:
X.shape

(10000, 27)

In [213]:
y = df.iloc[:,-1].values

In [214]:
X[0]

array([65, 20,  5, 20,  7, 39, 11, 11, 13, 24,  1,  6, 19, 12, 18, 33, 12,
        1, 24, 24, 20, 14,  3,  3,  2,  4,  0])

In [215]:
y = df.iloc[:,-1].values

In [216]:
y.shape

(10000,)

In [217]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)

In [218]:
X_train.shape

(8000, 27)

In [219]:
X_test.shape

(2000, 27)

In [220]:
y_train.shape

(8000,)

In [221]:
y_test.shape

(2000,)

In [222]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
clf1 = GaussianNB()
clf2 = MultinomialNB()
clf3 = BernoulliNB()

In [223]:
clf1.fit(X_train,y_train)
clf2.fit(X_train,y_train)
clf3.fit(X_train,y_train)

In [224]:
y_pred1 = clf1.predict(X_test)
y_pred2 = clf2.predict(X_test)
y_pred3 = clf3.predict(X_test)

In [225]:
y_test.shape

(2000,)

In [226]:
y_pred1.shape

(2000,)

In [227]:
from sklearn.metrics import accuracy_score

In [228]:
print('Guassian:',accuracy_score(y_test,y_pred1))
print('Mulitnomial:',accuracy_score(y_test,y_pred2))
print('Bernoulli:',accuracy_score(y_test,y_pred3))

Guassian: 0.512
Mulitnomial: 0.6065
Bernoulli: 0.544
