In [1]:
import numpy as np
import pandas as pd
import nltk

In [2]:
df=pd.read_csv('IMDB Dataset.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [5]:
df['review']=df['review'].str.lower()
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [6]:
from bs4 import BeautifulSoup

def remove_html_tags(text):
    soup= BeautifulSoup(text,"html.parser")
    return soup.get_text()

In [7]:
df['review']=df['review'].apply(remove_html_tags)

  soup= BeautifulSoup(text,"html.parser")


###### This is an important preprocessing step in NLP to clean the text data by removing unnecessary HTML elements.

In [8]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [9]:
pip install contractions

Note: you may need to restart the kernel to use updated packages.


In [10]:
import contractions
def expand_contractions(text):
    return contractions.fix(text)

In [11]:
df['review']=df['review'].apply(expand_contractions)

###### This cell installs the contractions library, which is used to expand contracted words (e.g., "don't" to "do not") in text data. This is another important preprocessing step in NLP to standardize the text.

In [12]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there is a family where a little boy...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i am going to have to disagree with the previo...,negative


In [13]:
import string
def remove_punctuation(text):
    return text.translate(str.maketrans('','',string.punctuation))

In [14]:
df['review']=df['review'].apply(remove_punctuation)

###### The function remove_punctuation is used to clean text data by stripping out all punctuation marks. This is useful in text preprocessing for tasks like sentiment analysis, where punctuation may not be relevant or could interfere with tokenization and other text processing steps.

In [15]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there is a family where a little boy...,negative
4,petter matteis love in the time of money is a ...,positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i am going to have to disagree with the previo...,negative


In [16]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
def tokenize(text):
    return word_tokenize(text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Harsha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
df['review']=df['review'].apply(tokenize)

###### The tokenize function is used to break down a sentence or piece of text into individual words or tokens. This is a crucial step in NLP tasks, such as sentiment analysis, text classification, or any other form of text processing, as it allows the model to work with the smallest meaningful units of the text.

In [18]:
df

Unnamed: 0,review,sentiment
0,"[one, of, the, other, reviewers, has, mentione...",positive
1,"[a, wonderful, little, production, the, filmin...",positive
2,"[i, thought, this, was, a, wonderful, way, to,...",positive
3,"[basically, there, is, a, family, where, a, li...",negative
4,"[petter, matteis, love, in, the, time, of, mon...",positive
...,...,...
49995,"[i, thought, this, movie, did, a, down, right,...",positive
49996,"[bad, plot, bad, dialogue, bad, acting, idioti...",negative
49997,"[i, am, a, catholic, taught, in, parochial, el...",negative
49998,"[i, am, going, to, have, to, disagree, with, t...",negative


In [19]:
from nltk.stem import PorterStemmer
stemmer= PorterStemmer()
def stem_words(words):
    stemmed_words=[stemmer.stem(word) for word in words]
    return stemmed_words

In [20]:
df['review']=df['review'].apply(stem_words)

###### The stem_words function is used to reduce each word in a list of words to its root form using the Porter Stemmer. Stemming is a common preprocessing step in NLP, especially in tasks like text classification or information retrieval, where it's beneficial to reduce words to their base forms to treat different inflections of the same word as equivalent.

In [21]:
df

Unnamed: 0,review,sentiment
0,"[one, of, the, other, review, ha, mention, tha...",positive
1,"[a, wonder, littl, product, the, film, techniq...",positive
2,"[i, thought, thi, wa, a, wonder, way, to, spen...",positive
3,"[basic, there, is, a, famili, where, a, littl,...",negative
4,"[petter, mattei, love, in, the, time, of, mone...",positive
...,...,...
49995,"[i, thought, thi, movi, did, a, down, right, g...",positive
49996,"[bad, plot, bad, dialogu, bad, act, idiot, dir...",negative
49997,"[i, am, a, cathol, taught, in, parochi, elemen...",negative
49998,"[i, am, go, to, have, to, disagre, with, the, ...",negative


In [22]:
print(df.loc[0][0])

['one', 'of', 'the', 'other', 'review', 'ha', 'mention', 'that', 'after', 'watch', 'just', '1', 'oz', 'episod', 'you', 'will', 'be', 'hook', 'they', 'are', 'right', 'as', 'thi', 'is', 'exactli', 'what', 'happen', 'with', 'meth', 'first', 'thing', 'that', 'struck', 'me', 'about', 'oz', 'wa', 'it', 'brutal', 'and', 'unflinch', 'scene', 'of', 'violenc', 'which', 'set', 'in', 'right', 'from', 'the', 'word', 'go', 'trust', 'me', 'thi', 'is', 'not', 'a', 'show', 'for', 'the', 'faint', 'heart', 'or', 'timid', 'thi', 'show', 'pull', 'no', 'punch', 'with', 'regard', 'to', 'drug', 'sex', 'or', 'violenc', 'it', 'is', 'hardcor', 'in', 'the', 'classic', 'use', 'of', 'the', 'wordit', 'is', 'call', 'oz', 'as', 'that', 'is', 'the', 'nicknam', 'given', 'to', 'the', 'oswald', 'maximum', 'secur', 'state', 'penitentari', 'it', 'focus', 'mainli', 'on', 'emerald', 'citi', 'an', 'experiment', 'section', 'of', 'the', 'prison', 'where', 'all', 'the', 'cell', 'have', 'glass', 'front', 'and', 'face', 'inward', '

In [23]:
!pip install scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer= TfidfVectorizer()



###### The TF-IDF vectorizer converts text data into numerical values that represent the importance of each word in a document relative to a collection of documents. This is useful for understanding the relevance of words in documents, especially in tasks like text classification or clustering.

In [24]:
df['review']=df['review'].apply(lambda tokens: ' '.join(tokens))

In [25]:
vectorizer1=TfidfVectorizer(max_features=10000)

In [26]:
tfidf=vectorizer1.fit_transform(df['review'])

In [27]:
tfidf_df=pd.DataFrame(tfidf.toarray(),columns=vectorizer1.get_feature_names_out())

In [28]:
tfidf_df

Unnamed: 0,007,010,10,100,1000,10000,101,1010,11,110,...,zoe,zoey,zombi,zone,zoo,zoom,zorro,zu,zucco,zucker
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.072648,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.104765,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49996,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49997,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49998,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
vectorizer1.get_feature_names_out()

array(['007', '010', '10', ..., 'zu', 'zucco', 'zucker'], dtype=object)

In [30]:
df_tfidf = pd.concat([df['sentiment'], tfidf_df.drop('sentiment',axis =1 )], axis=1)

In [31]:
df_tfidf

Unnamed: 0,sentiment,007,010,10,100,1000,10000,101,1010,11,...,zoe,zoey,zombi,zone,zoo,zoom,zorro,zu,zucco,zucker
0,positive,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,positive,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,positive,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,negative,0.0,0.0,0.072648,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.104765,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,positive,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,positive,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49996,negative,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49997,negative,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49998,negative,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
df_tfidf['sentiment']

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 50000, dtype: object

In [33]:
df_tfidf['sentiment']=df_tfidf['sentiment'].apply(lambda x:1 if x == 'positive' else 0)

In [34]:
df_tfidf

Unnamed: 0,sentiment,007,010,10,100,1000,10000,101,1010,11,...,zoe,zoey,zombi,zone,zoo,zoom,zorro,zu,zucco,zucker
0,1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0.0,0.0,0.072648,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.104765,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49996,0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49997,0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49998,0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
X=df_tfidf.drop('sentiment',axis=1)
y=df_tfidf['sentiment']

In [36]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [37]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(X_train,y_train)

In [38]:
y_pred=rf.predict(X_test)

In [39]:
from sklearn.metrics import precision_score, recall_score, f1_score,accuracy_score
accuracy_score(y_test,y_pred)

0.8446

In [40]:
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Precision: 0.84
Recall: 0.84
F1 Score: 0.84
