In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import re

In [2]:
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [5]:
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [6]:
review = df['review']
sentiment = df['sentiment']

In [7]:
review

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [8]:
gg = pd.DataFrame(df)

# Function to remove symbols from a text using regex
def remove_symbols(text):
    # Define a regex pattern to match any non-alphanumeric characters and symbols
    pattern = r"[^\w\s]"
    # Replace the matched symbols with an empty string (i.e., remove them)
    clean_text = re.sub(pattern, "", text)
    return clean_text

# Apply the remove_symbols function to the "Review" column
df["review"] = df["review"].apply(remove_symbols)

print(df)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production br br The filmin...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically theres a family where a little boy J...  negative
4      Petter Matteis Love in the Time of Money is a ...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot bad dialogue bad acting idiotic direc...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  Im going to have to disagree with the previous...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


In [9]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production br br The filmin...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically theres a family where a little boy J...,negative
4,Petter Matteis Love in the Time of Money is a ...,positive


In [10]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df['review'], df['sentiment'],
                                                   test_size=0.3, random_state=101)

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [12]:
classification_model = Pipeline([('tfidf', TfidfVectorizer()),
                                ('RandomForestClassifier', RandomForestClassifier())])

In [13]:
classification_model.fit(x_train, y_train)

In [14]:
y_pred_train = classification_model.predict(x_train)
y_pred_test = classification_model.predict(x_test)

In [15]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [16]:
print(confusion_matrix(y_train, y_pred_train))
print("************************"*5)
print(confusion_matrix(y_test, y_pred_test))

[[17526     0]
 [    0 17474]]
************************************************************************************************************************
[[6328 1146]
 [1231 6295]]


In [17]:
print(classification_report(y_train, y_pred_train))
print("************************"*5)
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00     17526
    positive       1.00      1.00      1.00     17474

    accuracy                           1.00     35000
   macro avg       1.00      1.00      1.00     35000
weighted avg       1.00      1.00      1.00     35000

************************************************************************************************************************
              precision    recall  f1-score   support

    negative       0.84      0.85      0.84      7474
    positive       0.85      0.84      0.84      7526

    accuracy                           0.84     15000
   macro avg       0.84      0.84      0.84     15000
weighted avg       0.84      0.84      0.84     15000



In [18]:
print("Training Accuracy", accuracy_score(y_train, y_pred_train))
print("************************"*5)
print("Test Accuracy", accuracy_score(y_test, y_pred_test))

Training Accuracy 1.0
************************************************************************************************************************
Test Accuracy 0.8415333333333334


In [41]:
test1 = ["waste of money"]
test2 = ["i wanted to leave after 1hour it was boring"]
test3 = ["great movie."]
test4 = ["i liked it very much"]
test5 = ["bad"]

In [42]:

print(classification_model.predict(test1))
print("*****************"*5)
print(classification_model.predict(test2))
print("*****************"*5)
print(classification_model.predict(test3))
print("*****************"*5)
print(classification_model.predict(test4))
print("*****************"*5)
print(classification_model.predict(test5))

['negative']
*************************************************************************************
['negative']
*************************************************************************************
['positive']
*************************************************************************************
['positive']
*************************************************************************************
['negative']


In [21]:
from sklearn.feature_extraction.text import CountVectorizer
classification_model1 = Pipeline([('bag_of_words', CountVectorizer()),
                                ('RandomForestClassifier', RandomForestClassifier())])

In [22]:
classification_model1.fit(x_train, y_train)

In [23]:
y_pred_train = classification_model1.predict(x_train)
y_pred_test = classification_model1.predict(x_test)

In [24]:
print(confusion_matrix(y_train, y_pred_train))
print("************************"*5)
print(confusion_matrix(y_test, y_pred_test))

[[17526     0]
 [    0 17474]]
************************************************************************************************************************
[[6346 1128]
 [1082 6444]]


In [25]:
print(classification_report(y_train, y_pred_train))
print("************************"*5)
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00     17526
    positive       1.00      1.00      1.00     17474

    accuracy                           1.00     35000
   macro avg       1.00      1.00      1.00     35000
weighted avg       1.00      1.00      1.00     35000

************************************************************************************************************************
              precision    recall  f1-score   support

    negative       0.85      0.85      0.85      7474
    positive       0.85      0.86      0.85      7526

    accuracy                           0.85     15000
   macro avg       0.85      0.85      0.85     15000
weighted avg       0.85      0.85      0.85     15000



In [26]:
print("Training Accuracy", accuracy_score(y_train, y_pred_train))
print("************************"*5)
print("Test Accuracy", accuracy_score(y_test, y_pred_test))

Training Accuracy 1.0
************************************************************************************************************************
Test Accuracy 0.8526666666666667


In [35]:
test1 = ["shittest movie ever"]
test2 = ["i wanted to leave after 1hour it was boring"]
test3 = ["great movie."]
test4 = ["i liked it very much"]
test5 = ["boring"]

In [36]:
print(classification_model1.predict(test1))
print("*****************"*5)
print(classification_model1.predict(test2))
print("*****************"*5)
print(classification_model1.predict(test3))
print("*****************"*5)
print(classification_model1.predict(test4))
print("*****************"*5)
print(classification_model1.predict(test5))

['negative']
*************************************************************************************
['negative']
*************************************************************************************
['positive']
*************************************************************************************
['positive']
*************************************************************************************
['negative']
