In [1]:
import numpy as np
import pandas as pd

In [2]:
temp_df = pd.read_csv("IMDB Dataset.csv")

In [3]:
df = temp_df.iloc[:30000]

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [6]:
df['sentiment'].value_counts()

sentiment
positive    15015
negative    14985
Name: count, dtype: int64

In [7]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [8]:
df.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)


In [9]:
df.duplicated().sum()

0

In [10]:
# Basic Preprocessing
# Remove tags
# lowercase
# remove stopwords

In [11]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [12]:
# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\princ\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\princ\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\princ\AppData\Roaming\nltk_data...


True

In [13]:
# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


In [14]:
def preprocess_review(review):
    # Remove HTML tags
    review = BeautifulSoup(review, "html.parser").get_text()
    
    # Remove non-alphabetic characters
    review = re.sub("[^a-zA-Z]", " ", review)
    
    # Convert to lowercase
    review = review.lower()
    
    # Tokenize the review
    words = review.split()
    
    # Remove stop words and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    # Join the words back into one string separated by space
    return " ".join(words)


In [17]:
# Apply preprocessing to the 'review' column
df['review'] = df['review'].apply(preprocess_review)

print(df[['review', 'sentiment']])


                                                  review sentiment
0      one reviewer mentioned watching oz episode hoo...  positive
1      wonderful little production filming technique ...  positive
2      thought wonderful way spend time hot summer we...  positive
3      basically family little boy jake think zombie ...  negative
4      petter mattei love time money visually stunnin...  positive
...                                                  ...       ...
29995  new york love finally make shore short story l...  positive
29996  movie make wish imdb would let vote zero one t...  negative
29997  space camp unfortunate luck planned around tim...  negative
29998  octavio paz mexican poet writer diplomat recei...  positive
29999  watched minute movie bewildered watched minute...  negative

[29854 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(preprocess_review)


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [20]:
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the 'cleaned_review' column
X = tfidf_vectorizer.fit_transform(df['review']).toarray()


In [21]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the 'sentiment' column
y = label_encoder.fit_transform(df['sentiment'])


In [22]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [23]:
# Initialize and train Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred_nb = nb_classifier.predict(X_test)

# Evaluate the classifier
print("Naive Bayes Classifier Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Classification Report:\n", classification_report(y_test, y_pred_nb))


Naive Bayes Classifier Accuracy: 0.8516161446993803
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85      2941
           1       0.85      0.86      0.85      3030

    accuracy                           0.85      5971
   macro avg       0.85      0.85      0.85      5971
weighted avg       0.85      0.85      0.85      5971



In [25]:
# Initialize and train Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42,n_jobs=-1)
rf_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_classifier.predict(X_test)

# Evaluate the classifier
print("Random Forest Classifier Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))


Random Forest Classifier Accuracy: 0.8392229107352203
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.85      0.84      2941
           1       0.85      0.83      0.84      3030

    accuracy                           0.84      5971
   macro avg       0.84      0.84      0.84      5971
weighted avg       0.84      0.84      0.84      5971

