# Setup: Import Required Libraries

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import nltk

In [26]:
# Download NLTK resources if not already available
# nltk.download() skips the fetch when the local copy is already up to date.
nltk.download('stopwords')  # word lists read by stopwords.words('english')
nltk.download('punkt')      # Punkt models used by word_tokenize on older NLTK releases
# Recent NLTK releases load the 'punkt_tab' resource for word_tokenize instead of
# 'punkt'; without it tokenization raises LookupError on a fresh environment.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sidba\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sidba\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Step 1: Load the dataset 

In [4]:
# Read the raw reviews from a CSV file in the current working directory.
data = pd.read_csv(filepath_or_buffer='IMDB Dataset.csv')

In [5]:
# Preview the first five rows of the raw dataset
print(data.head(n=5))

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [6]:
# Check the structure and missing values in the dataset
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() adds a stray "None" line to the output.
data.info()
# Per-column count of missing values.
print(data.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None
review       0
sentiment    0
dtype: int64


# Step 2: Data Cleaning

In [27]:
# Remove HTML tags, special characters, and convert text to lowercase
def clean_text(text):
    """Normalize a raw review into lowercase, letters-only, single-spaced text.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup such as ``<br />``.

    Returns
    -------
    str
        The text with HTML tags replaced by spaces, every character other
        than ASCII letters and whitespace removed, whitespace runs collapsed
        to a single space, leading/trailing whitespace stripped, and all
        letters lowercased.
    """
    # Replace each tag with a space rather than nothing: reviews often contain
    # "sentence.<br /><br />Next" with no surrounding spaces, and deleting the
    # tags outright would glue the neighbours into one bogus token.
    text = re.sub(r'<.*?>', ' ', text)
    # Drop digits and punctuation (apostrophes included, so "it's" -> "its").
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Collapse the whitespace left behind by the substitutions above.
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

In [28]:
# Overwrite the review column with its cleaned form, one review at a time.
data['review'] = data['review'].map(clean_text)

In [29]:
# Preview the first five rows after cleaning
print(data.head(n=5))

                                              review sentiment
0  one of the other reviewers has mentioned that ...  positive
1  a wonderful little production the filming tech...  positive
2  i thought this was a wonderful way to spend ti...  positive
3  basically theres a family where a little boy j...  negative
4  petter matteis love in the time of money is a ...  positive


# Step 3: Tokenization

In [32]:
# Split every cleaned review into a list of word tokens with NLTK
data['tokens'] = data['review'].map(word_tokenize)

In [33]:
# Show cleaned text next to its token list for the first five rows
print(data.loc[:, ['review', 'tokens']].head(n=5))

                                              review  \
0  one of the other reviewers has mentioned that ...   
1  a wonderful little production the filming tech...   
2  i thought this was a wonderful way to spend ti...   
3  basically theres a family where a little boy j...   
4  petter matteis love in the time of money is a ...   

                                              tokens  
0  [one, of, the, other, reviewers, has, mentione...  
1  [a, wonderful, little, production, the, filmin...  
2  [i, thought, this, was, a, wonderful, way, to,...  
3  [basically, theres, a, family, where, a, littl...  
4  [petter, matteis, love, in, the, time, of, mon...  


# Step 4: Remove Stopwords

In [34]:
# English stopword lookup, built once as a set so membership tests are cheap.
# NOTE(review): the NLTK list contains apostrophe forms such as "don't", while
# clean_text strips apostrophes ("dont"), so those entries presumably never
# match here - confirm whether that is intended.
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """Return the tokens that are not in ``stop_words``, preserving their order."""
    kept = []
    for word in tokens:
        if word in stop_words:
            continue
        kept.append(word)
    return kept

# Replace each token list with its stopword-free version.
data['tokens'] = data['tokens'].map(remove_stopwords)


In [35]:
# Rebuild a space-separated string from each token list (optional; needed when
# the downstream model or vectorizer expects text rather than lists)
data['processed_review'] = data['tokens'].map(' '.join)

In [36]:
# Compare cleaned text with its stopword-free form for the first five rows
print(data.loc[:, ['review', 'processed_review']].head(n=5))


                                              review  \
0  one of the other reviewers has mentioned that ...   
1  a wonderful little production the filming tech...   
2  i thought this was a wonderful way to spend ti...   
3  basically theres a family where a little boy j...   
4  petter matteis love in the time of money is a ...   

                                    processed_review  
0  one reviewers mentioned watching oz episode yo...  
1  wonderful little production filming technique ...  
2  thought wonderful way spend time hot summer we...  
3  basically theres family little boy jake thinks...  
4  petter matteis love time money visually stunni...  


# Step 5: TF-IDF Feature Extraction and Train/Test Split

In [37]:
# Create the TF-IDF vectorizer, capping the vocabulary at 5000 terms
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
)


In [38]:
# Learn the vocabulary and IDF weights from the processed reviews and build
# the document-term matrix in one pass.
# NOTE(review): this fit uses every review, including those that later land in
# the test split.
processed_reviews = data['processed_review']
X = tfidf_vectorizer.fit_transform(processed_reviews)


In [39]:
# Encode sentiment labels as integers: 1 for 'positive', 0 for anything else
y = data['sentiment'].eq('positive').astype('int64')


In [40]:
# Split data into training and testing sets
# The text is split first and TF-IDF is fitted on the training reviews only, so
# the vocabulary and IDF weights are not influenced by the held-out test reviews
# (fitting on the full corpus before splitting leaks test information).
# With the same random_state and row count, the rows assigned to each side are
# the same ones a split of the pre-computed matrix would have produced.
train_reviews, test_reviews, y_train, y_test = train_test_split(
    data['processed_review'], y, test_size=0.2, random_state=42
)
# NOTE: this refits tfidf_vectorizer, so from here on it matches X_train/X_test;
# the full-corpus matrix X remains available for inspection only.
X_train = tfidf_vectorizer.fit_transform(train_reviews)
X_test = tfidf_vectorizer.transform(test_reviews)

In [41]:
# Report the TF-IDF matrix dimensions as (documents, features)
n_documents, n_features = X.shape
print("TF-IDF Matrix Shape:", (n_documents, n_features))

TF-IDF Matrix Shape: (50000, 5000)


# Step 6: Save Preprocessed Data 


In [44]:
# Write every column of the frame to CSV without the row index.
data.to_csv(path_or_buf='cleaned_IMDB_Dataset_with_Tokens.csv', index=False)