In [2]:
# Core libraries for data handling and text preprocessing
import os
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords


In [None]:
# Read the raw IMDB reviews CSV into a DataFrame
df_reviews = pd.read_csv(filepath_or_buffer="data/raw/IMDB Dataset.csv")


In [None]:
# Build a deliberately imbalanced subset (this creates imbalance rather than
# reducing it): the first 9000 positive rows and the first 1000 negative rows,
# taken in the existing row order of df_reviews
is_positive = df_reviews['sentiment'] == 'positive'
is_negative = df_reviews['sentiment'] == 'negative'
df_positive = df_reviews[is_positive].head(9000)
df_negative = df_reviews[is_negative].head(1000)


In [None]:
# Stack the positive rows on top of the negative rows to form one dataset
# (original row labels are kept; the rows are not shuffled)
df_reviews_imb = pd.concat(objs=[df_positive, df_negative])


In [None]:
# Show how many rows each sentiment label has in the combined dataset
df_reviews_imb.value_counts(subset='sentiment')


In [None]:
# Function to clean raw review text
# Steps:
# 1. Convert to lowercase
# 2. Replace HTML tags with a space
# 3. Replace non-alphabetic characters with a space
# 4. Remove stopwords

# Lazily-filled cache of NLTK's English stopwords. Loading the corpus is
# comparatively expensive, so it is done once instead of once per word.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        try:
            words = stopwords.words('english')
        except LookupError:
            # The corpus is not installed (e.g. fresh environment): fetch it
            # once so the notebook survives Restart & Run All.
            nltk.download('stopwords', quiet=True)
            words = stopwords.words('english')
        _ENGLISH_STOPWORDS = frozenset(words)
    return _ENGLISH_STOPWORDS


def clean_text(text, stop_words=None):
    """Normalise one raw review into a space-separated string of kept words.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. NaN or None coming from
        a DataFrame column) is treated as missing and yields ''.
    stop_words : container of str, optional
        Lowercase words to drop. When omitted, NLTK's English stopword list
        is used (loaded once and cached).

    Returns
    -------
    str
        Lowercased text with HTML tags and non-alphabetic characters removed
        and stopwords filtered out; remaining words are joined by single spaces.
    """
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        stop_words = _english_stopwords()

    text = text.lower()
    # Substitute a space (not '') so words on either side of a tag such as
    # "great<br />movie" are not fused into a single token.
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # split() with no argument also collapses the runs of spaces created above.
    kept = [word for word in text.split() if word not in stop_words]
    return ' '.join(kept)


In [None]:
# Run clean_text over every review, then overwrite the 'review' column
# with the cleaned strings (the raw text is replaced in this frame)
cleaned_reviews = df_reviews_imb['review'].apply(clean_text)
df_reviews_imb['review'] = cleaned_reviews


In [None]:
# Save cleaned dataset for downstream feature extraction.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first (e.g. on a fresh checkout).
os.makedirs("data/processed", exist_ok=True)
df_reviews_imb.to_csv("data/processed/clean_reviews.csv", index=False)
