In [7]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

# Load dataset
df = pd.read_csv("movie_reviews-1.csv")

# Identify text column
text_column = 'review_text'  # from your CSV
rating_column = 'rating'

# Keep a copy for summary
df_before = df.copy()

# -------------------------------
# 1. Clean text
# -------------------------------
def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)             # remove HTML
    text = re.sub(r'[^a-z0-9\s]', '', text)       # remove special characters
    text = re.sub(r'\s+', ' ', text).strip()      # remove extra spaces
    return text

df['review_clean'] = df[text_column].apply(clean_text)

# -------------------------------
# 2. TF-IDF vectorization
# -------------------------------
tfidf = TfidfVectorizer(max_features=300)
review_tfidf = tfidf.fit_transform(df['review_clean']).toarray()
tfidf_feature_names = tfidf.get_feature_names_out()
tfidf_df = pd.DataFrame(review_tfidf, columns=tfidf_feature_names)

# -------------------------------
# 3. Handle missing ratings
# -------------------------------
median_rating = df[rating_column].median()
df[rating_column] = df[rating_column].fillna(median_rating)

# -------------------------------
# 4. Normalize ratings (0â€“1)
# -------------------------------
scaler = MinMaxScaler(feature_range=(0, 1))
df['rating_normalized'] = scaler.fit_transform(df[[rating_column]])

# -------------------------------
# 5. Final cleaned dataset
# -------------------------------
# Include review_id for reference
final_columns = ['review_id', 'review_clean', 'rating_normalized']
cleaned_df = pd.concat([df[final_columns].reset_index(drop=True), tfidf_df], axis=1)

# -------------------------------
# 6. Before vs After Summary
# -------------------------------
summary = pd.DataFrame({
    "Metric": [
        "Total Reviews",
        "Missing Ratings Before",
        "Missing Ratings After",
        "Review Text Avg Length Before",
        "Review Text Avg Length After"
    ],
    "Before": [
        len(df_before),
        df_before[rating_column].isna().sum(),
        0,
        int(df_before[text_column].astype(str).apply(len).mean()),
        None
    ],
    "After": [
        len(cleaned_df),
        None,
        df[rating_column].isna().sum(),
        None,
        int(df['review_clean'].astype(str).apply(len).mean())
    ]
})

# -------------------------------
# 7. Display results
# -------------------------------
print("\n====== BEFORE vs AFTER CLEANING REPORT ======")
print(summary)
print("\n====== CLEANED DATASET (HEAD) ======")
print(cleaned_df.head())



                          Metric  Before  After
0                  Total Reviews    15.0   15.0
1         Missing Ratings Before     2.0    NaN
2          Missing Ratings After     0.0    0.0
3  Review Text Avg Length Before    22.0    NaN
4   Review Text Avg Length After     NaN   16.0

   review_id          review_clean  rating_normalized   acting   amazing  \
0          1         amazing movie              0.750  0.00000  0.707107   
1          2  terrible acting plot              0.000  0.57735  0.000000   
2          3         amazing movie              0.750  0.00000  0.707107   
3          4  terrible acting plot              0.750  0.57735  0.000000   
4          5         amazing movie              0.375  0.00000  0.707107   

      movie     plot  terrible  
0  0.707107  0.00000   0.00000  
1  0.000000  0.57735   0.57735  
2  0.707107  0.00000   0.00000  
3  0.000000  0.57735   0.57735  
4  0.707107  0.00000   0.00000  
