In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
import os


In [2]:
# Location of the labeled-reviews CSV that the loading cell reads.
input_path = "data/labeled_reviews.csv"
# Presumably the destinations for the TF-IDF matrix and the fitted vectorizer.
# NOTE(review): none of the cells shown in this notebook reads these two
# variables; the save/load cells use hard-coded "models/..." paths instead.
# Confirm which location is the intended one.
output_matrix_path = "backend/data/vectorized_data.pkl"
output_vectorizer_path = "backend/data/tfidf_vectorizer.pkl"


In [3]:
# Read the labeled reviews and report which columns the file provides.
df = pd.read_csv(input_path)
print("Original columns:", df.columns.tolist())

# Fail fast when the text column the rest of the notebook relies on is absent.
if "cleaned_review" not in df.columns:
    raise ValueError("'cleaned_review' column is missing in the CSV file.")

# Later cells address the text column as 'reviews'.
df = df.rename(columns={"cleaned_review": "reviews"})

print(" Data loaded and renamed to 'reviews'")
print(" Number of rows:", len(df))
display(df.head())


Original columns: ['Movie_N', 'cleaned_review', 'Stars', 'label']
 Data loaded and renamed to 'reviews'
 Number of rows: 5681


Unnamed: 0,Movie_N,reviews,Stars,label
0,The Old Guard,love concept execution not bad there some crin...,7,1
1,The Old Guard,there were just so many clichés cringe naive m...,5,1
2,The Old Guard,actors all good well thats story childlike dir...,5,1
3,The Old Guard,my mind kept referring back austin powers badd...,9,2
4,The Old Guard,london scenes end were just bizarre empty stre...,5,1


In [4]:
# Sanity check of the text column: a five-row preview, its dtype, and the row count.
print("First 5 values in df['reviews']:\n", df["reviews"].iloc[:5])
print("\nData type:", df["reviews"].dtype)
print("\nTotal reviews:", df.shape[0])


First 5 values in df['reviews']:
 0    love concept execution not bad there some crin...
1    there were just so many clichés cringe naive m...
2    actors all good well thats story childlike dir...
3    my mind kept referring back austin powers badd...
4    london scenes end were just bizarre empty stre...
Name: reviews, dtype: object

Data type: object

Total reviews: 5681


In [5]:
# Clean the 'reviews' column so that only usable text reaches the vectorizer.
#
# Missing values must be dropped BEFORE the cast: astype(str) turns NaN into the
# literal string "nan", after which a null check can never match and the
# placeholder would be vectorized as if it were a real review.
df = (
    df.dropna(subset=["reviews"])
      .assign(reviews=lambda frame: frame["reviews"].astype(str).str.strip())
)

# Keep only reviews longer than one character; this also removes strings that
# became empty after stripping whitespace.
df = df[df["reviews"].str.len() > 1]

print("✅ Cleaned reviews:", len(df))
print("Example:", df["reviews"].iloc[0])


✅ Cleaned reviews: 5679
Example: love concept execution not bad there some cringy forced parts but music really annoys me doesnt fit tone film all


In [6]:
# Fit a TF-IDF model over unigrams and bigrams, with the vocabulary capped at
# 5000 terms, and turn the cleaned reviews into a sparse feature matrix.
# TfidfVectorizer, joblib and os come from the import cell at the top.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X = vectorizer.fit_transform(df["reviews"])

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs("models", exist_ok=True)

# NOTE(review): the config cell defines output_matrix_path and
# output_vectorizer_path (under backend/data/), but the artifacts are written
# to models/ here and read back from models/ below. Confirm the intended
# location before switching, and change the save and load cells together.
joblib.dump(X, "models/vectorized_data.pkl")
joblib.dump(vectorizer, "models/tfidf_vectorizer.pkl")

print("✅ Vectorization complete and files saved.")


✅ Vectorization complete and files saved.


In [7]:
import joblib

# Round-trip check: read the two saved artifacts back from disk, vectorizer
# first and matrix second.
loaded_vectorizer, X_loaded = (
    joblib.load(artifact_path)
    for artifact_path in (
        "models/tfidf_vectorizer.pkl",
        "models/vectorized_data.pkl",
    )
)

# Report the vocabulary size and the matrix dimensions.
vocabulary_size = len(loaded_vectorizer.vocabulary_)
print("✅ Vectorizer vocabulary size:", vocabulary_size)
print("✅ Vectorized matrix shape:", X_loaded.shape)


✅ Vectorizer vocabulary size: 5000
✅ Vectorized matrix shape: (5679, 5000)
