In [13]:
import pandas as pd
import numpy as np
import duckdb
import os
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.model_selection import train_test_split 
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
import re
import duckdb
import torch
from transformers import BertTokenizer, BertModel
from tqdm import tqdm 

In [16]:
# Loaded (tokenizer, model, device) triples keyed by checkpoint name, so repeated
# calls to preprocess_imdb_data reuse one BERT instance instead of reloading it.
_BERT_CACHE = {}


def _load_bert(model_name="bert-base-uncased"):
    """Return a cached (tokenizer, model, device) triple for the given BERT checkpoint."""
    if model_name not in _BERT_CACHE:
        tokenizer = BertTokenizer.from_pretrained(model_name)
        bert_model = BertModel.from_pretrained(model_name)
        # Move model to GPU if available
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        bert_model.to(device)
        bert_model.eval()  # inference only: make sure dropout is disabled
        _BERT_CACHE[model_name] = (tokenizer, bert_model, device)
    return _BERT_CACHE[model_name]


def _clean_title(text):
    """Lower-case a title and strip punctuation (stop words are deliberately kept)."""
    return re.sub(r'[^\w\s]', '', text.lower())


def _embed_titles(texts, batch_size=32, max_length=10):
    """Encode a list of strings with BERT in batches; returns one [CLS] vector per string."""
    if not texts:
        # Nothing to encode; avoids loading BERT and calling the tokenizer on an empty batch.
        return []

    tokenizer, bert_model, device = _load_bert()
    embeddings = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Processing BERT Embeddings"):
        batch = texts[start:start + batch_size]
        tokens = tokenizer(batch, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
        tokens = {key: val.to(device) for key, val in tokens.items()}  # Move tensors to the model's device

        with torch.no_grad():
            output = bert_model(**tokens)

        # Position 0 of the last hidden state is the [CLS] token; move back to CPU as NumPy rows
        embeddings.extend(output.last_hidden_state[:, 0, :].cpu().numpy())

    return embeddings


def preprocess_imdb_data(data_path, directors_path, writers_path, batch_size=32, max_length=10):
    """
    General preprocessing pipeline for IMDB data with batch processing for BERT embeddings.

    The movie table is joined with one director and one writer per movie (the first one
    listed for that movie in the respective JSON file), numeric columns are coerced and
    median-imputed, and title-based features are appended.

    Arguments:
    - data_path: Path to the train/test/validation data CSV file.
    - directors_path: Path to the directing.json file.
    - writers_path: Path to the writing.json file.
    - batch_size: Number of titles sent through BERT per forward pass (default 32).
    - max_length: Token budget per title passed to the tokenizer (default 10); longer
      titles are truncated. The budget includes the special tokens the tokenizer adds.

    Returns:
    - Cleaned Pandas DataFrame ready for model training or prediction: one row per
      `tconst`, in the same order as the rows of the input CSV (first occurrence kept),
      with `tconst` as the first column and the extra columns `director_id`,
      `writer_id`, `clean_title`, `word_count` and `bert_embedding`.
    """

    # Step 1: Load main dataset
    df = pd.read_csv(data_path)

    # Step 2: Load JSON files (Directors & Writers)
    df_directors = pd.read_json(directors_path)
    df_writers = pd.read_json(writers_path)

    # Step 3: Rename columns for consistency
    df_directors = df_directors.rename(columns={"movie": "tconst", "director": "director_id"})
    df_writers = df_writers.rename(columns={"movie": "tconst", "writer": "writer_id"})

    # Step 4: Convert nested JSON fields into strings (fixes 'unhashable type' error)
    df_directors["director_id"] = df_directors["director_id"].astype(str)
    df_writers["writer_id"] = df_writers["writer_id"].astype(str)

    # Keep a single director/writer per movie *before* joining. Joining the raw tables
    # would multiply every movie into a directors x writers cross product that is
    # thrown away again when the rows are reduced to one per movie.
    df_directors = df_directors.drop_duplicates(subset="tconst")[["tconst", "director_id"]]
    df_writers = df_writers.drop_duplicates(subset="tconst")[["tconst", "writer_id"]]

    # Step 5: Merge main dataset with Directors & Writers using DuckDB.
    # A join result has no guaranteed row order, so carry an explicit position column
    # and sort on it; predictions for the hidden files must line up with the input rows.
    df["orig_row_order"] = np.arange(len(df))

    query = """
    SELECT 
        movies.*, 
        directors.director_id, 
        writers.writer_id
    FROM movies
    LEFT JOIN directors ON movies.tconst = directors.tconst
    LEFT JOIN writers ON movies.tconst = writers.tconst
    ORDER BY movies.orig_row_order
    """

    con = duckdb.connect()
    try:
        con.register("movies", df)
        con.register("directors", df_directors)
        con.register("writers", df_writers)
        df = con.execute(query).fetchdf()
    finally:
        # Release the connection even if the query fails
        con.close()

    df = df.drop(columns="orig_row_order")

    # Step 6: Drop unnecessary columns (only those actually present)
    columns_to_drop = ["originalTitle", "endYear", "Unnamed: 0"]
    df = df.drop(columns=columns_to_drop, errors="ignore")

    # Step 7: Handle missing values
    numeric_columns = ["startYear", "runtimeMinutes", "numVotes"]
    df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors="coerce")  # Ensure numeric format
    df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].median())  # Fill missing with median

    df["director_id"] = df["director_id"].fillna("unknown")
    df["writer_id"] = df["writer_id"].fillna("unknown")

    # Step 8: Ensure correct data types
    df = df.astype({col: int for col in numeric_columns})

    # Step 9: Ensure each `tconst` is unique.
    # drop_duplicates keeps the first occurrence and, unlike groupby(...).first(),
    # does not re-sort the rows by `tconst`.
    df = df.drop_duplicates(subset="tconst", keep="first")
    # Keep `tconst` as the leading column, matching the previous output layout
    df = df[["tconst"] + [col for col in df.columns if col != "tconst"]]
    df = df.reset_index(drop=True)

    ### ----------------------- ADDING WORD COUNT & BERT EMBEDDINGS ----------------------- ###

    # Apply text cleaning (NO stopword removal)
    df["clean_title"] = df["primaryTitle"].astype(str).map(_clean_title)

    # Count words in each title
    df["word_count"] = df["clean_title"].map(lambda title: len(title.split()))

    # Process movie titles in batches
    df["bert_embedding"] = _embed_titles(
        df["clean_title"].tolist(), batch_size=batch_size, max_length=max_length
    )

    ### ----------------------- RETURN FINAL PROCESSED DATASET ----------------------- ###
    return df

In [17]:
# Define file paths
# Base directory: the raw data is expected in ./imdb under the current working directory
base_data_dir = os.path.join(os.getcwd(), "imdb")

# Generate the list of train file paths.
# os.listdir returns entries in arbitrary order, so sort to make the order in which
# the shards are concatenated reproducible across runs and machines.
train_files = sorted(
    os.path.join(base_data_dir, f)
    for f in os.listdir(base_data_dir)
    if f.startswith("train-") and f.endswith(".csv")
)
if not train_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError(f"No train-*.csv files found in {base_data_dir}")

# Define paths for directors and writers files
directors_path = os.path.join(base_data_dir, "directing.json")
writers_path = os.path.join(base_data_dir, "writing.json")

# Load JSON files (Directors & Writers)
# NOTE(review): preprocess_imdb_data reads these files itself from the paths above, so
# these two frames are not used by the preprocessing below; kept as notebook-level variables.
df_directors = pd.read_json(directors_path)
df_writers = pd.read_json(writers_path)

# Preprocess and merge all training data (each shard is preprocessed independently)
df_train = pd.concat([preprocess_imdb_data(file, directors_path, writers_path) for file in train_files], ignore_index=True)

# Preprocess validation and test data
df_val = preprocess_imdb_data(os.path.join(base_data_dir, "validation_hidden.csv"), directors_path, writers_path)
df_test = preprocess_imdb_data(os.path.join(base_data_dir, "test_hidden.csv"), directors_path, writers_path)

# Save cleaned datasets.
# Create the output folder first so the long preprocessing run is not lost to a
# missing-directory error at write time.
# NOTE: the `bert_embedding` column holds NumPy arrays, which to_csv writes out as
# their string representation.
os.makedirs("cleaned", exist_ok=True)
df_train.to_csv("cleaned/final_training_dataTitleFeatures.csv", index=False)
df_val.to_csv("cleaned/final_validation_dataTitleFeatures.csv", index=False)
df_test.to_csv("cleaned/final_test_dataTitle.csv", index=False)

print("\n✅ All datasets have been preprocessed and saved!")


Processing BERT Embeddings: 100%|██████████████| 31/31 [00:28<00:00,  1.09it/s]
Processing BERT Embeddings:   6%|▉              | 2/32 [00:02<00:30,  1.02s/it]


KeyboardInterrupt: 

In [None]:
# Column groups used by the preprocessing pipeline
numeric_features = ["startYear", "runtimeMinutes", "numVotes"]
categorical_features = ["director_id", "writer_id"]

# Define Features & Target (the model sees exactly the numeric + categorical columns)
# NOTE(review): the title-derived columns (`word_count`, `bert_embedding`) are not part
# of `features`, so this model does not use them even though the output files are
# named "TitleFeatures" — confirm that is intended.
features = numeric_features + categorical_features
X_train = df_train[features]
y_train = df_train["label"]
X_val = df_val[features]
X_test = df_test[features]

# Preprocessing Pipeline: standardise numeric columns, one-hot encode the id columns.
# handle_unknown="ignore" encodes directors/writers unseen during fit as all-zero rows
# instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

# Train Logistic Regression Model
model = Pipeline([
    ("preprocessing", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

print("🔹 Training model on full training data...")
model.fit(X_train, y_train)

# Generate Predictions (one prediction per row of df_val / df_test, in row order)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Save predictions in required format (no headers, single column).
# Create the output folder first so a missing directory does not discard the trained
# model's predictions at write time.
os.makedirs("submissions", exist_ok=True)
pd.DataFrame(y_val_pred).to_csv("submissions/validation_predictions_TitleFeatures.csv", index=False, header=False)
pd.DataFrame(y_test_pred).to_csv("submissions/test_predictions_TitleFeatures.csv", index=False, header=False)

print("✅ Predictions saved for submission!")