In [1]:
import pandas as pd
import numpy as np
import duckdb
import os
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.model_selection import train_test_split 
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
import re
import duckdb
import torch
from transformers import BertTokenizer, BertModel, DistilBertTokenizer, DistilBertModel
from tqdm import tqdm 
import unicodedata

In [3]:
def preprocess_imdb_data(data_path, directors_path, writers_path):
    """
    General preprocessing pipeline for IMDB data with batch processing for BERT embeddings.
    
    Arguments:
    - data_path: Path to the train/test/validation data CSV file.
    - directors_path: Path to the directing.json file.
    - writers_path: Path to the writing.json file.
    
    Returns:
    - Cleaned Pandas DataFrame ready for model training or prediction.
    """
    
    # Step 1: Load main dataset
    df = pd.read_csv(data_path)

    # Step 2: Load JSON files (Directors & Writers)
    df_directors = pd.read_json(directors_path)
    df_writers = pd.read_json(writers_path)

    # Step 3: Rename columns for consistency
    df_directors.rename(columns={"movie": "tconst", "director": "director_id"}, inplace=True)
    df_writers.rename(columns={"movie": "tconst", "writer": "writer_id"}, inplace=True)

    # Step 4: Convert nested JSON fields into strings (fixes 'unhashable type' error)
    df_directors["director_id"] = df_directors["director_id"].astype(str)
    df_writers["writer_id"] = df_writers["writer_id"].astype(str)

    # Step 5: Merge main dataset with Directors & Writers using DuckDB
    import duckdb
    con = duckdb.connect()
    con.register("movies", df)
    con.register("directors", df_directors)
    con.register("writers", df_writers)

    query = """
    SELECT 
        movies.*, 
        directors.director_id, 
        writers.writer_id
    FROM movies
    LEFT JOIN directors ON movies.tconst = directors.tconst
    LEFT JOIN writers ON movies.tconst = writers.tconst
    """

    df = con.execute(query).fetchdf()
    con.close()

        # Step 6: Create column year from startYear and endYear
    df['startYear'] = df['startYear'].replace('\\N', np.nan).astype(float)
    df['endYear'] = df['endYear'].replace('\\N', np.nan).astype(float)
    df['Year'] = df['startYear'].fillna(df['endYear'])

    # Step 7: Clean title names
    def normalize_text(text):
        if pd.isna(text):  # Handle missing values
            return ""
        text = str(text)
        text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')  # Remove accents
        text = re.sub(r'[^\w\s]', '', text)  # Remove any remaining special characters
        return text.strip()

    def clean_titles(row):
        primary = row['primaryTitle'] if pd.notna(row['primaryTitle']) else ''
        original = row['originalTitle'] if pd.notna(row['originalTitle']) else ''

        # If primaryTitle is empty, replace it with originalTitle
        if not primary:
            primary = original

        # Normalize primaryTitle
        cleaned_title = normalize_text(primary)

        # If both titles are missing after normalization, return "Unknown Title"
        return cleaned_title if cleaned_title else "Unknown Title"

    df['primaryTitle'] = df.apply(clean_titles, axis=1)
    df.rename(columns={'primaryTitle': 'movieTitle'}, inplace=True)

    # Step 8: Drop unnecessary columns
    columns_to_drop = ["startYear", "endYear", "originalTitle", "Unnamed: 0"]
    df.drop(columns=[col for col in columns_to_drop if col in df.columns], inplace=True)

    # Step 9: Handle missing values
    numeric_columns = ["runtimeMinutes", "numVotes"]
    df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors="coerce")  # Ensure numeric format
    df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].median())  # Fill missing with median

    df["director_id"] = df["director_id"].fillna("unknown")
    df["writer_id"] = df["writer_id"].fillna("unknown")

    # Step 10: Ensure correct data types
    df["Year"] = df["Year"].astype(int)
    df["runtimeMinutes"] = df["runtimeMinutes"].astype(int)
    df["numVotes"] = df["numVotes"].astype(int)
    df["movieTitle"] = df["movieTitle"].astype(str)

    # Step 11: Ensure each `tconst` is unique
    df = df.groupby("tconst").first().reset_index()

    # Step 12: Count words in each title
    df["word_count"] = df["movieTitle"].apply(lambda x: len(x.split()))

    return df

In [4]:
# Define file paths
# Define the base directory
base_data_dir = os.path.join(os.getcwd(), "imdb")

# Generate the list of train file paths
train_files = [os.path.join(base_data_dir, f) for f in os.listdir(base_data_dir) if f.startswith("train-") and f.endswith(".csv")]

# Define paths for directors and writers files
directors_path = os.path.join(base_data_dir, "directing.json")
writers_path = os.path.join(base_data_dir, "writing.json")

# Load JSON files (Directors & Writers)
df_directors = pd.read_json(directors_path)
df_writers = pd.read_json(writers_path)

# Preprocess and merge all training data
df_train = pd.concat([preprocess_imdb_data(file, directors_path, writers_path) for file in train_files], ignore_index=True)

# Preprocess validation and test data
df_val = preprocess_imdb_data(os.path.join(base_data_dir, "validation_hidden.csv"), directors_path, writers_path)
df_test = preprocess_imdb_data(os.path.join(base_data_dir, "test_hidden.csv"), directors_path, writers_path)

# Save cleaned datasets
df_train.to_csv("cleaned/final_training_dataTitleFeatures.csv", index=False)
df_val.to_csv("cleaned/final_validation_dataTitleFeatures.csv", index=False)
df_test.to_csv("cleaned/final_test_dataTitle.csv", index=False)

print("\n✅ All datasets have been preprocessed and saved!")



✅ All datasets have been preprocessed and saved!


In [5]:
import pandas as pd
import numpy as np
import torch
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

# Split df_train into train and validation (80/20 split)
df_train_small, df_val_small = train_test_split(df_train, test_size=0.2, random_state=42, stratify=df_train["label"])

# Define Features (NO BERT EMBEDDINGS, ONLY STRUCTURED DATA)
features = ["Year", "runtimeMinutes", "numVotes", "director_id", "writer_id", "word_count"]
X_train_small = df_train_small[features]
X_val_small = df_val_small[features]
y_train_small = df_train_small["label"]
y_val_small = df_val_small["label"]

# ✅ Check if all features exist
print("Existing features in X_train_small:", X_train_small.columns.tolist())

# Preprocessing Pipeline
numeric_features = ["Year", "runtimeMinutes", "numVotes"]
categorical_features = ["director_id", "writer_id"]

# ✅ Fix: Handle unknown categories in `OrdinalEncoder`
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), categorical_features)  # ✅ Fix applied
    ]
)

# Train RandomForest Model
model = Pipeline([
    ("preprocessing", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
])

print("🔹 Training model on 80% of train data...")
model.fit(X_train_small, y_train_small)

# Generate Predictions on Validation Set
y_val_pred = model.predict(X_val_small)

# Compute Evaluation Metrics
accuracy = accuracy_score(y_val_small, y_val_pred)
precision = precision_score(y_val_small, y_val_pred, average="binary")
recall = recall_score(y_val_small, y_val_pred, average="binary")
f1 = f1_score(y_val_small, y_val_pred, average="binary")

# Print Results
print("\n📊 Model Evaluation on Train Data Split (80/20)")
print(f"✅ Accuracy: {accuracy:.4f}")
print(f"✅ Precision: {precision:.4f}")
print(f"✅ Recall: {recall:.4f}")
print(f"✅ F1 Score: {f1:.4f}")


Existing features in X_train_small: ['Year', 'runtimeMinutes', 'numVotes', 'director_id', 'writer_id', 'word_count']
🔹 Training model on 80% of train data...

📊 Model Evaluation on Train Data Split (80/20)
✅ Accuracy: 0.5496
✅ Precision: 0.5287
✅ Recall: 0.9336
✅ F1 Score: 0.6751


In [4]:
# Define file paths
# Define the base directory
base_data_dir = os.path.join(os.getcwd(), "imdb")

# Generate the list of train file paths
train_files = [os.path.join(base_data_dir, f) for f in os.listdir(base_data_dir) if f.startswith("train-") and f.endswith(".csv")]

# Define paths for directors and writers files
directors_path = os.path.join(base_data_dir, "directing.json")
writers_path = os.path.join(base_data_dir, "writing.json")

# Load JSON files (Directors & Writers)
df_directors = pd.read_json(directors_path)
df_writers = pd.read_json(writers_path)

# Preprocess and merge all training data
df_train = pd.concat([preprocess_imdb_data(file, directors_path, writers_path) for file in train_files], ignore_index=True)

# Preprocess validation and test data
df_val = preprocess_imdb_data(os.path.join(base_data_dir, "validation_hidden.csv"), directors_path, writers_path)
df_test = preprocess_imdb_data(os.path.join(base_data_dir, "test_hidden.csv"), directors_path, writers_path)

# Save cleaned datasets
df_train.to_csv("cleaned/final_training_dataTitleFeatures.csv", index=False)
df_val.to_csv("cleaned/final_validation_dataTitleFeatures.csv", index=False)
df_test.to_csv("cleaned/final_test_dataTitle.csv", index=False)

print("\n✅ All datasets have been preprocessed and saved!")


Processing DistilBERT Embeddings: 100%|████████| 31/31 [00:10<00:00,  2.88it/s]
Processing DistilBERT Embeddings: 100%|████████| 32/32 [00:08<00:00,  3.92it/s]
Processing DistilBERT Embeddings: 100%|████████| 30/30 [00:07<00:00,  3.86it/s]
Processing DistilBERT Embeddings: 100%|████████| 33/33 [00:14<00:00,  2.24it/s]
Processing DistilBERT Embeddings: 100%|████████| 31/31 [00:08<00:00,  3.73it/s]
Processing DistilBERT Embeddings: 100%|████████| 32/32 [00:08<00:00,  3.84it/s]
Processing DistilBERT Embeddings: 100%|████████| 33/33 [00:08<00:00,  3.70it/s]
Processing DistilBERT Embeddings: 100%|████████| 32/32 [00:08<00:00,  3.66it/s]
Processing DistilBERT Embeddings: 100%|████████| 30/30 [00:08<00:00,  3.68it/s]
Processing DistilBERT Embeddings: 100%|████████| 34/34 [00:09<00:00,  3.66it/s]



✅ All datasets have been preprocessed and saved!


In [14]:
import pandas as pd
import numpy as np
import torch
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm

# # Define Features & Target
# bert_columns = [f"bert_{i}" for i in range(df_train["distilbert_embedding"][0].shape[0])]

# # Expand BERT embeddings
# bert_train = pd.DataFrame(df_train["distilbert_embedding"].tolist(), columns=bert_columns)
# bert_val = pd.DataFrame(df_val["distilbert_embedding"].tolist(), columns=bert_columns)
# bert_test = pd.DataFrame(df_test["distilbert_embedding"].tolist(), columns=bert_columns)

# # Reset index
# bert_train.index = df_train.index
# bert_val.index = df_val.index
# bert_test.index = df_test.index

# # Merge expanded embeddings
# df_train = pd.concat([df_train, bert_train], axis=1)
# df_val = pd.concat([df_val, bert_val], axis=1)
# df_test = pd.concat([df_test, bert_test], axis=1)

# Define features
features = ["Year", "runtimeMinutes", "numVotes", "director_id", "writer_id", "word_count"]
X_train = df_train[features]
y_train = df_train["label"]
X_val = df_val[features]
X_test = df_test[features]

# Preprocessing Pipeline
numeric_features = ["Year", "runtimeMinutes", "numVotes"]
categorical_features = ["director_id", "writer_id"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OrdinalEncoder(), categorical_features)
    ]
)

# Train RandomForest Model
model = Pipeline([
    ("preprocessing", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
])

print("🔹 Training model on full training data...")
model.fit(X_train, y_train)

# Generate Predictions
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Save predictions in required format (no headers, single column)
pd.DataFrame(y_val_pred).to_csv("submissions/validation_predictions_TitleFeatures.csv", index=False, header=False)
pd.DataFrame(y_test_pred).to_csv("submissions/test_predictions_TitleFeatures.csv", index=False, header=False)

print("✅ Predictions saved for submission!")

🔹 Training model on full training data...


ValueError: Found unknown categories ['nm0760196', 'nm0770234', 'nm2493041', 'nm0056030', 'nm0827854', 'nm0475852', 'nm0186091', 'nm0001692', 'nm0797869', 'nm0885108', 'nm0000578', 'nm0272568', 'nm1957032', 'nm0242757', 'nm1260607', 'nm0464180', 'nm2797828', 'nm1223294', 'nm0605266', 'nm0587518', 'nm1624755', 'nm0473329', 'nm1061993', 'nm1000159', 'nm0569891', 'nm2669989', 'nm3306459', 'nm5107883', 'nm6016411', 'nm0431172', 'nm0079387', 'nm1811747', 'nm0718469', 'nm0648740', 'nm0110564', 'nm0533666', 'nm0998110', 'nm0639858', 'nm0423646', 'nm1048540', 'nm8381668', 'nm0522028', 'nm3201804', 'nm4033941', 'nm4980768', 'nm0941262', 'nm1242277', 'nm3155648', 'nm0005197', 'nm0294457', 'nm0801885', 'nm1868917', 'nm1473037', 'nm1936947', 'nm1361273', 'nm1804614', 'nm0762517', 'nm1970598', 'nm0103971', 'nm0508998', 'nm2655364', 'nm0526019', 'nm0446819', 'nm0001774', 'nm0672769', 'nm2258148', 'nm0899121', 'nm0075263', 'nm0394233', 'nm0411259', 'nm3052903', 'nm6174515', 'nm4909815', 'nm3184677', 'nm2746751', 'nm0438089', 'nm0442646', 'nm0648464', 'nm0002225', 'nm1637832', 'nm0495898', 'nm0960253', 'nm0815044', 'nm0607422', 'nm0213726', 'nm1025280', 'nm5068576', 'nm3226283', 'nm2751807', 'nm0112409', 'nm2165987', 'nm0769874', 'nm1427149', 'nm0310597', 'nm1207166', 'nm0453579', 'nm7978876', 'nm0440913', 'nm0744107', 'nm0043214', 'nm0395105', 'nm0688361', 'nm3407375', 'nm0934863', 'nm0744985', 'nm0084114', 'nm1240647', 'nm0542752', 'nm0924429', 'nm0462277', 'nm0845618', 'nm0289800', 'nm0330456', 'nm0389514', 'nm1927689', 'nm0449868', 'nm0653768', 'nm0453115', 'nm0905592', 'nm0586841', 'nm0100522', 'nm0286790', 'nm0699095', 'nm2988421', 'nm4171068', 'nm0602452', 'nm2473806', 'nm1912285', 'nm0660453', 'nm0868871', 'nm0000045', 'nm6512883', 'nm0040575', 'nm1738698', 'nm2269581', 'nm3818919', 'nm1583960', 'nm0373154', 'nm0115269', 'nm4335588', 'nm0013972', 'nm0950620', 'nm1491597', 'nm10470070', 'nm0712444', 'nm3147478', 'nm0909429', 'nm0074723', 'nm1028575', 'nm8784283', 'nm0518363', 'nm0001408', 'nm0732106', 'nm0774143', 'nm0698781', 'nm0076779', 'nm0925482', 'nm0491522', 'nm0775097', 'nm1733624', 'nm0780643', 'nm0559564', 'nm1497931', 'nm0522604', 'nm1630654', 'nm0499215', 'nm7355036', 'nm0511393', 'nm1644484', 'nm8399396', 'nm1104118', 'nm0807312', 'nm1009775', 'nm0000104', 'nm0001471', 'nm0106928', 'nm0381450', 'nm0491708', 'nm0665163', 'nm0425894', 'nm0381478', 'nm1154184', 'nm4756103', 'nm0236226', 'nm1466818', 'nm1913201', 'nm0379797', 'nm1146668', 'nm0164999', 'nm0366004', 'nm0725345', 'nm0629580', 'nm0006639', 'nm0869857', 'nm0953216', 'nm1215448', 'nm4183075', 'nm1190532', 'nm2171755', 'nm0466349', 'nm1154600', 'nm0281869', 'nm1783265', 'nm0672704', 'nm1079001', 'nm0071611', 'nm5832876', 'nm0690677', 'nm0618405', 'nm0641007', 'nm0397565', 'nm0826814', 'nm1317258', 'nm0210218', 'nm0523154', 'nm0427037', 'nm2719270', 'nm0364252', 'nm0945412', 'nm2264935', 'nm1027891', 'nm0786919', 'nm1062423', 'nm0014960', 'nm2403746', 'nm0215085', 'nm0217803', 'nm0089176', 'nm0504641', 'nm6304969', 'nm0064593', 'nm1206844', 'nm0056527', 'nm0359081', 'nm0013288', 'nm1119099', 'nm0139111', 'nm0836715', 'nm0350455', 'nm1577772', 'nm0562266', 'nm0258418', 'nm0317411', 'nm3640095', 'nm2692858', 'nm0504654', 'nm2165634', 'nm7618090', 'nm1145617', 'nm0575389', 'nm1279573', 'nm1293863', 'nm0382779', 'nm1077277', 'nm1561205', 'nm0892400', 'nm0017004', 'nm0906476', 'nm4364444', 'nm2325602', 'nm0406687', 'nm0229694', 'nm0162979', 'nm8011325', 'nm0468882', 'nm1231757', 'nm0839268', 'nm0918041', 'nm0086144', 'nm0580727', 'nm2975950', 'nm0049335', 'nm1362893', 'nm1840195', 'nm0568478', 'nm2348718', 'nm0245361', 'nm0187834', 'nm0769703', 'nm4032740', 'nm1392439', 'nm0954500', 'nm5824384', 'nm1262599', 'nm7916459', 'nm0168892', 'nm3255161', 'nm0779011', 'nm0514816', 'nm0170504', 'nm3616958', 'nm0683578', 'nm0196930', 'nm0233050', 'nm1558526', 'nm0412220', 'nm0009053', 'nm1371451', 'nm1685252', 'nm1204790', 'nm0030612', 'nm0340244', 'nm9214307', 'nm0404606', 'nm0178876', 'nm7552433', 'nm0352524', 'nm5617646', 'nm0663489', 'nm0373612', 'nm0063581', 'nm0916859', 'nm0836708', 'nm0006939', 'nm2395339', 'nm0661919', 'nm2128335', 'nm4298518', 'nm0001490', 'nm0481195', 'nm3936005', 'nm0768959', 'nm1396593', 'nm0773108', 'nm0292134', 'nm0645661', 'nm3192191', 'nm0344144', 'nm6921776', 'nm1902132', 'nm0848992', 'nm1220246', 'nm2085752', 'nm0228013', 'nm1434014', 'nm0915304', 'nm0622288', 'nm1259604', 'nm0861899', 'nm0025399', 'nm0465551', 'nm1026530', 'nm0734466', 'nm0446059', 'nm0770647', 'nm0006853', 'nm0566671', 'nm2141317', 'nm0493615', 'nm0051156', 'nm0262402', 'nm0561938', 'nm0046480', 'nm0009942', 'nm3010986', 'nm0753382', 'nm0359880', 'nm0950000', 'nm0719131', 'nm0782900', 'nm4312760', 'nm1229323', 'nm0455839', 'nm0198014', 'nm0320784', 'nm0956022', 'nm1150959', 'nm0515101', 'nm0324013', 'nm0264236', 'nm0001901', 'nm0006959', 'nm0645516', 'nm1141070', 'nm0697935', 'nm0997841', 'nm0761522', 'nm0847294', 'nm3009132', 'nm5643041', 'nm1226069', 'nm0004056', 'nm1061623', 'nm1371524', 'nm2350892', 'nm0192478', 'nm0060347', 'nm1160071', 'nm3268751', 'nm4562662', 'nm0190859', 'nm0124112', 'nm5050824', 'nm9511013', 'nm0720000', 'nm0919012', 'nm2999308', 'nm0743555', 'nm0367431', 'nm0003080', 'nm3228270', 'nm0068385', 'nm0322515', 'nm0464123', 'nm0517597', 'nm1130275', 'nm3231469', 'nm0325175', 'nm0463596', 'nm1165576', 'nm5449600', 'nm0039468', 'nm0469823', 'nm0751481', 'nm0584485', 'nm6497799', 'nm0334353', 'nm0004111', 'nm0625379', 'nm8734870', 'nm0325776', 'nm0075644', 'nm2905562', 'nm1097114', 'nm0030735', 'nm0287836', 'nm0807421', 'nm0243659', 'nm2918981', 'nm3893748', 'nm0932992', 'nm0831457', 'nm0911642', 'nm0323670', 'nm3728575', 'nm0003116', 'nm0229544', 'nm0132257', 'nm1427994', 'nm11065997', 'nm2123102', 'nm0765761', 'nm1317966', 'nm0777593', 'nm0047028', 'nm7920254', 'nm3159714', 'nm8664327', 'nm0814734', 'nm0824882', 'nm0814716', 'nm0103855', 'nm0525782', 'nm0000797', 'nm0194274', 'nm0614682', 'nm12319791', 'nm0437802', 'nm0087503', 'nm3732772', 'nm5956405', 'nm0579580', 'nm1050748', 'nm2928920', 'nm0111735', 'nm3881964', 'nm0436284', 'nm1248193', 'nm0782947', 'nm0543032', 'nm0714940', 'nm0274659', 'nm1760744', 'nm2057169', 'nm0716814', 'nm7958316', 'nm0672492', 'nm0531751', 'nm1456816', 'nm0290371', 'nm3771227', 'nm0131969', 'nm2253160', 'nm1169248', 'nm0700335', 'nm0015328', 'nm0000698', 'nm12904118', 'nm0210701'] in column 0 during transform

In [13]:
import pandas as pd
import numpy as np
import torch
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

# Split df_train into train and validation (80/20 split)
df_train_small, df_val_small = train_test_split(df_train, test_size=0.2, random_state=42, stratify=df_train["label"])

# # Define Features
# bert_columns = [f"bert_{i}" for i in range(df_train["distilbert_embedding"][0].shape[0])]

# # Expand BERT embeddings
# bert_train_small = pd.DataFrame(df_train_small["distilbert_embedding"].tolist(), columns=bert_columns)
# bert_val_small = pd.DataFrame(df_val_small["distilbert_embedding"].tolist(), columns=bert_columns)

# # Reset index
# bert_train_small.index = df_train_small.index
# bert_val_small.index = df_val_small.index

# # Merge expanded embeddings
# df_train_small = pd.concat([df_train_small, bert_train_small], axis=1)
# df_val_small = pd.concat([df_val_small, bert_val_small], axis=1)

# Define Features & Target
features = ["Year", "runtimeMinutes", "numVotes", "director_id", "writer_id", "word_count"]
X_train_small = df_train_small[features]
y_train_small = df_train_small["label"]
X_val_small = df_val_small[features]
y_val_small = df_val_small["label"]

# ✅ Check if all features exist
print("Existing features in X_train_small:", X_train_small.columns.tolist())

# Preprocessing Pipeline
numeric_features = ["Year", "runtimeMinutes", "numVotes"]
categorical_features = ["director_id", "writer_id"]

# ✅ Fix: Handle unknown categories in `OrdinalEncoder`
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), categorical_features)  # ✅ Fix applied
    ]
)

# Train RandomForest Model
model = Pipeline([
    ("preprocessing", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
])

print("🔹 Training model on 80% of train data...")
model.fit(X_train_small, y_train_small)

# Generate Predictions on Validation Set
y_val_pred = model.predict(X_val_small)

# Compute Evaluation Metrics
accuracy = accuracy_score(y_val_small, y_val_pred)
precision = precision_score(y_val_small, y_val_pred, average="binary")
recall = recall_score(y_val_small, y_val_pred, average="binary")
f1 = f1_score(y_val_small, y_val_pred, average="binary")

# Print Results
print("\n📊 Model Evaluation on Train Data Split (80/20)")
print(f"✅ Accuracy: {accuracy:.4f}")
print(f"✅ Precision: {precision:.4f}")
print(f"✅ Recall: {recall:.4f}")
print(f"✅ F1 Score: {f1:.4f}")


Existing features in X_train_small: ['Year', 'runtimeMinutes', 'numVotes', 'director_id', 'writer_id']
🔹 Training model on 80% of train data...

📊 Model Evaluation on Train Data Split (80/20)
✅ Accuracy: 0.5496
✅ Precision: 0.5287
✅ Recall: 0.9336
✅ F1 Score: 0.6751
