In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Column groups: each group is routed to its own transformer in the
# preprocessor defined further down.
numeric_features = ["startYear", "runtimeMinutes", "numVotes"]
categorical_features = ["director_id", "writer_id"]
features = numeric_features + categorical_features

# Read the preprocessed training set from disk.
df_train = pd.read_csv("final_training_data.csv")

# Model inputs and the target column.
X = df_train[features]
y = df_train["label"]

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Preprocessing shared by every model: standardize the numeric columns and
# one-hot encode the categorical ones. With handle_unknown="ignore" a category
# that was not seen during fit does not raise at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# Candidate classifiers to compare, keyed by the display name used in the
# printed results.
# NOTE: `use_label_encoder` is intentionally not passed to XGBClassifier.
# Current XGBoost releases no longer accept it and warn
# 'Parameters: { "use_label_encoder" } are not used.' when it is supplied.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(eval_metric="logloss"),
    "SVM": SVC(kernel="linear", probability=True),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
}

# Per-model metrics, keyed by model name. Each entry holds the training
# accuracy, the validation accuracy and the validation classification report.
model_results = {}

# Fit every candidate behind the same preprocessing step and score it on both
# the training and the validation split.
for name, model in models.items():
    print(f"\n🔹 Training {name}...")

    # Chain preprocessing and classifier so both are fitted on X_train only.
    pipeline = Pipeline([
        ("preprocessing", preprocessor),
        ("classifier", model),
    ])
    pipeline.fit(X_train, y_train)

    # Predict on both splits; comparing the two accuracies exposes overfitting.
    y_train_pred = pipeline.predict(X_train)
    y_val_pred = pipeline.predict(X_val)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    val_accuracy = accuracy_score(y_val, y_val_pred)

    # Build the validation report once; it is both printed and stored below.
    val_report = classification_report(y_val, y_val_pred)

    print(f"✅ {name} Training Accuracy: {train_accuracy:.4f}")
    print(f"✅ {name} Validation Accuracy: {val_accuracy:.4f}")
    print(f"📊 Classification Report for {name}:\n", val_report)

    model_results[name] = {
        "train_accuracy": train_accuracy,
        "val_accuracy": val_accuracy,
        "classification_report": val_report,
    }

# Select the model name with the highest validation accuracy (on a tie the
# earliest entry in model_results wins) and print its stored report.
best_model = max(
    model_results.items(),
    key=lambda entry: entry[1]["val_accuracy"],
)[0]
print(f"\n🎯 Best Model Based on Validation Accuracy: {best_model}")
best_report = model_results[best_model]["classification_report"]
print(best_report)



🔹 Training Logistic Regression...
✅ Logistic Regression Training Accuracy: 0.9296
✅ Logistic Regression Validation Accuracy: 0.7670
📊 Classification Report for Logistic Regression:
               precision    recall  f1-score   support

       False       0.75      0.81      0.78       820
        True       0.78      0.72      0.75       772

    accuracy                           0.77      1592
   macro avg       0.77      0.77      0.77      1592
weighted avg       0.77      0.77      0.77      1592


🔹 Training Random Forest...
✅ Random Forest Training Accuracy: 1.0000
✅ Random Forest Validation Accuracy: 0.7619
📊 Classification Report for Random Forest:
               precision    recall  f1-score   support

       False       0.76      0.80      0.77       820
        True       0.77      0.73      0.75       772

    accuracy                           0.76      1592
   macro avg       0.76      0.76      0.76      1592
weighted avg       0.76      0.76      0.76      1592


🔹 Training XGBoost...

Parameters: { "use_label_encoder" } are not used.



✅ XGBoost Training Accuracy: 0.8371
✅ XGBoost Validation Accuracy: 0.7437
📊 Classification Report for XGBoost:
               precision    recall  f1-score   support

       False       0.74      0.79      0.76       820
        True       0.75      0.70      0.73       772

    accuracy                           0.74      1592
   macro avg       0.74      0.74      0.74      1592
weighted avg       0.74      0.74      0.74      1592


🔹 Training SVM...
✅ SVM Training Accuracy: 0.9895
✅ SVM Validation Accuracy: 0.7764
📊 Classification Report for SVM:
               precision    recall  f1-score   support

       False       0.76      0.83      0.79       820
        True       0.80      0.72      0.76       772

    accuracy                           0.78      1592
   macro avg       0.78      0.77      0.78      1592
weighted avg       0.78      0.78      0.78      1592


🔹 Training Gradient Boosting...
✅ Gradient Boosting Training Accuracy: 0.7599
✅ Gradient Boosting Validation Accur