In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import shap

# ---------------------------
# 1. Data Preprocessing + Feature Engineering
# ---------------------------
def add_features(df):
    """Clean the raw movie columns in place and append group-mean features.

    The frame that is passed in is modified directly and is also returned.

    Steps, in order:
      * missing ``Director`` / ``Genre`` values become the label ``'Unknown'``;
      * ``Duration`` is coerced to numeric (unparseable entries become NaN);
        NaNs are then replaced by the column median, or the whole column is
        set to 0 when not a single value could be parsed;
      * ``Genre`` and ``Director`` are replaced by their integer category
        codes;
      * ``Director_success`` / ``Genre_success`` receive the mean ``Rating``
        of each row's director / genre group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Director``, ``Genre``, ``Duration`` and ``Rating``.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    # Missing labels share a single 'Unknown' bucket.
    for label_col in ('Director', 'Genre'):
        df[label_col] = df[label_col].fillna('Unknown')

    minutes = pd.to_numeric(df['Duration'], errors='coerce')
    if minutes.isnull().sum() < len(df):
        # At least one usable value: impute the gaps with the median.
        df['Duration'] = minutes.fillna(minutes.median())
    else:
        # Nothing parseable, so there is no median to fall back on.
        df['Duration'] = 0

    # Encode the text labels as integer codes.
    for label_col in ('Genre', 'Director'):
        df[label_col] = df[label_col].astype('category').cat.codes

    # Mean rating per group, broadcast back onto every row of that group.
    for label_col, mean_col in (('Director', 'Director_success'),
                                ('Genre', 'Genre_success')):
        df[mean_col] = df.groupby(label_col)['Rating'].transform('mean')

    return df

# ---------------------------
# 2. Load + Prepare Data
# ---------------------------
# Machine-specific absolute path to the cleaned dataset; adjust it when running
# this notebook anywhere else.
df = pd.read_csv(r'C:\Users\DELL\movie-rating-predictor\data\processed\movies_clean.csv', encoding='latin1')

# Rating is the prediction target. Rows without a label are dropped instead of
# being filled with the median: an imputed target is a fabricated label that
# the model would be trained on and scored against, and it would also skew the
# Director_success / Genre_success group means computed by add_features.
# .copy() gives add_features an independent frame to modify in place.
df = df.dropna(subset=['Rating']).copy()

df = add_features(df)

# NOTE(review): Director_success and Genre_success are group means of Rating
# computed over the whole frame, so every cross-validation fold below sees
# target information from its held-out rows. Consider computing these features
# inside each training fold to get an honest error estimate.
X = df[['Duration', 'Genre', 'Director', 'Director_success', 'Genre_success']]
y = df['Rating']

# ---------------------------
# 3. K-Fold Cross Validation
# ---------------------------
# Baseline: default-parameter gradient boosting, scored over five shuffled,
# seeded folds so the split is reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
model = GradientBoostingRegressor(random_state=42)
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_root_mean_squared_error',
    cv=kfold,
)

# The scorer returns negated RMSE (higher is better), so flip the sign back
# before reporting.
print("K-Fold RMSE Scores:", -scores)
print("Mean RMSE:", -scores.mean())

# ---------------------------
# 4. Hyperparameter Tuning
# ---------------------------
# Search space: 2 * 2 * 3 * 2 = 24 parameter combinations.
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5],
}

# Reuse the shuffled, seeded `kfold` splitter from the baseline above rather
# than cv=5 (which yields unshuffled folds): the tuned RMSE is then measured on
# exactly the same splits as the baseline RMSE, so the two numbers are directly
# comparable and do not depend on the row order of the CSV.
grid_search = GridSearchCV(
    GradientBoostingRegressor(random_state=42),
    param_grid,
    cv=kfold,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,  # use every available core
    verbose=1,
)

grid_search.fit(X, y)

# best_score_ is a negated RMSE (higher is better); flip the sign for display.
print("\n✅ Best Parameters:", grid_search.best_params_)
print("Best RMSE Score (GridSearchCV):", -grid_search.best_score_)

# Save the best model. With GridSearchCV's default refit=True, best_estimator_
# is the winning configuration refit on all of X, y.
best_model = grid_search.best_estimator_
joblib.dump(best_model, r'C:\Users\DELL\movie-rating-predictor\models\movie_rating_model.pkl')

# ---------------------------
# 5. Feature Importance Plot
# ---------------------------
# Importance scores from the tuned model, paired with the column names of X.
importances = best_model.feature_importances_
feature_names = X.columns

# Horizontal bar chart: one bar per feature, bar length = importance score.
plt.figure(figsize=(8, 5))
sns.barplot(y=feature_names, x=importances)
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.title('Feature Importances')
plt.tight_layout()
plt.show()

# ---------------------------
# 6. SHAP (Optional)
# ---------------------------
# Build the explainer from the tuned model and the full feature matrix, then
# explain only the first 100 rows to keep the run time down.
explainer = shap.Explainer(best_model, X)
shap_values = explainer(X.iloc[:100])  # Use subset to speed up

# Two views of the same 100 explained rows: a bar chart, then the default
# summary plot.
shap.summary_plot(shap_values, X.iloc[:100], plot_type="bar")
shap.summary_plot(shap_values, X.iloc[:100])


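# NOTE: the recorded output below appears to be stale. It matches a Windows path written without the raw-string prefix ('C:\Users...' -> '\U' escape); both paths in the code above now use r'...'.
# SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (1062618991.py, line 77)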