In [4]:
#Installing initial libraries
import pandas as pd
import numpy as np
import os
from sklearn.base import BaseEstimator, TransformerMixin
import re
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from imblearn.pipeline import Pipeline  
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score, roc_curve, ConfusionMatrixDisplay, classification_report

In [5]:
#Load Dataframe
path = r"C:\Users\HP\Desktop\UI\important\Data Science\movie-success-prediction\data\processed\cleaned_df_no_outliers.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,name,imdb_id,year,ratings,vote_count,movie_duration,movie_certification,genre,log_vote_count,popularity_tier,avg_vote_per_genre,avg_vote_per_cert,avg_vote_per_year,decade,avg_vote_per_decade,movie_age,movie_certification_cat,avg_vote_per_cert_cat,movie_success
0,Superman,tt5950044,2025,7.2,305000,129,12A,Action,12.62807,Blockbuster,226230.987618,169157.62908,10678.337553,2020,30174.873313,0,Teens,118375.761017,1
1,One Battle After Another,tt30144839,2025,8.5,15000,161,15,Action,9.615872,Well-Known,226230.987618,84441.464933,10678.337553,2020,30174.873313,0,Mature_Audience,82746.035862,1
2,Demon Slayer: Kimetsu no Yaiba Infinity Castle,tt32820897,2025,8.6,37000,155,15,Action,10.5187,Blockbuster,226230.987618,84441.464933,10678.337553,2020,30174.873313,0,Mature_Audience,82746.035862,1
3,KPop Demon Hunters,tt14205554,2025,7.6,79000,95,PG,Action,11.277216,Blockbuster,226230.987618,57792.561823,10678.337553,2020,30174.873313,0,Parental_Guidance,56530.128619,1
4,F1,tt16311594,2025,7.8,214000,155,12A,Action,12.273736,Blockbuster,226230.987618,169157.62908,10678.337553,2020,30174.873313,0,Teens,118375.761017,1


In [6]:
#Note that because I am using a cleaned data, I do not need this datacleaner function, but if the data is not, then you may need it
#you can refer to the data_cleaning notebook to see the data cleaning process and maybe why
#Tranformer for data cleaning (I'd not include it in my pipeline)
#dependong on your data or what you want to do, you may need to drop duplicates with subset ="name" and/or dropna on ratings outside this function
#Or depending on what you want, you can use SimpleImputer() to put mean or median or mode, also, make_pipeline to include multiple pipelines
#I dropped na because the rows without values actually do not have any values on the site and they were very small i think 5 entries(not very significant)
#including function for popularity score and decade
#popularity score was a top feature importance for my regression model and it may be for your classification model too
#then I want to use a classifier to predict if a movie is successful or not based on its ratings, sucess is defined as rating >=7.0
#I will use Logistic Regression, you can try other classifiers too
class DataCleaner(BaseEstimator, TransformerMixin):
    #BaseEstimator allows you use functions like get_params, set_params, TransformerMixin allows fit and transform
    def fit(self, X, y=None):
        # This transformer doesn't need to learn anything from the data, so we just return self.
        return self

    def transform(self, X, y=None):
        # Make a copy to avoid changing the original data
        X_copy = X.copy()

        # --- Apply cleaning functions from your notebook ---

        # 1. Clean vote_count
        def clean_vote(val):
            if isinstance(val, str):
                val = val.lower().strip()
                if "k" in val:
                    return int(float(val.replace("k", "")) * 1000)
                elif "m" in val:
                    return int(float(val.replace("m", "")) * 1_000_000)
                else:
                    return int(float(val))
            return val
        
        if 'vote_count' in X_copy.columns:
            X_copy['vote_count'] = X_copy['vote_count'].apply(clean_vote)

        # 2. Clean movie_duration
        def clean_duration(val):
            if isinstance(val, str):
                hours = 0
                minutes = 0
                h_match = re.search(r"(\d+)\s*h", val.lower())
                if h_match:
                    hours = int(h_match.group(1))
                m_match = re.search(r"(\d+)\s*m", val.lower())
                if m_match:
                    minutes = int(m_match.group(1))
                return hours * 60 + minutes
            return val
            
        if 'movie_duration' in X_copy.columns:
            X_copy['movie_duration'] = X_copy['movie_duration'].apply(clean_duration).astype(float)

        # 3. Drop rows with zero movie_duration
        if 'movie_duration' in X_copy.columns:
             X_copy = X_copy[X_copy['movie_duration'] > 0]

        # 4. Drop rows with 'Metascore' in movie_certification
        if 'movie_certification' in X_copy.columns:
            X_copy = X_copy[~X_copy['movie_certification'].str.contains("Metascore", na=False)]
        
        # 5. Create popularity_score
        if "ratings" in X_copy.columns and "vote_count" in X_copy.columns:
            X_copy['popularity_score'] = X_copy['ratings'] * np.log1p(X_copy['vote_count'])
        
        # 6. Create decade feature
        if 'year' in X_copy.columns:
            X_copy['decade'] = (X_copy['year'] // 10) * 10
            
        return X_copy

In [7]:
#Create a new colum that divides movies into successful and not successful based on ratings, successful if ratings >=7.0
df["movie_success"] = np.where(df["ratings"] >= 7.0, 1, 0)

In [8]:
df["movie_success"].value_counts() #check the balance of the target variable and success is approximately 1/3 of the data


movie_success
0    10431
1     5087
Name: count, dtype: int64

In [9]:
#Drop outliers
df = df[df["movie_duration"] <= 300] #keeping movies with duration less than 300 minutes
df.to_csv(r"C:\Users\HP\Desktop\UI\important\Data Science\movie-success-prediction\data\processed\cleaned_df_no_outliers.csv", index=False)
#Define Features X and Y
X = df.drop(["ratings", "imdb_id","name", "movie_success"], axis=1)
y = df["movie_success"]

In [10]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42) 
#0.25 to set 25% of the data as test and random_state to make sure we get the same results every time we run the code 42 is just a number, could be any other number
#split the training set for validation during model training
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=42) 
#0.2 to set 20% of the training data as validation data

In [11]:
#column groups for pipeline
num_features = ['year', 'vote_count', 'movie_duration', 'decade', "log_vote_count", 'avg_vote_per_genre', 'avg_vote_per_cert', 'avg_vote_per_year', 'avg_vote_per_decade', 'movie_age', 'avg_vote_per_cert_cat']
cat_features = ['genre', 'movie_certification', "movie_certification_cat", "avg_vote_per_cert_cat", "popularity_tier"]

In [12]:
#Preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
    ("num", StandardScaler(), num_features), #scaler helps to avoid large numbers that can skew the data
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
    ],
    remainder="passthrough" #this will keep the other columns that are not specified in num_features and cat_features
    # remainder="drop" #This will drop the name and imdb_id columns that are not specified
)

In [20]:
#Building a logistic regression model
log_model = LogisticRegression(class_weight="balanced", max_iter=1000, random_state=42)
log_reg = Pipeline(steps =[
    ('preprocessor', preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("regressor", log_model)
    ])

In [21]:
#train model
log_reg.fit(X_tr, y_tr)

0,1,2
,steps,"[('preprocessor', ...), ('smote', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,sampling_strategy,'auto'
,random_state,42
,k_neighbors,5

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [22]:
#Save model with pickle
import pickle
from pathlib import Path
path = r"C:\Users\HP\Desktop\UI\important\Data Science\movie-success-prediction\clf_models" + "\\"
with open (path + "log_reg_smote.pickle", "wb") as to_write:
    pickle.dump(log_reg, to_write)

#The model is saved as lon_reg.pickle in the clf_models folder, I should comment out the saving and the fit part, but the model didnt take long to train so I won't comment it out

In [23]:
#predict on test set
y_pred = log_reg.predict(X_val)
y_pred_proba = log_reg.predict_proba(X_val)[:, 1]  # for ROC-AUC
y_pred_proba

array([0.50650491, 0.62477888, 0.38597656, ..., 0.92473041, 0.52106821,
       0.51818448], shape=(2328,))

In [24]:

# --- Evaluation ---
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred, zero_division=0)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_pred_proba)

print(f"Accuracy:  {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1-score:  {f1:.3f}")
print(f"ROC-AUC:   {roc_auc:.3f}")

# Optional detailed reports
print("\nClassification Report:")
print(classification_report(y_val, y_pred, digits=3))

print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred))

Accuracy:  0.700
Precision: 0.533
Recall:    0.683
F1-score:  0.599
ROC-AUC:   0.774

Classification Report:
              precision    recall  f1-score   support

           0      0.821     0.709     0.761      1565
           1      0.533     0.683     0.599       763

    accuracy                          0.700      2328
   macro avg      0.677     0.696     0.680      2328
weighted avg      0.727     0.700     0.708      2328


Confusion Matrix:
[[1109  456]
 [ 242  521]]


In [None]:
def model_dev_classification(model, model_name, preprocessor, X_tr, X_val, y_tr, y_val, use_smote=True):
    """
    Trains a classification model using a preprocessing pipeline,
    (optionally) applies SMOTE for class balancing,
    evaluates it, saves the model, and returns a summary DataFrame.
    """

    #SMOTE is enabled, imblearn pipeline is used and inserted before classifier
    if use_smote:
        steps = [
            ('preprocessor', preprocessor),
            ('smote', SMOTE(random_state=42)),
            ('classifier', model)
        ]
    else:
        steps = [
            ('preprocessor', preprocessor),
            ('classifier', model)
        ]

    clf = Pipeline(steps=steps)

    #  If model supports class_weight and SMOTE is NOT used, apply it automatically
    if not use_smote and hasattr(model, "class_weight"):
        model.set_params(class_weight='balanced')

    #  Train model
    clf.fit(X_tr, y_tr)

    # Save model
    path = r"C:\Users\HP\Desktop\UI\important\Data Science\movie-success-prediction\clf_models"
    filename = f"{model_name}{'_smote' if use_smote else ''}.pickle"
    with open(f"{path}\\{filename}", "wb") as f:
        pickle.dump(clf, f)

    #  Predict on validation set
    y_pred = clf.predict(X_val)
    y_pred_proba = clf.predict_proba(X_val)[:, 1] if hasattr(clf.named_steps['classifier'], "predict_proba") else None

    #  Compute metrics
    acc = round(accuracy_score(y_val, y_pred), 3)
    prec = round(precision_score(y_val, y_pred, zero_division=0), 3)
    rec = round(recall_score(y_val, y_pred), 3)
    f1 = round(f1_score(y_val, y_pred), 3)
    roc_auc = round(roc_auc_score(y_val, y_pred_proba), 3) if y_pred_proba is not None else np.nan

    #  Create result DataFrame
    result_df = pd.DataFrame({
        "Model": [model_name + ("_SMOTE" if use_smote else "")],
        "Accuracy": [acc],
        "Precision": [prec],
        "Recall": [rec],
        "F1_Score": [f1],
        "ROC_AUC": [roc_auc]
    })

    globals()[f"{model_name}_result"] = result_df
    return result_df

In [19]:
# Example models
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42, n_jobs=-1),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42, n_jobs=-1, eval_metric='logloss')
}

results = []

for name, model in models.items():
    res = model_dev_classification(model, name, preprocessor, X_tr, X_val, y_tr, y_val)
    results.append(res)

final_results = pd.concat(results, ignore_index=True)
final_results.sort_values(by="F1_Score", ascending=False)



Unnamed: 0,Model,Accuracy,Precision,Recall,F1_Score,ROC_AUC
3,GradientBoosting_SMOTE,0.744,0.6,0.654,0.626,0.809
2,RandomForest_SMOTE,0.756,0.636,0.596,0.616,0.806
5,XGBoost_SMOTE,0.754,0.63,0.602,0.616,0.815
4,AdaBoost_SMOTE,0.702,0.535,0.693,0.604,0.758
0,LogisticRegression_SMOTE,0.7,0.533,0.683,0.599,0.774
1,DecisionTree_SMOTE,0.695,0.533,0.564,0.548,0.661


In [17]:
#I am going with the xgboost classifier as it has the best f1 score amongst the models I can tune, you can try to tune other models too
#Or try increasing the threshold for success prediction for the logistic regression model
path = r"C:\Users\HP\Desktop\UI\important\Data Science\movie-success-prediction\clf_models" + "\\"
with open (path + "XGBoost.pickle", "rb") as to_read:
    xgb_clf= pickle.load(to_read)

In [18]:
#fine tuning can be done using GridSearchCV or RandomizedSearchCV from sklearn.model_selection
param_distributions = {
    # IMPORTANT: Notice the 'classifier__' prefix. This is required
    # to tell the pipeline which step to apply the parameters to.
    'classifier__n_estimators': [100, 200, 500],
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'classifier__max_depth': [3, 5, 7, 9],
    'classifier__subsample': [0.7, 0.8, 0.9, 1.0],
    'classifier__colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'classifier__gamma': [0, 0.1, 0.5, 1]
}


# Set up RandomizedSearchCV
# We'll search over 50 different combinations ('n_iter=50')
# We'll use 5-fold cross-validation ('cv=5')
# We want to optimize for F1-score because of the imbalance ('scoring='f1'')
random_search = RandomizedSearchCV(
    estimator= xgb_clf,
    param_distributions=param_distributions,
    n_iter=50,  # Number of parameter settings that are sampled.
    cv=5,       # Number of cross-validation folds.
    scoring='f1', # The metric to optimize. 'roc_auc' is also a great choice.
    n_jobs=-1,  # Use all available CPU cores.
    random_state=42,
    verbose=2 # Shows progress
)

# Run the search on your training data
print("--- Starting Randomized Search ---")
random_search.fit(X_tr, y_tr)


# Get the best results
print("\n--- Best Parameters Found ---")
print(random_search.best_params_)

print("\n--- Best F1-Score from Cross-Validation ---")
print(random_search.best_score_)


# Evaluate the best model on the unseen validation set
# The 'random_search' object now contains the best version of the pipeline
best_model = random_search.best_estimator_
y_pred_val = best_model.predict(X_val)

print("\n--- Classification Report on Validation Set ---")
print(classification_report(y_val, y_pred_val))


--- Starting Randomized Search ---
Fitting 5 folds for each of 50 candidates, totalling 250 fits

--- Best Parameters Found ---
{'classifier__subsample': 0.7, 'classifier__n_estimators': 500, 'classifier__max_depth': 3, 'classifier__learning_rate': 0.1, 'classifier__gamma': 0, 'classifier__colsample_bytree': 0.9}

--- Best F1-Score from Cross-Validation ---
0.6259379301747471

--- Classification Report on Validation Set ---
              precision    recall  f1-score   support

           0       0.79      0.88      0.83      1565
           1       0.69      0.52      0.59       763

    accuracy                           0.76      2328
   macro avg       0.74      0.70      0.71      2328
weighted avg       0.76      0.76      0.75      2328



In [21]:
  path = r"C:\Users\HP\Desktop\UI\important\Data Science\movie-success-prediction\clf_models" + "\\"
  filename = "RS_XBG.pickle"
  with open(path + filename, "wb") as f:
    pickle.dump(random_search.best_estimator_, f)

In [22]:
print(f1_score(y_val, y_pred_val))
print(roc_auc_score(y_val, y_pred_val))
print(accuracy_score(y_val, y_pred_val))
print(recall_score(y_val, y_pred_val))
print(precision_score(y_val, y_pred_val))

0.591044776119403
0.7016744898856457
0.7646048109965635
0.5190039318479686
0.6863084922010398


In [None]:
#Define the model and pipeline
neg_count = y_tr.value_counts()[0]
pos_count = y_tr.value_counts()[1]
scale_pos_weight = neg_count / pos_count


# Define the hyperparameter grid for GridSearchCV
# These ranges are centered around the best parameters found by RandomizedSearchCV.
param_grid = {
    # IMPORTANT: Notice the 'classifier__' prefix.
    'classifier__n_estimators': [450, 500, 550],
    'classifier__learning_rate': [0.05, 0.1, 0.15],
    'classifier__max_depth': [2, 3, 4],
    'classifier__subsample': [0.6, 0.7, 0.8],
    'classifier__colsample_bytree': [0.8, 0.9, 1.0],
    'classifier__gamma': [0, 0.05, 0.1]
}


# 3. Set up GridSearchCV
# We'll use 5-fold cross-validation ('cv=5')
# We want to optimize for F1-score because of the imbalance ('scoring='f1'')
grid_search = GridSearchCV(
    estimator= xgb_clf,
    param_grid=param_grid,
    cv=5,       # Number of cross-validation folds.
    scoring='f1', # The metric to optimize. 'roc_auc' is also a great choice.
    n_jobs=-1,  # Use all available CPU cores.
    verbose=2 # Shows progress
)

# 4. Run the search on your training data
print("--- Starting Grid Search ---")
grid_search.fit(X_tr, y_tr)


# 5. Get the best results
print("\n--- Best Parameters Found ---")
print(grid_search.best_params_)

print("\n--- Best F1-Score from Cross-Validation ---")
print(grid_search.best_score_)


# 6. Evaluate the best model on the unseen validation set
# The 'grid_search' object now contains the best version of the pipeline
best_model = grid_search.best_estimator_
y_pred_val = best_model.predict(X_val)

print("\n--- Classification Report on Validation Set ---")
print(classification_report(y_val, y_pred_val))



--- Starting Grid Search ---
Fitting 5 folds for each of 729 candidates, totalling 3645 fits


In [None]:
  path = r"C:\Users\HP\Desktop\UI\important\Data Science\movie-success-prediction\clf_models" + "\\"
  filename = "GS_XBG.pickle"
  with open(path + filename, "wb") as f:
    pickle.dump(grid_search.best_estimator_, f)

In [None]:
print(f1_score(y_val, y_pred_val))
print(roc_auc_score(y_val, y_pred_val))
print(accuracy_score(y_val, y_pred_val))
print(recall_score(y_val, y_pred_val))
print(precision_score(y_val, y_pred_val))


0.5897435897435898
0.7012733492728803
0.7663230240549829
0.5124508519003932
0.69449378330373
