In [41]:
#This notebook is for model development but we are dropping movivies with movie duration greater than 300 minutes as they are outliers
#might also do some feature engineering here
#Installing initial libraries, others will be installed as we go
import pandas as pd
import numpy as np
import pickle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [27]:
#Load Dataframe
path = r"C:\Users\HP\Desktop\UI\important\Data Science\movie-success-prediction\data\processed\cleaned_movies_data.csv"
df = pd.read_csv(path)

In [28]:
#in a previous notebook, I defined a function that can be used in a pipeline to to clean the data, you can check models_dev_outliers.ipynb for that.

In [29]:
#some more feature engineering
# creating a new feature 'popularity_score' as a combination of 'ratings' and 'vote_count' to better capture movie popularity
df['popularity_score'] = (df['ratings'] * np.log1p(df['vote_count'])).astype(int)

df['decade'] = (df['year'] // 10) * 10
# This will turn 1998 into 1990, 2012 into 2010, etc

In [30]:
#Drop outliers
df = df[df["movie_duration"] <= 300] #keeping movies with duration less than 300 minutes
#Define Features X and Y
X = df.drop(["ratings", "imdb_id","name"], axis=1)
y = df["ratings"]

In [31]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) 
#0.25 to set 25% of the data as test and random_state to make sure we get the same results every time we run the code 42 is just a number, could be any other number
#split the training set for validation during model training
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42) 
#0.2 to set 20% of the training data as validation data

In [32]:
#column groups for pipeline
num_features = ['year', 'vote_count', 'movie_duration', 'popularity_score', 'decade']
cat_features = ['genre', 'movie_certification']

In [33]:
#Preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
    ("num", StandardScaler(), num_features), #to standardize the data
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
    ],
    remainder="passthrough" #This will allow other columns not specified in num_features and cat_features to pass through without any changes
)

In [34]:
#Building a linear regression model
lin_model = LinearRegression(n_jobs= -1)
lin_reg = Pipeline(steps =[
    ('preprocessor', preprocessor),
    ("regressor", lin_model)
    ])

In [35]:
#train model
lin_reg.fit(X_tr, y_tr)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,-1
,positive,False


In [None]:
#Save model with pickle
import pickle
path = r"C:\Users\HP\Desktop\UI\important\Data Science\movie-success-prediction\models" + "\\" #pathlib can also be used
with open (path + "Nlin_reg.pickle", "wb") as to_write:
    pickle.dump(lin_reg, to_write)

#The model is saved as lin_reg.pickle in the models folder, I should comment out the saving and the fit part, but the model didnt take long to train so I won't comment it out

In [37]:
#Read the saved model
with open (path + "Nlin_reg.pickle", "rb") as to_read:
    lin_reg = pickle.load(to_read)

In [None]:
#predict on valuation set
y_pred = lin_reg.predict(X_val)

In [39]:
#Model evaluation
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred)

print(f"MAE: {mae:.3f}")
print(f"MSE: {mse:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R²: {r2:.3f}")

MAE: 0.460
MSE: 0.434
RMSE: 0.659
R²: 0.628


In [55]:
#define a function that develops the model and saves it and runs the test and saves the result as df
#I'll concatenate the results of each model to find the best model
#but in the last notebook, the function will run for all at once and return the concanated result
def model_dev(model, model_name):
    reg = Pipeline(steps =[
    ('preprocessor', preprocessor),
    ("regressor", model)
    ])
    #train model
    reg.fit(X_tr, y_tr)
    #Save model with pickle
    path = r"C:\Users\HP\Desktop\UI\important\Data Science\movie-success-prediction\models" + "\\"
    filename = f"{model_name}.pickle"
    with open (path + filename, "wb") as to_write:
        pickle.dump(reg, to_write)
    #Read the saved model
    with open (path + filename, "rb") as to_read:
        loaded_model = pickle.load(to_read)
    #predict on test set
    y_pred = loaded_model.predict(X_val)
    #Model evaluation
    mae = round(mean_absolute_error(y_val, y_pred), 3)
    mse = round(mean_squared_error(y_val, y_pred), 3)
    rmse = round(np.sqrt(mse), 3)
    r2 = round(r2_score(y_val, y_pred), 3)
    #make result dataframe
    result_df = pd.DataFrame({"model": [model_name],
                                    "MAE": [mae],
                                    "MSE":[mse],
                                    "RMSE":[rmse],
                                    "R2":[r2]})
    globals()[f"{model_name}_result"] = result_df
    return result_df

#applying function not necessary, but to confirm the result above
lin_model = LinearRegression(n_jobs= -1)
model_dev(lin_model, "lin_reg")

Unnamed: 0,model,MAE,MSE,RMSE,R2
0,lin_reg,0.46,0.434,0.659,0.628


In [56]:
#Decision Tree Regressor model
from sklearn.tree import DecisionTreeRegressor
tree_model = DecisionTreeRegressor(random_state=42)
model_dev(tree_model, "tree_reg")
all_results = pd.concat([lin_reg_result, tree_reg_result], ignore_index=True)
all_results


Unnamed: 0,model,MAE,MSE,RMSE,R2
0,lin_reg,0.46,0.434,0.659,0.628
1,tree_reg,0.055,0.013,0.114,0.989


In [None]:
#train model with decision tree
tree_reg.fit(X_tr, y_tr)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [None]:
#save tree model
with open(path+ "Ntree_reg.pickle", "wb") as to_write:
    pickle.dump(tree_reg, to_write)

In [None]:
#read tree model
with open(path+"Ntree_reg.pickle", "rb") as to_read:
    tree_reg = pickle.load(to_read)


In [None]:
#predict test data
y_pred = tree_reg.predict(X_val)

In [None]:
#Model Evaluation
#Model evaluation
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred)

print(f"MAE: {mae:.3f}")
print(f"MSE: {mse:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R²: {r2:.3f}")

MAE: 0.850
MSE: 1.400
RMSE: 1.183
R²: -0.296


In [28]:
#Random Forest Models and Ensemble models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

rf = RandomForestRegressor(random_state=42)
rf_reg = Pipeline(steps=[
    ("preprocessor", preprocessor),
("randomforestregressor", rf)
])
rf_reg.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('randomforestregressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [29]:
#save model
with open(path+"rf_reg.pickle", "wb") as to_write:
    pickle.dump(rf_reg, to_write)

In [30]:
#read model
with open(path+"rf_reg.pickle", "rb") as to_read:
    rf_reg = pickle.load(to_read)

In [31]:
#predict with model
y_pred = rf_reg.predict(X_test)

In [32]:
#Evaluation
#Model evaluation
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.3f}")
print(f"MSE: {mse:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R²: {r2:.3f}")

MAE: 0.619
MSE: 0.742
RMSE: 0.862
R²: 0.313


In [33]:
#Gradientboosting model
gb = GradientBoostingRegressor(random_state=42)
gb_reg = Pipeline(steps=[
    ("preprocessor", preprocessor),
("gradientregressor", gb)
])
gb_reg.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('gradientregressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,loss,'squared_error'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [34]:
#save model
with open(path+"gb_reg.pickle", "wb") as to_write:
    pickle.dump(gb_reg, to_write)

In [35]:
#read model
with open(path+"gb_reg.pickle", "rb") as to_read:
    gb_reg = pickle.load(to_read)

In [36]:
#predict with model
y_pred = gb_reg.predict(X_test)

In [37]:
#Evaluation
#Model evaluation
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.3f}")
print(f"MSE: {mse:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R²: {r2:.3f}")

MAE: 0.616
MSE: 0.712
RMSE: 0.844
R²: 0.340


In [38]:
#Instantiate and train AdaBoostRegressor
from sklearn.ensemble import AdaBoostRegressor


ab = AdaBoostRegressor(random_state=42)
ab_reg = Pipeline(steps=[
    ("preprocessor", preprocessor),
("adaboostregressor", ab)
])
ab_reg.fit(X_train, y_train)



0,1,2
,steps,"[('preprocessor', ...), ('adaboostregressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,estimator,
,n_estimators,50
,learning_rate,1.0
,loss,'linear'
,random_state,42


In [40]:
#save model
with open(path+"ab_reg.pickle", "wb") as to_write:
    pickle.dump(ab_reg, to_write)

In [41]:
#read model
with open(path+"ab_reg.pickle", "rb") as to_read:
    ab_reg = pickle.load(to_read)

In [42]:
#predict with model
y_pred = ab_reg.predict(X_test)

In [43]:
#Model evaluation
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.3f}")
print(f"MSE: {mse:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R²: {r2:.3f}")

MAE: 0.838
MSE: 1.110
RMSE: 1.053
R²: -0.027


In [44]:
#instantiate and train XGBRegressor
from xgboost import XGBRegressor
xgb = XGBRegressor(random_state=42)
xgb_reg = Pipeline(steps=[
    ("preprocessor", preprocessor),
("xgbregressor", xgb)
])
xgb_reg.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('xgbregressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [45]:
#save model
with open(path+"xgb_reg.pickle", "wb") as to_write:
    pickle.dump(xgb_reg, to_write)

In [46]:
#read model
with open(path+"ab_reg.pickle", "rb") as to_read:
    ab_reg = pickle.load(to_read)

In [49]:
#Predict with model
y_pred = xgb_reg.predict(X_test)

In [50]:
#Model evaluation
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.3f}")
print(f"MSE: {mse:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R²: {r2:.3f}")


MAE: 0.613
MSE: 0.739
RMSE: 0.859
R²: 0.316


In [None]:
#Depending on what you want, you might want to consider Xgboost for your search as it is less computationally expensive and faster than gradientboosting expensive compared to gradientboosting
#I should have used a validation set but it escaped my mind
#model selection based on predicting on test setis tunning the model on the test set, which might lead to overfitting
#I'd be using the gradientboosting model because it has the best overall score for randomized and grid search

from sklearn.model_selection import RandomizedSearchCV

# These are the parameter ranges for RandomizedSearchCV to sample from.
# This works by telling the search to apply the parameter to the 'gradientregressor' step
param_dist = {
    'gradientregressor__n_estimators': [100, 200, 300, 500, 1000],
    'gradientregressor__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'gradientregressor__max_depth': [3, 5, 8, 10],
    'gradientregressor__subsample': [0.7, 0.8, 0.9, 1.0],
    'gradientregressor__min_samples_split': [2, 5, 10],
    'gradientregressor__min_samples_leaf': [1, 2, 4],
    'gradientregressor__loss': ['squared_error', 'huber', 'absolute_error'] 
}



# 3. Set up RandomizedSearchCV
# n_iter: Number of random combinations to try. A good starting point is 50-100.
# cv: Number of cross-validation folds. 5 is a common choice.
# n_jobs=-1: Use all available CPU cores to speed up the process.
random_search = RandomizedSearchCV(
    estimator=gb_reg, #base model that I had set up earlier
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

In [None]:
#fit model
#random_search.fit(X_train, y_train) This line is commented out because it takes a long time to run, you can uncomment it if you want to run it

Fitting 5 folds for each of 50 candidates, totalling 250 fits


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_distributions,"{'gradientregressor__learning_rate': [0.01, 0.05, ...], 'gradientregressor__loss': ['squared_error', 'huber', ...], 'gradientregressor__max_depth': [3, 5, ...], 'gradientregressor__min_samples_leaf': [1, 2, ...], ...}"
,n_iter,50
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,loss,'huber'
,learning_rate,0.05
,n_estimators,500
,subsample,0.7
,criterion,'friedman_mse'
,min_samples_split,5
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_depth,5
,min_impurity_decrease,0.0


In [None]:
#save model
#with open(path+"random_search.pickle", "wb") as to_write:
    #pickle.dump(random_search, to_write)

In [61]:
#read model
with open(path+"random_search.pickle", "rb") as to_read:
    random_search = pickle.load(to_read)

In [62]:
#check best parameters
print("Best Parameters:", random_search.best_params_)

Best Parameters: {'gradientregressor__subsample': 0.7, 'gradientregressor__n_estimators': 500, 'gradientregressor__min_samples_split': 5, 'gradientregressor__min_samples_leaf': 2, 'gradientregressor__max_depth': 5, 'gradientregressor__loss': 'huber', 'gradientregressor__learning_rate': 0.05}


In [65]:

# Convert the results dictionary to a Pandas DataFrame
results_df = pd.DataFrame(random_search.cv_results_)

# Sort the DataFrame by rank to see the best results at the top
results_df = results_df.sort_values(by="rank_test_score")

# Display the most important columns for the top 5 models
results_df[[
    'rank_test_score',
    'mean_test_score',
    'std_test_score',
    'params'
]].head(5)

Unnamed: 0,rank_test_score,mean_test_score,std_test_score,params
45,1,0.36483,0.010551,"{'gradientregressor__subsample': 0.7, 'gradien..."
49,2,0.36307,0.010255,"{'gradientregressor__subsample': 0.8, 'gradien..."
13,3,0.362836,0.009076,"{'gradientregressor__subsample': 0.8, 'gradien..."
19,4,0.358235,0.012063,"{'gradientregressor__subsample': 0.8, 'gradien..."
40,5,0.357186,0.009033,"{'gradientregressor__subsample': 1.0, 'gradien..."


In [66]:
#predict with the best estimator
y_pred = random_search.best_estimator_.predict(X_test)

In [67]:
#Model evaluation
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.3f}")
print(f"MSE: {mse:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R²: {r2:.3f}")


MAE: 0.587
MSE: 0.684
RMSE: 0.827
R²: 0.367


In [None]:
#The random search did not improve the model a lot, so, I would leave the base model, and considering the computational cost, I wouldn't do a gridsearch, it is not assuring that it would be different.
#What can be done is try other feature engineering, like i would remove outliers in another notebook.
#You can scrape for more things, include budget maybe, or other things that might be relevant