In [None]:
#Importing initial libraries, others will be imported as we go
import pandas as pd
import numpy as np
import re
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay, r2_score, mean_squared_error, mean_absolute_error

: 

In [None]:
# Read the already-cleaned movies dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.
path = r"C:\Users\HP\Desktop\UI\important\Data Science\movie-success-prediction\data\processed\cleaned_movies_data.csv"
df = pd.read_csv(filepath_or_buffer=path)

In [None]:
#Note: because I am using already-cleaned data, I do not need this DataCleaner transformer here, but you may need it if your data is not cleaned
#You can refer to the data_cleaning notebook to see the data cleaning process and the reasoning behind it
#Transformer for data cleaning (I will not include it in my pipeline)
#Depending on your data or what you want to do, you may need to drop duplicates with subset="name" and/or dropna on ratings outside this transformer
#Alternatively, depending on what you want, you can use SimpleImputer() to fill in the mean, median or mode, and make_pipeline to chain multiple steps
#I dropped NA rows because the rows without values genuinely have no values on the site and there were very few of them (I think 5 entries, not very significant)
class DataCleaner(BaseEstimator, TransformerMixin):
    """Stateless transformer that normalises raw movie columns.

    Each step is applied only when its column is present in ``X``:

    * ``vote_count``: strings such as ``"1.2K"``, ``"8.2M"`` or ``"950"`` become integers.
    * ``movie_duration``: strings such as ``"2h 30m"`` become a number of minutes (float).
    * Rows whose ``movie_duration`` is not greater than zero are removed
      (a NaN duration also fails the ``> 0`` test and is removed).
    * Rows whose ``movie_certification`` contains ``"Metascore"`` are removed.

    ``transform`` may return fewer rows than it received, so a separately held
    target vector would have to be re-aligned by the caller.
    """
    # BaseEstimator supplies get_params / set_params; TransformerMixin supplies fit_transform.

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` so calls can be chained."""
        return self

    @staticmethod
    def _clean_vote(val):
        """Convert one vote-count cell to an int; non-string values pass through unchanged."""
        if not isinstance(val, str):
            return val
        text = val.lower().strip()
        # "k" is checked before "m", matching the precedence of the original if/elif chain.
        for suffix, factor in (("k", 1000), ("m", 1_000_000)):
            if suffix in text:
                # round() instead of plain int(): the float product can land just below
                # the intended integer (8.2 * 1_000_000 evaluates to 8199999.999999999),
                # and int() alone would truncate that to 8199999.
                return int(round(float(text.replace(suffix, "")) * factor))
        return int(float(text))

    @staticmethod
    def _clean_duration(val):
        """Convert one duration cell such as ``"2h 30m"`` to minutes; non-strings pass through."""
        if not isinstance(val, str):
            return val
        text = val.lower()
        h_match = re.search(r"(\d+)\s*h", text)
        m_match = re.search(r"(\d+)\s*m", text)
        hours = int(h_match.group(1)) if h_match else 0
        minutes = int(m_match.group(1)) if m_match else 0
        return hours * 60 + minutes

    def transform(self, X, y=None):
        """Return a cleaned copy of ``X``; the input frame itself is not modified.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw movie data. Columns other than the ones handled here are kept as-is.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            The cleaned copy, possibly with fewer rows than ``X``.
        """
        # Work on a copy so the caller's frame is left untouched.
        X_copy = X.copy()

        # 1. Clean vote_count
        if 'vote_count' in X_copy.columns:
            X_copy['vote_count'] = X_copy['vote_count'].apply(self._clean_vote)

        # 2. Clean movie_duration, then 3. drop rows whose duration is not positive
        if 'movie_duration' in X_copy.columns:
            X_copy['movie_duration'] = X_copy['movie_duration'].apply(self._clean_duration).astype(float)
            X_copy = X_copy[X_copy['movie_duration'] > 0]

        # 4. Drop rows with 'Metascore' in movie_certification
        if 'movie_certification' in X_copy.columns:
            X_copy = X_copy[~X_copy['movie_certification'].str.contains("Metascore", na=False)]

        return X_copy

In [None]:
# Separate the regression target ("ratings") from the remaining columns.
y = df["ratings"]
X = df.drop(columns="ratings")

In [None]:
# Hold out 25% of the rows as the test set. The fixed random_state makes the
# split reproducible between runs; 42 is an arbitrary choice of seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Column groups used by the preprocessing step: one list per kind of feature.
cat_features = ["genre", "movie_certification"]  # categorical columns
num_features = ["year", "vote_count", "movie_duration"]  # numeric columns

In [None]:
# Preprocessing: scale the numeric columns (so large magnitudes do not dominate)
# and one-hot encode the categorical ones.
numeric_step = ("num", StandardScaler(), num_features)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)

preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    # Columns not listed above (the name and imdb_id columns) are dropped.
    remainder="drop",
)

In [None]:
# Linear-regression pipeline: shared preprocessing followed by the regressor.
lin_model = LinearRegression(n_jobs=-1)
lin_reg = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", lin_model),
    ]
)

In [None]:
# Train the linear-regression pipeline (preprocessor + regressor) on the training split.
lin_reg.fit(X_train, y_train)

In [None]:
# Save the fitted linear-regression pipeline with pickle.
import os
import pickle

# Later cells build file names as `path + "<name>.pickle"`, so `path` must end with a
# directory separator; joining with "" appends one. Without it the file was written
# beside the models folder as "modelslin_reg.pickle" instead of inside it.
path = os.path.join(
    r"C:\Users\HP\Desktop\UI\important\Data Science\movie-success-prediction\models",
    "",
)
os.makedirs(path, exist_ok=True)  # make sure the models folder exists before writing
with open(path + "lin_reg.pickle", "wb") as to_write:
    pickle.dump(lin_reg, to_write)

# The model is saved as lin_reg.pickle in the models folder. The fit and save cells could be
# commented out once the file exists, but training is quick, so they are left active.

In [None]:
# Reload the linear-regression pipeline that was pickled above.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(path + "lin_reg.pickle", mode="rb") as model_file:
    lin_reg = pickle.load(model_file)

In [None]:
# Predict ratings for the held-out test set with the linear-regression pipeline.
# y_pred is overwritten by each later model's predictions.
y_pred = lin_reg.predict(X_test)

In [None]:
# Evaluate the linear-regression predictions: error metrics first, then R².
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

print(
    f"MAE: {mae:.3f}",
    f"MSE: {mse:.3f}",
    f"RMSE: {rmse:.3f}",
    f"R²: {r2:.3f}",
    sep="\n",
)

In [None]:
# Decision-tree regressor behind the same preprocessing step.
from sklearn.tree import DecisionTreeRegressor

tree_model = DecisionTreeRegressor(random_state=42)
tree_reg = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", tree_model),
    ]
)


In [None]:
# Train the decision-tree pipeline (preprocessor + regressor) on the training split.
tree_reg.fit(X_train, y_train)

In [None]:
# Pickle the fitted decision-tree pipeline to `path + "tree_reg.pickle"`.
with open(path + "tree_reg.pickle", mode="wb") as model_file:
    pickle.dump(tree_reg, model_file)

In [None]:
# Reload the decision-tree pipeline from its pickle file.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(path + "tree_reg.pickle", mode="rb") as model_file:
    tree_reg = pickle.load(model_file)


In [None]:
# Predict ratings for the held-out test set with the decision-tree pipeline.
# This overwrites the y_pred produced by the previous model.
y_pred = tree_reg.predict(X_test)

In [None]:
# Evaluate the decision-tree predictions: error metrics first, then R².
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

print(
    f"MAE: {mae:.3f}",
    f"MSE: {mse:.3f}",
    f"RMSE: {rmse:.3f}",
    f"R²: {r2:.3f}",
    sep="\n",
)

In [None]:
# Ensemble models: build and train a random-forest pipeline.
# GradientBoostingRegressor is imported here too and used by a later cell.
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

rf = RandomForestRegressor(random_state=42)
rf_reg = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("randomforestregressor", rf),
    ]
)
rf_reg.fit(X_train, y_train)


In [None]:
# Pickle the fitted random-forest pipeline to `path + "rf_reg.pickle"`.
with open(path + "rf_reg.pickle", mode="wb") as model_file:
    pickle.dump(rf_reg, model_file)

In [None]:
# Reload the random-forest pipeline from its pickle file.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(path + "rf_reg.pickle", mode="rb") as model_file:
    rf_reg = pickle.load(model_file)

In [None]:
# Predict ratings for the held-out test set with the random-forest pipeline.
# This overwrites the y_pred produced by the previous model.
y_pred = rf_reg.predict(X_test)

In [None]:
# Evaluate the random-forest predictions: error metrics first, then R².
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

print(
    f"MAE: {mae:.3f}",
    f"MSE: {mse:.3f}",
    f"RMSE: {rmse:.3f}",
    f"R²: {r2:.3f}",
    sep="\n",
)

In [None]:
# Gradient-boosting pipeline: same preprocessing, boosted-tree regressor; trained immediately.
gb = GradientBoostingRegressor(random_state=42)
gb_reg = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("gradientregressor", gb),
    ]
)
gb_reg.fit(X_train, y_train)

In [None]:
# Pickle the fitted gradient-boosting pipeline to `path + "gb_reg.pickle"`.
with open(path + "gb_reg.pickle", mode="wb") as model_file:
    pickle.dump(gb_reg, model_file)

In [None]:
# Reload the gradient-boosting pipeline from its pickle file.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(path + "gb_reg.pickle", mode="rb") as model_file:
    gb_reg = pickle.load(model_file)

In [None]:
# Predict ratings for the held-out test set with the gradient-boosting pipeline.
# This overwrites the y_pred produced by the previous model.
y_pred = gb_reg.predict(X_test)

In [None]:
# Evaluate the gradient-boosting predictions: error metrics first, then R².
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

print(
    f"MAE: {mae:.3f}",
    f"MSE: {mse:.3f}",
    f"RMSE: {rmse:.3f}",
    f"R²: {r2:.3f}",
    sep="\n",
)

In [None]:
# Boosting models: import AdaBoost (scikit-learn) and XGBoost (separate `xgboost` package,
# which must be installed in the kernel's environment).
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor

# AdaBoost regressor with a fixed seed; it is not yet wrapped in a pipeline or fitted
# in the visible part of the notebook.
ab = AdaBoostRegressor(random_state=42)



In [None]:
# XGBoost regressor with a fixed seed; it is not yet wrapped in a pipeline or fitted
# in the visible part of the notebook.
xgb = XGBRegressor(random_state=42)