<h1 style="color: brown;">1. Importing Libraries and Configuring Display Options</h1>

In [1]:
# Data manipulation and visualization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Machine Learning Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor

# Evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import time
from sklearn.model_selection import cross_validate
from itertools import product

## Customizing DataFrame Output Formatting
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.3f}'.format)

<h1 style="color: brown;">2. Exploring Dataset</h1>

<h2 style="color: purple;">2.1 Loading TMDB IMDB Movies Dataset</h2>

In [2]:
df = pd.read_csv("TMDB  IMDB Movies Dataset.csv")

<h2 style="color: purple;">2.2 Displaying Dataset Dimensions</h2> 

In [3]:
print(f'The data has {df.shape[0]} rows and {df.shape[1]} columns.')

The data has 432619 rows and 28 columns.


<h2 style="color: purple;">2.3 Displaying All Column Names</h2>

In [4]:
# Get all column names from df
all_column_names = df.columns

# Display columns with numbered list
print("All Columns:")
for idx, col in enumerate(all_column_names, start=1):
    print(f"{idx}. {col}")


All Columns:
1. id
2. title
3. vote_average
4. vote_count
5. status
6. release_date
7. revenue
8. runtime
9. adult
10. backdrop_path
11. budget
12. homepage
13. tconst
14. original_language
15. original_title
16. overview
17. popularity
18. poster_path
19. tagline
20. genres
21. production_companies
22. production_countries
23. spoken_languages
24. keywords
25. directors
26. writers
27. averageRating
28. numVotes


<h2 style="color: purple;">2.4 Identify Numeric and Non-Numeric Features</h2>  

In [5]:
numeric_column_names = df.select_dtypes(include='number').columns
non_numeric_column_names = df.select_dtypes(exclude='number').columns

# Organize the columns in a dictionary for clear display
columns_dict = {
    "Numeric Columns": list(numeric_column_names),
    "Non-Numeric Columns": list(non_numeric_column_names)
}

# Print dictionary in a readable format with numbered lists and unique value counts
for category, columns in columns_dict.items():
    print(f"{category}:")
    for idx, col in enumerate(columns, start=1):
        unique_values_count = df[col].nunique()  # Count unique values
        print(f"  {idx}. {col} (Unique Values: {unique_values_count})")
    print()  # Add an empty line between categories


Numeric Columns:
  1. id (Unique Values: 432508)
  2. vote_average (Unique Values: 4996)
  3. vote_count (Unique Values: 3598)
  4. revenue (Unique Values: 13546)
  5. runtime (Unique Values: 553)
  6. budget (Unique Values: 4016)
  7. popularity (Unique Values: 19439)
  8. averageRating (Unique Values: 91)
  9. numVotes (Unique Values: 19355)

Non-Numeric Columns:
  1. title (Unique Values: 373607)
  2. status (Unique Values: 6)
  3. release_date (Unique Values: 39522)
  4. adult (Unique Values: 2)
  5. backdrop_path (Unique Values: 183350)
  6. homepage (Unique Values: 52857)
  7. tconst (Unique Values: 431756)
  8. original_language (Unique Values: 158)
  9. original_title (Unique Values: 392206)
  10. overview (Unique Values: 389218)
  11. poster_path (Unique Values: 357673)
  12. tagline (Unique Values: 89517)
  13. genres (Unique Values: 10495)
  14. production_companies (Unique Values: 135977)
  15. production_countries (Unique Values: 8297)
  16. spoken_languages (Unique Values

<h2 style="color: purple;">2.5 Displaying First 5 Rows of the Dataset</h2>  

In [6]:
pd.set_option('display.max_columns', None)
df.head(5)

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,budget,homepage,tconst,original_language,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords,directors,writers,averageRating,numVotes
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,160000000,https://www.warnerbros.com/movies/inception,tt1375666,en,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc...",Christopher Nolan,Christopher Nolan,8.8,2616043
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,165000000,http://www.interstellarmovie.net/,tt0816692,en,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,...",Christopher Nolan,"Jonathan Nolan, Christopher Nolan",8.7,2202667
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,185000000,https://www.warnerbros.com/movies/dark-knight/,tt0468569,en,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f...",Christopher Nolan,"Jonathan Nolan, Christopher Nolan, David S. Go...",9.0,2948172
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,237000000,https://www.avatar.com/movies/avatar,tt0499549,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ...",James Cameron,James Cameron,7.9,1409493
4,24428,The Avengers,7.71,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,220000000,https://www.marvel.com/movies/the-avengers,tt0848228,en,The Avengers,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com...",Joss Whedon,"Joss Whedon, Zak Penn",8.0,1484289


<h2 style="color: purple;">2.6 Checking for Missing Data</h2>  

In [7]:
print(f'Total number of missing values: {df.isnull().sum().sum()}')

Total number of missing values: 1900843


In [8]:
# Calculate the number of missing values per column
missing_values = df.isnull().sum()

# Calculate the percentage of missing values per column
missing_percentage = (missing_values / len(df)) * 100

# Create a DataFrame to display the results
missing_info = pd.DataFrame({
    'Missing Values': missing_values,
    'Missing Percentage': missing_percentage
})

# Sort the DataFrame by Missing Percentage in descending order
missing_info = missing_info.sort_values(by='Missing Percentage', ascending=False)

# Display the missing information
print(missing_info)

                      Missing Values  Missing Percentage
homepage                      378487              87.487
tagline                       341651              78.973
keywords                      261865              60.530
backdrop_path                 248986              57.553
production_companies          170338              39.374
production_countries          110914              25.638
spoken_languages              100988              23.343
genres                         77010              17.801
poster_path                    74110              17.131
writers                        67145              15.521
overview                       41142               9.510
release_date                   17830               4.121
directors                      10377               2.399
id                                 0               0.000
popularity                         0               0.000
averageRating                      0               0.000
original_title                 

<h2 style="color: purple;">2.7 Checking for Duplicate Data</h2> 

In [9]:
print(f'There are {df.duplicated().sum()} duplicate rows in the dataset.')

There are 55 duplicate rows in the dataset.


<h2 style="color: purple;">2.8 Summary Statistics for Numeric Columns</h2> 

In [10]:
pd.set_option('display.float_format', '{:.3f}'.format)
numeric_columns = df.select_dtypes(include='number')
numeric_columns.describe()

Unnamed: 0,id,vote_average,vote_count,revenue,runtime,budget,popularity,averageRating,numVotes
count,432619.0,432619.0,432619.0,432619.0,432619.0,432619.0,432619.0,432619.0,432619.0
mean,478565.193,3.622,49.252,1647778.203,66.486,640532.763,2.214,6.248,2813.755
std,343786.121,3.142,514.82,25974703.882,63.23,7122521.227,11.387,1.316,31680.738
min,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0
25%,207275.5,0.0,0.0,0.0,15.0,0.0,0.6,5.5,20.0
50%,413597.0,4.673,1.0,0.0,80.0,0.0,0.865,6.3,59.0
75%,691021.5,6.2,6.0,0.0,96.0,0.0,1.672,7.1,262.0
max,1392028.0,10.0,34495.0,2923706026.0,14400.0,888000000.0,2994.357,10.0,2966619.0


A vote average is calculated only when there is at least one vote. A vote_average of 0 is usually invalid. 
a runtime of 0 is likely invalid or indicates missing data 
a budget of 0 is unrealistic for movies and may indicate missing or erroneous data. 

 a movie can have a vote count of 0 if It has not been rated or reviewed by users yet (e.g., a newly released or obscure movie). 

it is technically possible for revenue to be 0 in certain scenarios. For example, Some movies, especially those with limited releases, may not have generated any revenue (e.g., no box office earnings).

popularity can be 0 in some datasets, depending on how it is calculated (e.g., based on viewership, search trends, or interactions). Movies with 0 popularity may be very obscure or newly added to the dataset 



<h2 style="color: purple;">2.9 Summary Statistics for Categorical Columns</h2> 

In [11]:
non_numeric_columns = df.select_dtypes(exclude='number')
print("Non-Numeric Columns Summary:")
non_numeric_columns.describe(include='all')

Non-Numeric Columns Summary:


Unnamed: 0,title,status,release_date,adult,backdrop_path,homepage,tconst,original_language,original_title,overview,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords,directors,writers
count,432619,432619,414789,432619,183633,54132,432619,432619,432619,391477,358509,90968,355609,262281,321705,331631,170754,422242,365474
unique,373607,6,39522,2,183350,52857,431756,158,392206,389218,357673,89517,10495,135977,8297,5938,113910,160331,260709
top,Home,Released,2010-01-01,False,/r3RDMpTou68u0dBuhKHcBz2wTel.jpg,http://www.wwe.com,tt32094375,en,A Christmas Carol,Mexican feature film,/cWjdh8VTiizYfQp5m6fJi4PDy8w.jpg,Based on a true story,Drama,Metro-Goldwyn-Mayer,United States of America,English,woman director,Kevin Dunn,William Shakespeare
freq,55,429686,978,419971,24,22,59,232877,38,127,48,23,57119,1975,98615,129080,6710,529,222


<h1 style="color: brown;">3. Preprocessing </h1> 

<h2 style="color: purple;">3.1 Dropping Unnecessary Columns</h2>

In [12]:
df = df.drop(columns= ['id','title','status','release_date','backdrop_path','homepage','tconst','original_language','original_title','overview','poster_path','tagline','production_companies','production_countries','spoken_languages','keywords','directors','writers'])

<h2 style="color: purple;">3.2 Dropping Duplicate Values</h2> 

In [13]:
df = df.drop_duplicates()

<h2 style="color: purple;">3.3 Dropping Null Values</h2> 

In [14]:
df = df.dropna()

<h2 style="color: purple;">3.4 Removing Rows with Zero Values</h2>  

In [15]:
df.drop(
    df[(df.runtime == 0) | (df.budget == 0) | (df.vote_average == 0)].index,
    inplace=True
)

<h2 style="color: purple;">3.5 Converting Genre Strings to One-Hot Encoded Columns</h2>   

In [16]:
def convertList(data):
    return data.split(',')

df['genres'] = df['genres'].apply(convertList)

all_genres = set(genre for genrelist in df['genres'] for genre in genrelist)

for genre in all_genres:
    df[genre] = df['genres'].apply(lambda x: 1 if genre in x else 0)

df = df.drop('genres', axis=1)

<h2 style="color: purple;">3.6 Applying Label Encoding to 'Adult' Column</h2>

In [17]:
label_encoder = LabelEncoder()

# Fit and transform the Gender column
df['adult'] = label_encoder.fit_transform(df['adult'])

# Display the mapping of labels to values
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Encoding Mapping:", label_mapping)

Label Encoding Mapping: {False: 0, True: 1}


<h1 style="color: brown;">4. Development and Assessment of Models</h1> 

<h2 style="color: purple;">4.1 Splitting Features and Target Variable</h2>


In [18]:
X = df.drop(columns=['averageRating'])
y = df['averageRating']

<h2 style="color: purple;">4.2 Splitting Data into Training and Test Sets</h2> 

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

<h2 style="color: purple;">4.3 Scaling Features using StandardScaler</h2>

In [20]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

<h1 style="color: brown;">5. Evaluating Models and Measuring Performance</h1>  

<h2 style="color: purple;">5.1 Finding the Best Model</h2> 

In [21]:
model_list = [
    LinearRegression(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
    AdaBoostRegressor(random_state=42),
    SVR(),
    GaussianProcessRegressor(),
    KNeighborsRegressor(),
]

results = []

for model in model_list:
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_test_scaled)
    r2 = r2_score(y_test, preds)
    mse = mean_squared_error(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    rmse = np.sqrt(mse)
    results.append((model.__class__.__name__, r2, mae, mse, rmse))

# Display results
import pandas as pd
df_results = pd.DataFrame(results, columns=["Model", "R2", "MAE", "MSE", "RMSE"])
df_results = df_results.sort_values(by="R2", ascending=False)
print(df_results)

                       Model         R2    MAE       MSE    RMSE
2      RandomForestRegressor      0.696  0.512     0.576   0.759
3  GradientBoostingRegressor      0.678  0.540     0.609   0.780
5                        SVR      0.583  0.606     0.788   0.888
0           LinearRegression      0.512  0.698     0.924   0.961
4          AdaBoostRegressor      0.511  0.764     0.925   0.962
7        KNeighborsRegressor      0.482  0.723     0.979   0.990
1      DecisionTreeRegressor      0.397  0.722     1.140   1.068
6   GaussianProcessRegressor -23934.986 34.694 45290.826 212.816


In [22]:
best_model_name = df_results.iloc[0]["Model"]
print(f"Best model based on R2: {best_model_name}")

Best model based on R2: RandomForestRegressor


<h2 style="color: purple;">5.2 Manual Grid Search for Best Hyperparameters</h2>  

In [23]:
X_subtrain, X_val, y_subtrain, y_val = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

# Updated parameter grid — include default values explicitly
param_grid = {
    'n_estimators': [100, 200, 300],              # default: 100
    'max_depth': [None, 10, 20],                  # default: None
    'min_samples_split': [2, 5],                  # default: 2
    'max_features': ['sqrt', 'log2'],             # default: 'sqrt' for regression
    'min_samples_leaf': [1, 2]                    # default: 1
}

best_score = -np.inf
best_params = None

# Generate all combinations of parameters
for combo in product(*param_grid.values()):
    params = dict(zip(param_grid.keys(), combo))
    
    model = RandomForestRegressor(**params, random_state=42)
    model.fit(X_subtrain, y_subtrain)
    preds = model.predict(X_val)
    score = r2_score(y_val, preds)
    
    print(f"Params: {params} => R2: {score:.4f}")
    
    if score > best_score:
        best_score = score
        best_params = params

print("\n✅ Best Parameters from Grid Search:")
print(best_params)
print(f"Best R2 Score on Validation Set: {best_score:.4f}")

Params: {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'max_features': 'sqrt', 'min_samples_leaf': 1} => R2: 0.6805
Params: {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'max_features': 'sqrt', 'min_samples_leaf': 2} => R2: 0.6758
Params: {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'max_features': 'log2', 'min_samples_leaf': 1} => R2: 0.6745
Params: {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'max_features': 'log2', 'min_samples_leaf': 2} => R2: 0.6647
Params: {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1} => R2: 0.6775
Params: {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 5, 'max_features': 'sqrt', 'min_samples_leaf': 2} => R2: 0.6758
Params: {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 5, 'max_features': 'log2', 'min_samples_leaf': 1} => R2: 0.6726
Params: {'n_estimators': 100, 'max_depth': None, 'min_samples_

<h2 style="color: purple;">5.3 Cross-Validation Evaluation</h2>   

In [24]:
# Best parameters you found earlier
best_params = {
    'n_estimators': 300,
    'max_depth': None,
    'min_samples_split': 2,
    'max_features': 'sqrt',
    'min_samples_leaf': 1
}

# Initialize final model
final_model = RandomForestRegressor(**best_params, random_state=42)

# Define scoring metrics
scoring = {
    'r2': 'r2',
    'neg_mae': 'neg_mean_absolute_error',
    'neg_mse': 'neg_mean_squared_error'
}

# Perform cross-validation
cv_results = cross_validate(final_model, X_train_scaled, y_train, cv=5, scoring=scoring)

# Extract scores
r2_scores = cv_results['test_r2']
mae_scores = -cv_results['test_neg_mae']
mse_scores = -cv_results['test_neg_mse']
rmse_scores = np.sqrt(mse_scores)

# Print individual and mean scores
print("CV R² scores:", r2_scores)
print("Mean CV R²:", np.mean(r2_scores))

print("CV MAE scores:", mae_scores)
print("Mean CV MAE:", np.mean(mae_scores))

print("CV MSE scores:", mse_scores)
print("Mean CV MSE:", np.mean(mse_scores))

print("CV RMSE scores:", rmse_scores)
print("Mean CV RMSE:", np.mean(rmse_scores))


CV R² scores: [0.70742152 0.69652313 0.70162662 0.68770497 0.70204668]
Mean CV R²: 0.699064583648518
CV MAE scores: [0.51250367 0.53616261 0.5209182  0.53676825 0.52530899]
Mean CV MAE: 0.5263323458247592
CV MSE scores: [0.54602764 0.598728   0.57078317 0.6216342  0.57465688]
Mean CV MSE: 0.5823659788822557
CV RMSE scores: [0.73893683 0.77377516 0.75550193 0.78843782 0.75806127]
Mean CV RMSE: 0.7629426027998665


<h2 style="color: purple;">5.4 Final Model Training and Test Set Evaluation</h2>   

In [25]:
# Retrain on the full training data
final_model.fit(X_train_scaled, y_train)
# Predict on the test data
y_test_preds = final_model.predict(X_test_scaled)
def regression_report(y_true, y_pred):
    print("Regression Report:")
    print(f"MAE:  {mean_absolute_error(y_true, y_pred):.4f}")
    print(f"MSE:  {mean_squared_error(y_true, y_pred):.4f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_true, y_pred)):.4f}")
    print(f"R²:   {r2_score(y_true, y_pred):.4f}")
    
# Usage
regression_report(y_test, y_test_preds)

Regression Report:
MAE:  0.5284
MSE:  0.5805
RMSE: 0.7619
R²:   0.6932
