In [None]:
# -*- coding: utf-8 -*-
# IMDB_Predictor_Colab_with_tqdm.ipynb
# Automatically generated by Colab

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
from xgboost import XGBRegressor
from tqdm import tqdm
import numpy as np

# Load the dataset. The path points into a Google Drive folder under
# /content/drive, so the CSV must be reachable there before this cell runs.
DATA_PATH = '/content/drive/MyDrive/colab_data/full_set.csv'
data = pd.read_csv(DATA_PATH)

# Display basic info about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("Basic information about the dataset:")
data.info()

# Check for missing values in each column
print("\nNull values in the dataset:")
print(data.isnull().sum())

# Count rows whose budget is exactly 0 (diagnostic only; budget is not one of
# the model features selected below).
zero_budget_count = (data['budget'] == 0).sum()
print(f"\nNumber of rows with a budget of 0: {zero_budget_count}")

# Count rows whose revenue is exactly 0 (diagnostic only, same as above).
zero_revenue_count = (data['revenue'] == 0).sum()
print(f"Number of rows with a revenue of 0: {zero_revenue_count}")

# Drop rows where target 'vote_average' is missing.
# NOTE(review): this only removes NaN targets. If unrated movies are stored as
# vote_average == 0 rather than NaN (TODO confirm, e.g. via vote_count == 0),
# those rows stay in the target and will dominate the regression error.
data = data.dropna(subset=['vote_average'])

# Reduce release_date to its year. Unparseable dates become NaT (errors='coerce'),
# which yields a NaN year that the feature dropna below removes.
data['release_year'] = pd.to_datetime(data['release_date'], errors='coerce').dt.year

# Feature groups: the ColumnTransformer below scales the numerical columns and
# one-hot encodes the categorical ones.
numerical_features = ['runtime', 'release_year']
categorical_features = ['genres', 'production_companies', 'production_countries',
                        'original_language', 'directors', 'main_characters', 'keywords']

# The full feature list is derived from the two groups so the three lists
# cannot drift apart (numerical columns first, then categorical).
features = numerical_features + categorical_features

# Handle missing values (drop rows with any missing feature)
data = data.dropna(subset=features)

# data = data[:30000]  # uncomment to iterate on a smaller subset

# Prepare features and target
X = data[features]
y = data['vote_average']

# Preprocessing shared by every model pipeline in this notebook.
# handle_unknown='ignore' makes categories that appear only in the test split
# encode as an all-zero row instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Split the dataset into training (80%) and test (20%) sets; the fixed
# random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



Basic information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 529510 entries, 0 to 529509
Data columns (total 25 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    529510 non-null  int64  
 1   title                 529510 non-null  object 
 2   vote_average          529510 non-null  float64
 3   vote_count            529510 non-null  int64  
 4   status                529510 non-null  object 
 5   release_date          529510 non-null  object 
 6   revenue               529510 non-null  int64  
 7   runtime               529510 non-null  int64  
 8   adult                 529510 non-null  bool   
 9   budget                529510 non-null  int64  
 10  imdb_id               529510 non-null  object 
 11  original_language     529510 non-null  object 
 12  original_title        529510 non-null  object 
 13  overview              529259 non-null  object 
 14  popularity     

In [None]:

# # XGBoost Model
# print("\nTraining XGBoost...")
# XGBoost_model_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('model', XGBRegressor(n_estimators=100, random_state=42))
# ])

# for _ in tqdm(range(1), desc="Training XGBoost"):
#     XGBoost_model_pipeline.fit(X_train, y_train)

# y_pred_xgb = XGBoost_model_pipeline.predict(X_test)
# print(f'XGBoost Mean Squared Error: {mean_squared_error(y_test, y_pred_xgb)}')
# joblib.dump(XGBoost_model_pipeline, '/content/imdb_rating_XGBoost_model.pkl')



Training XGBoost...


Training XGBoost: 100%|██████████| 1/1 [02:50<00:00, 170.72s/it]


XGBoost Mean Squared Error: 7.012172458761115


['/content/imdb_rating_XGBoost_model.pkl']

In [None]:
# Metrics needed below that the notebook's first import cell does not provide
# (it only imports mean_squared_error and accuracy_score). r2_score has to be
# imported here so this cell does not fail with NameError on a fresh kernel.
from sklearn.metrics import mean_absolute_error, r2_score

# Train the XGBoost model: shared preprocessor followed by a seeded regressor.
print("\nTraining XGBoost...")
XGBoost_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(n_estimators=100, random_state=42))
])

# Single-iteration tqdm loop: it only reports the total wall-clock time of fit().
for _ in tqdm(range(1), desc="Training XGBoost"):
    XGBoost_model_pipeline.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred_xgb = XGBoost_model_pipeline.predict(X_test)

# Evaluation metrics (RMSE is derived from MSE so the two stay consistent)
mse = mean_squared_error(y_test, y_pred_xgb)
mae = mean_absolute_error(y_test, y_pred_xgb)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_xgb)

# Print evaluation results
print(f'XGBoost Mean Squared Error: {mse:.2f}')
print(f'XGBoost Mean Absolute Error: {mae:.2f}')
print(f'XGBoost Root Mean Squared Error: {rmse:.2f}')
print(f'XGBoost R-squared: {r2:.2f}')

# Save the fitted pipeline (preprocessor + model) so it can be reloaded and
# applied to raw feature rows directly.
joblib.dump(XGBoost_model_pipeline, '/content/imdb_rating_XGBoost_model.pkl')


Training XGBoost...


Training XGBoost: 100%|██████████| 1/1 [02:49<00:00, 169.23s/it]


XGBoost Mean Squared Error: 7.01
XGBoost Mean Absolute Error: 2.13
XGBoost Root Mean Squared Error: 2.65
XGBoost R-squared: 0.30


In [None]:

# # Linear Regression Model
# print("\nTraining Linear Regression...")
# LinearRegression_model_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('model', LinearRegression())
# ])

# for _ in tqdm(range(1), desc="Training Linear Regression"):
#     LinearRegression_model_pipeline.fit(X_train, y_train)

# y_pred_lr = LinearRegression_model_pipeline.predict(X_test)
# print(f'Linear Regression Mean Squared Error: {mean_squared_error(y_test, y_pred_lr)}')
# joblib.dump(LinearRegression_model_pipeline, '/content/imdb_rating_LinearRegression_model.pkl')



Training Linear Regression...


Training Linear Regression: 100%|██████████| 1/1 [04:22<00:00, 262.93s/it]


Linear Regression Mean Squared Error: 9.959100500118387


['/content/imdb_rating_LinearRegression_model.pkl']

In [None]:
# mean_absolute_error and r2_score are not part of the notebook's first import
# cell; importing them here keeps this cell runnable on its own after the
# data-preparation cell, instead of depending on other model cells having run.
from sklearn.metrics import mean_absolute_error, r2_score

# Train the Linear Regression model on the shared preprocessor output.
print("\nTraining Linear Regression...")
LinearRegression_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# Single-iteration tqdm loop: it only reports the total wall-clock time of fit().
for _ in tqdm(range(1), desc="Training Linear Regression"):
    LinearRegression_model_pipeline.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred_lr = LinearRegression_model_pipeline.predict(X_test)

# Evaluation metrics (RMSE is derived from MSE so the two stay consistent)
mse_lr = mean_squared_error(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Print evaluation results
print(f'Linear Regression Mean Squared Error: {mse_lr:.2f}')
print(f'Linear Regression Mean Absolute Error: {mae_lr:.2f}')
print(f'Linear Regression Root Mean Squared Error: {rmse_lr:.2f}')
print(f'Linear Regression R-squared: {r2_lr:.2f}')

# Save the fitted pipeline (preprocessor + model)
joblib.dump(LinearRegression_model_pipeline, '/content/imdb_rating_LinearRegression_model.pkl')


Training Linear Regression...


Training Linear Regression: 100%|██████████| 1/1 [04:12<00:00, 252.21s/it]


Linear Regression Mean Squared Error: 9.96
Linear Regression Mean Absolute Error: 2.45
Linear Regression Root Mean Squared Error: 3.16
Linear Regression R-squared: 0.01


['/content/imdb_rating_LinearRegression_model.pkl']

In [None]:

# # Logistic Regression Model (Classification Approach)
# # Binarize the target (consider ratings above 7 as 'high')
# print("\nTraining Logistic Regression for Classification...")
# y_train_binary = (y_train > 7).astype(int)
# y_test_binary = (y_test > 7).astype(int)

# LogisticRegression_model_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('model', LogisticRegression(max_iter=1000, random_state=42))
# ])

# for _ in tqdm(range(1), desc="Training Logistic Regression"):
#     LogisticRegression_model_pipeline.fit(X_train, y_train_binary)

# y_pred_logistic = LogisticRegression_model_pipeline.predict(X_test)
# logistic_accuracy = accuracy_score(y_test_binary, y_pred_logistic)
# print(f'Logistic Regression Accuracy: {logistic_accuracy}')
# joblib.dump(LogisticRegression_model_pipeline, '/content/imdb_rating_LogisticRegression_model.pkl')


Training Logistic Regression for Classification...


Training Logistic Regression: 100%|██████████| 1/1 [01:11<00:00, 71.34s/it]


Logistic Regression Accuracy: 0.9060293421675343

All models trained and saved successfully!


In [None]:
# Classification metrics used below. None of them is imported by the
# notebook's first import cell (it only provides accuracy_score), so without
# this line the cell raises NameError on a fresh kernel.
from sklearn.metrics import precision_score, recall_score, f1_score

# Train the Logistic Regression model for classification.
# The continuous rating is binarised: 1 when vote_average is strictly above 7,
# otherwise 0.
print("\nTraining Logistic Regression for Classification...")
y_train_binary = (y_train > 7).astype(int)
y_test_binary = (y_test > 7).astype(int)

LogisticRegression_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(max_iter=1000, random_state=42))
])

# Single-iteration tqdm loop: it only reports the total wall-clock time of fit().
for _ in tqdm(range(1), desc="Training Logistic Regression"):
    LogisticRegression_model_pipeline.fit(X_train, y_train_binary)

# Make predictions on the held-out test set
y_pred_logistic = LogisticRegression_model_pipeline.predict(X_test)

# Evaluation metrics. Precision, recall and F1 are reported for the positive
# class (label 1, i.e. rating > 7), which is scikit-learn's default for binary
# targets; accuracy alone is not informative if that class is rare.
logistic_accuracy = accuracy_score(y_test_binary, y_pred_logistic)
logistic_precision = precision_score(y_test_binary, y_pred_logistic)
logistic_recall = recall_score(y_test_binary, y_pred_logistic)
logistic_f1 = f1_score(y_test_binary, y_pred_logistic)

# Print evaluation results
print(f'Logistic Regression Accuracy: {logistic_accuracy:.2f}')
print(f'Logistic Regression Precision: {logistic_precision:.2f}')
print(f'Logistic Regression Recall: {logistic_recall:.2f}')
print(f'Logistic Regression F1 Score: {logistic_f1:.2f}')

# Save the fitted pipeline (preprocessor + model)
joblib.dump(LogisticRegression_model_pipeline, '/content/imdb_rating_LogisticRegression_model.pkl')


Training Logistic Regression for Classification...


Training Logistic Regression: 100%|██████████| 1/1 [01:11<00:00, 71.11s/it]


Logistic Regression Accuracy: 0.91
Logistic Regression Precision: 0.63
Logistic Regression Recall: 0.05
Logistic Regression F1 Score: 0.10


['/content/imdb_rating_LogisticRegression_model.pkl']

In [None]:

# # RandomForestRegressor Model
# print("\nTraining RandomForestRegressor...")
# RandomForestRegressor_model_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('model', RandomForestRegressor(n_estimators=100, random_state=42))
# ])

# for _ in tqdm(range(1), desc="Training RandomForestRegressor"):
#     RandomForestRegressor_model_pipeline.fit(X_train, y_train)

# y_pred_rf = RandomForestRegressor_model_pipeline.predict(X_test)
# print(f'RandomForestRegressor Mean Squared Error: {mean_squared_error(y_test, y_pred_rf)}')
# joblib.dump(RandomForestRegressor_model_pipeline, '/content/imdb_rating_RandomForestRegressor_model.pkl')



Training RandomForestRegressor...


Training RandomForestRegressor:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Train the RandomForestRegressor model on the shared preprocessor output.
# n_jobs=-1 builds the trees on all available cores. The forest configuration
# (100 trees, random_state=42) is unchanged; only wall-clock time is affected.
print("\nTraining RandomForestRegressor...")
RandomForestRegressor_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])

# Single-iteration tqdm loop: it only reports the total wall-clock time of fit().
for _ in tqdm(range(1), desc="Training RandomForestRegressor"):
    RandomForestRegressor_model_pipeline.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred_rf = RandomForestRegressor_model_pipeline.predict(X_test)

# Evaluation metrics (RMSE is derived from MSE so the two stay consistent)
mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Print evaluation results
print(f'RandomForestRegressor Mean Squared Error: {mse_rf:.2f}')
print(f'RandomForestRegressor Mean Absolute Error: {mae_rf:.2f}')
print(f'RandomForestRegressor Root Mean Squared Error: {rmse_rf:.2f}')
print(f'RandomForestRegressor R-squared: {r2_rf:.2f}')

# Save the fitted pipeline (preprocessor + model)
joblib.dump(RandomForestRegressor_model_pipeline, '/content/imdb_rating_RandomForestRegressor_model.pkl')

In [None]:
# Imports for this cell. MLPRegressor and the extra metrics are not provided
# by the notebook's first import cell; the remaining lines re-import names that
# cell already provides and are kept so the import list is explicit here.
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import joblib
import numpy as np
from tqdm import tqdm

# This cell relies on `preprocessor`, `X_train`, `X_test`, `y_train` and
# `y_test` created by the data-preparation cell at the top of the notebook.

# Define the MLP Regressor pipeline
print("\nTraining MLP Regressor...")
MLPRegressor_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # shared scaling / one-hot encoding step
    ('model', MLPRegressor(
        hidden_layer_sizes=(100, 50),  # Two hidden layers: 100 and 50 neurons
        activation='relu',             # Activation function for hidden layers
        # Adam adapts its step sizes itself. The former learning_rate='adaptive'
        # argument was removed because scikit-learn only honours that schedule
        # when solver='sgd', so it had no effect on this model.
        solver='adam',
        max_iter=500,                  # Upper bound on training epochs
        random_state=42
    ))
])

# Single-iteration tqdm loop: it only reports the total wall-clock time of fit().
for _ in tqdm(range(1), desc="Training MLP Regressor"):
    MLPRegressor_model_pipeline.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred_mlp = MLPRegressor_model_pipeline.predict(X_test)

# Evaluation metrics (RMSE is derived from MSE so the two stay consistent)
mse_mlp = mean_squared_error(y_test, y_pred_mlp)
mae_mlp = mean_absolute_error(y_test, y_pred_mlp)
rmse_mlp = np.sqrt(mse_mlp)
r2_mlp = r2_score(y_test, y_pred_mlp)

# Print evaluation results
print(f'MLP Regressor Mean Squared Error: {mse_mlp:.2f}')
print(f'MLP Regressor Mean Absolute Error: {mae_mlp:.2f}')
print(f'MLP Regressor Root Mean Squared Error: {rmse_mlp:.2f}')
print(f'MLP Regressor R-squared: {r2_mlp:.2f}')

# Save the fitted pipeline (preprocessor + model)
joblib.dump(MLPRegressor_model_pipeline, '/content/imdb_rating_MLPRegressor_model.pkl')

print("\nMLP Regressor model trained and saved successfully!")



Training MLP Regressor...


Training MLP Regressor:   0%|          | 0/1 [00:00<?, ?it/s]