In [None]:
!pip install pandas joblib
!pip install scikit-learn==1.7.2

Collecting scikit-learn==1.7.2
  Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m75.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
Successfully installed scikit-learn-1.7.2


In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import joblib
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# --- 1. Load Data (Upload movie_dataset.csv to Colab) ---
try:
    df = pd.read_csv('movie_dataset.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: 'movie_dataset.csv' not found. Please upload the file to your Colab session.")
    # Re-raise instead of calling exit(): inside an IPython/Colab kernel, exit()
    # asks the kernel itself to shut down, discarding all notebook state.
    # Re-raising stops this cell (and "Run all") while keeping the session alive.
    raise

# --- 2. Data Cleaning and Preprocessing ---
# Values that cannot be parsed as numbers become NaN and are dropped below.
df['budget'] = pd.to_numeric(df['budget'], errors='coerce')
df['box_office'] = pd.to_numeric(df['box_office'], errors='coerce')
df = df.dropna(subset=['budget', 'box_office'])
# Keep only rows with a positive budget; .copy() gives an independent frame so the
# column assignments below do not write into a filtered selection.
df = df[df['budget'] > 0].copy()

# Binary target: 1 when box office exceeds budget, otherwise 0.
df['success'] = (df['box_office'] > df['budget']).astype(int)

def convert_runtime_to_minutes(runtime_str):
    """Convert a runtime label such as ``'2h 22m'`` into a total number of minutes.

    Args:
        runtime_str: Runtime text containing an hours part (``<digits>h``), a
            minutes part (``<digits>m``), or both. Missing values and the
            placeholder ``'Not Available'`` are accepted.

    Returns:
        int: ``hours * 60 + minutes`` when at least one part is found.
        float: ``np.nan`` when the value is missing, is ``'Not Available'``, or
        contains neither an hours nor a minutes part.
    """
    import re  # local import: keeps this helper usable without touching the import cell

    if pd.isna(runtime_str) or runtime_str == 'Not Available':
        return np.nan

    text = str(runtime_str)
    hours = re.search(r'(\d+)\s*h', text, re.IGNORECASE)
    minutes = re.search(r'(\d+)\s*m', text, re.IGNORECASE)

    # Nothing recognisable: report "unknown" rather than a misleading 0 minutes,
    # so the row is dropped with the other NaNs instead of training on a bogus value.
    if hours is None and minutes is None:
        return np.nan

    total_minutes = 0
    if hours is not None:
        total_minutes += int(hours.group(1)) * 60
    if minutes is not None:
        total_minutes += int(minutes.group(1))
    return total_minutes

df['run_time_minutes'] = df['run_time'].apply(convert_runtime_to_minutes)

# --- 3. Feature Selection and Splitting Data ---
features = ['year', 'rating', 'genre', 'run_time_minutes', 'budget']
# Build X as a new NaN-free frame instead of calling dropna(inplace=True) on a
# selection of df (the pattern that triggers SettingWithCopyWarning).
X = df[features].dropna()
# Select the target by the surviving row labels so X and y stay aligned.
y = df.loc[X.index, 'success']

# stratify=y keeps the class ratio the same in both splits; the target is
# imbalanced, so an unstratified split can leave very few minority-class rows
# in the test set and make the reported metrics noisy.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- 4. Building the Model Pipeline ---
categorical_features = ['rating', 'genre']
numerical_features = ['year', 'run_time_minutes', 'budget']

# Numeric columns are passed through unchanged; categorical columns are one-hot
# encoded, and categories unseen during fit are ignored at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Preprocessing and estimator live in one Pipeline so the saved artifact accepts
# raw feature columns directly.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# --- 5. Train and Evaluate the Model ---
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nModel Performance:")
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# --- 6. Save the Trained Model ---
model_filename = 'movie_predictor.joblib'
joblib.dump(model, model_filename)

print(f"\nModel saved to '{model_filename}'. Download this file for your backend.")

Dataset loaded successfully.

Model Performance:
Accuracy: 0.85

Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.57      0.57         7
           1       0.91      0.91      0.91        34

    accuracy                           0.85        41
   macro avg       0.74      0.74      0.74        41
weighted avg       0.85      0.85      0.85        41


Model saved to 'movie_predictor.joblib'. Download this file for your backend.


In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor  # Changed to Regressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score  # New evaluation metrics
import joblib
import numpy as np
import warnings
warnings.filterwarnings('ignore')

def convert_runtime_to_minutes(runtime_str):
    """Convert a runtime label such as ``'2h 22m'`` into a total number of minutes.

    Args:
        runtime_str: Runtime text containing an hours part (``<digits>h``), a
            minutes part (``<digits>m``), or both. Missing values and the
            placeholder ``'Not Available'`` are accepted.

    Returns:
        int: ``hours * 60 + minutes`` when at least one part is found.
        float: ``np.nan`` when the value is missing, is ``'Not Available'``, or
        contains neither an hours nor a minutes part.
    """
    import re  # local import: keeps this helper usable without touching the import lines

    if pd.isna(runtime_str) or runtime_str == 'Not Available':
        return np.nan

    text = str(runtime_str)
    hours = re.search(r'(\d+)\s*h', text, re.IGNORECASE)
    minutes = re.search(r'(\d+)\s*m', text, re.IGNORECASE)

    # Nothing recognisable: report "unknown" rather than a misleading 0 minutes.
    if hours is None and minutes is None:
        return np.nan

    total_minutes = 0
    if hours is not None:
        total_minutes += int(hours.group(1)) * 60
    if minutes is not None:
        total_minutes += int(minutes.group(1))
    return total_minutes


# --- 1. Load Data (Upload movie_dataset.csv to Colab) ---
try:
    df = pd.read_csv('movie_dataset.csv')
    print("Dataset loaded successfully.")

    # --- 2. Data Cleaning and Preprocessing ---
    # Values that cannot be parsed as numbers become NaN and are dropped below.
    df['budget'] = pd.to_numeric(df['budget'], errors='coerce')
    df['box_office'] = pd.to_numeric(df['box_office'], errors='coerce')
    df = df.dropna(subset=['budget', 'box_office'])
    df = df[df['budget'] > 0].copy()

    # The regression target is 'box_office' itself, so no binary label is derived here.
    df['run_time_minutes'] = df['run_time'].apply(convert_runtime_to_minutes)

    # --- 3. Feature Selection and Splitting Data ---
    features = ['year', 'rating', 'genre', 'run_time_minutes', 'budget']
    # Build X as a new NaN-free frame instead of calling dropna(inplace=True) on a
    # selection of df (the pattern that triggers SettingWithCopyWarning).
    X = df[features].dropna()
    # Select the target by the surviving row labels so X and y stay aligned.
    y = df.loc[X.index, 'box_office']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # --- 4. Building the Model Pipeline ---
    categorical_features = ['rating', 'genre']
    numerical_features = ['year', 'run_time_minutes', 'budget']

    # Numeric columns pass through unchanged; categorical columns are one-hot
    # encoded, and categories unseen during fit are ignored at predict time.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
    ])

    # --- 5. Train and Evaluate the Model ---
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("\nModel Performance:")
    print(f"Mean Squared Error: {mse:,.2f}")
    print(f"R-squared: {r2:.2f}")

    # --- 6. Save the Trained Model ---
    model_filename = 'movie_revenue_predictor.joblib'
    joblib.dump(model, model_filename)

    print(f"\nModel saved to '{model_filename}'. Download this file for your backend.")

except FileNotFoundError:
    print("Error: 'movie_dataset.csv' not found. Please upload the file to your Colab session.")
except Exception as e:
    # Best-effort reporting kept from the original cell: only the message is shown,
    # not the traceback.
    print(f"An error occurred: {e}")

Dataset loaded successfully.

Model Performance:
Mean Squared Error: 23,580,549,535,182,736.00
R-squared: 0.71

Model saved to 'movie_revenue_predictor.joblib'. Download this file for your backend.


In [None]:
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

warnings.filterwarnings('ignore')

def convert_runtime_to_minutes(runtime_str):
    """Convert a runtime label such as ``'2h 22m'`` into a total number of minutes.

    Args:
        runtime_str: Runtime text containing an hours part (``<digits>h``), a
            minutes part (``<digits>m``), or both. Missing values and the
            placeholder ``'Not Available'`` are accepted.

    Returns:
        int: ``hours * 60 + minutes`` when at least one part is found.
        float: ``np.nan`` when the value is missing, is ``'Not Available'``, or
        contains neither an hours nor a minutes part.
    """
    import re  # local import: keeps this helper usable without touching the import lines

    if pd.isna(runtime_str) or runtime_str == 'Not Available':
        return np.nan

    text = str(runtime_str)
    hours = re.search(r'(\d+)\s*h', text, re.IGNORECASE)
    minutes = re.search(r'(\d+)\s*m', text, re.IGNORECASE)

    # Nothing recognisable: report "unknown" rather than a misleading 0 minutes.
    if hours is None and minutes is None:
        return np.nan

    total_minutes = 0
    if hours is not None:
        total_minutes += int(hours.group(1)) * 60
    if minutes is not None:
        total_minutes += int(minutes.group(1))
    return total_minutes


# --- 1. Load Data ---
try:
    df = pd.read_csv('movie_dataset.csv')
    print("Dataset loaded successfully.")

    # --- 2. Data Cleaning and Preprocessing ---
    # Values that cannot be parsed as numbers become NaN and are dropped below.
    df['budget'] = pd.to_numeric(df['budget'], errors='coerce')
    df['box_office'] = pd.to_numeric(df['box_office'], errors='coerce')
    # Keep rows with valid budget and box_office, since both are used as features.
    df = df.dropna(subset=['budget', 'box_office'])
    df = df[df['budget'] > 0].copy()

    df['run_time_minutes'] = df['run_time'].apply(convert_runtime_to_minutes)

    # --- 3. Feature Selection and Splitting Data ---
    # NOTE(review): 'box_office' is an input here, so this model can only score
    # films whose revenue is already known -- confirm that is intended.
    features = ['year', 'rating', 'genre', 'run_time_minutes', 'budget', 'box_office']
    # Build X as a new NaN-free frame instead of calling dropna(inplace=True) on a
    # selection of df (the pattern that triggers SettingWithCopyWarning).
    X = df[features].dropna()
    # The target is the 'rank' column, selected by the surviving row labels so X
    # and y stay aligned.
    y = df.loc[X.index, 'rank']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # --- 4. Building the Model Pipeline ---
    categorical_features = ['rating', 'genre']
    numerical_features = ['year', 'run_time_minutes', 'budget', 'box_office']

    # Numeric columns pass through unchanged; categorical columns are one-hot
    # encoded, and categories unseen during fit are ignored at predict time.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
    ])

    # --- 5. Train and Evaluate the Model ---
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)  # RMSE is in the same units as the target
    r2 = r2_score(y_test, y_pred)

    print("\nModel Performance:")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
    print(f"R-squared: {r2:.2f}")

    # --- 6. Save the Trained Model (Optional) ---
    model_filename = 'movie_rank_predictor.joblib'
    joblib.dump(model, model_filename)
    print(f"\nModel saved to '{model_filename}'.")

except FileNotFoundError:
    print("Error: 'movie_dataset.csv' not found. Please upload the file to your Colab session.")
except Exception as e:
    # Best-effort reporting kept from the original cell: only the message is shown,
    # not the traceback.
    print(f"An error occurred: {e}")

Dataset loaded successfully.

Model Performance:
Mean Squared Error (MSE): 502.04
Root Mean Squared Error (RMSE): 22.41
R-squared: 0.89

Model saved to 'movie_rank_predictor.joblib'.


Based on the columns available in the `movie_dataset.csv` and the types of data they contain, here are some machine learning models you could build:

*   **Predicting Box Office Success (Classification):** As you've already done, you can build a binary classification model to predict if a movie will be a financial success (e.g., box office > budget). You can use algorithms like:
    *   Logistic Regression
    *   Support Vector Machines (SVM)
    *   Decision Trees
    *   Random Forests (which you've used)
    *   Gradient Boosting Machines (like LightGBM or XGBoost)
*   **Predicting Box Office Revenue (Regression):** You can build a regression model to predict the actual box office revenue a movie will generate. You can use algorithms like:
    *   Linear Regression
    *   Ridge, Lasso, or Elastic Net Regression
    *   Support Vector Regression (SVR)
    *   Decision Tree Regression
    *   Random Forest Regressor (which you've used)
    *   Gradient Boosting Regressors
*   **Predicting Movie Rating:** You could potentially treat the rating as a classification problem (predicting a rating category) or a regression problem (predicting the numerical rating).
    *   **Classification (Rating Categories):** Use classifiers listed above.
    *   **Regression (Numerical Rating):** Use regressors listed above.
*   **Predicting Genre:** If you wanted to predict the genre based on other features (though this might be less practical as genre is usually known beforehand), you could treat this as a multi-class classification problem using algorithms like:
    *   Multinomial Logistic Regression
    *   Decision Trees
    *   Random Forests
    *   Naive Bayes
*   **Recommending Movies:** Based on features like genre, rating, cast, directors, etc., you could build a recommendation system. This could involve various techniques, including:
    *   Collaborative Filtering
    *   Content-Based Filtering
    *   Matrix Factorization (e.g., using techniques like Singular Value Decomposition)

To build these models, you would typically follow these general steps:

1.  **Load the data:** Read the `movie_dataset.csv` file.
2.  **Explore and preprocess the data:** Handle missing values, convert data types, and encode categorical features (like 'rating', 'genre', 'casts', 'directors', 'writers').
3.  **Feature Engineering:** Create new features from existing ones if needed (e.g., extracting the number of genres, or analyzing cast/director/writer popularity).
4.  **Split the data:** Divide the data into training and testing sets.
5.  **Choose a model:** Select an appropriate machine learning algorithm based on the problem you are trying to solve (classification, regression, etc.).
6.  **Train the model:** Fit the model to the training data.
7.  **Evaluate the model:** Assess the model's performance using relevant metrics.
8.  **Tune hyperparameters:** Optimize the model's parameters for better performance.
9.  **Make predictions:** Use the trained model to make predictions on new, unseen data.

The specific model you choose and how you preprocess the data will depend on the exact problem you want to solve and the desired outcome.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import numpy as np
import ast  # Library to safely evaluate string representations of Python literals
import warnings

warnings.filterwarnings('ignore')

def parse_json_column(column_str):
    """Turn a serialized list of ``{"name": ...}`` objects into a comma-joined string.

    The text is parsed as JSON first; if that fails it is retried as a Python
    literal, so strings that the previous ``ast.literal_eval``-only version
    accepted (e.g. single-quoted dicts) still parse.

    Args:
        column_str: Cell value expected to hold a serialized list of objects
            that each carry a ``'name'`` key.

    Returns:
        str: The ``'name'`` values joined with ``','`` in their original order.
        float: ``np.nan`` when the value is not a string, cannot be parsed, is
        not a list, or yields no usable names.
    """
    import json  # local import: keeps this helper usable without touching the import lines

    if not isinstance(column_str, str):
        return np.nan

    try:
        items = json.loads(column_str)
    except ValueError:
        # Not valid JSON; fall back to Python-literal syntax.
        try:
            items = ast.literal_eval(column_str)
        except (ValueError, SyntaxError, TypeError):
            return np.nan

    if not isinstance(items, list):
        return np.nan

    # Skip malformed entries (non-dicts, missing or blank names) instead of letting
    # one bad item raise and abort the whole cell.
    names = [
        item['name']
        for item in items
        if isinstance(item, dict) and isinstance(item.get('name'), str) and item['name']
    ]
    if not names:
        return np.nan
    return ','.join(names)


# --- 1. Load the New, Unbiased Dataset ---
try:
    df = pd.read_csv('tmdb_5000_movies.csv')
    print("TMDB 5000 dataset loaded successfully.")

    # --- 2. Advanced Data Cleaning and Preparation ---
    print("Starting data cleaning and preparation...")

    # A. Handle missing financial data (treat 0 as missing)
    # Assign the result back: df['col'].replace(..., inplace=True) mutates a
    # temporary selection and does not update df under pandas Copy-on-Write.
    df['budget'] = df['budget'].replace(0, np.nan)
    df['revenue'] = df['revenue'].replace(0, np.nan)
    df = df.dropna(subset=['budget', 'revenue'])
    print(f"Removed rows with missing financial data. Shape is now: {df.shape}")

    # B. Parse the JSON 'genres' column
    df['genres'] = df['genres'].apply(parse_json_column)

    # C. Handle release date
    # Some release dates might be malformed, so use errors='coerce'
    df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
    df = df.dropna(subset=['release_date', 'genres', 'runtime'])
    df['year'] = df['release_date'].dt.year

    # D. Rename columns to match the original script's expectations
    df = df.rename(columns={
        'revenue': 'box_office',
        'vote_average': 'rating',
        'runtime': 'run_time_minutes'  # Runtime is already in minutes, just needs renaming
    })

    print("Data cleaning complete.")

    # --- 3. Feature Selection and Splitting Data ---
    # Select the final features for the model
    features = ['year', 'rating', 'genres', 'run_time_minutes', 'budget']
    X = df[features]
    y = df['box_office']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Data split into training ({X_train.shape[0]} rows) and testing ({X_test.shape[0]} rows) sets.")

    # --- 4. Building the Model Pipeline (same as before) ---
    # 'genres' holds the comma-joined genre string, so each distinct combination
    # of genres is one-hot encoded as its own category.
    categorical_features = ['genres']  # Rating is now treated as a numerical feature
    numerical_features = ['year', 'rating', 'run_time_minutes', 'budget']

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
    ])
    print("Model pipeline created.")

    # --- 5. Train and Evaluate the New Model ---
    print("Training the new model...")
    model.fit(X_train, y_train)
    print("Model training complete.")

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("\n--- New Model Performance ---")
    print(f"Mean Squared Error (MSE): {mse:,.2f}")
    print(f"R-squared (R²): {r2:.2f}")

    # --- 6. Save the Newly Trained Model ---
    # NOTE: this reuses the filename written by the earlier revenue cell, so the
    # earlier model file is overwritten.
    model_filename = 'movie_revenue_predictor.joblib'
    joblib.dump(model, model_filename)

    print(f"\nNew, improved model saved to '{model_filename}'.")
    print("You can now replace the old model file in your Flask app with this new one.")

except FileNotFoundError:
    print("Error: 'tmdb_5000_movies.csv' not found. Please ensure it's uploaded.")
except Exception as e:
    # Best-effort reporting kept from the original cell: only the message is shown,
    # not the traceback.
    print(f"An error occurred: {e}")


TMDB 5000 dataset loaded successfully.
Starting data cleaning and preparation...
Removed rows with missing financial data. Shape is now: (3229, 20)
Data cleaning complete.
Data split into training (2582 rows) and testing (646 rows) sets.
Model pipeline created.
Training the new model...
Model training complete.

--- New Model Performance ---
Mean Squared Error (MSE): 20,013,921,597,324,100.00
R-squared (R²): 0.60

New, improved model saved to 'movie_revenue_predictor.joblib'.
You can now replace the old model file in your Flask app with this new one.
