# Video Game Popularity Prediction
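This notebook trains a random-forest classifier on video game sales data to predict whether a game is "popular" (global sales at or above the dataset median), then saves the model, its label encoders, and its evaluation metrics to a pickle file.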

In [7]:
# CELL 1: Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report # Make sure classification_report is imported
import pickle
import os

In [8]:
# CELL 2: Function Definition
def train_and_save_model(data_path='video games sales.csv', model_output_path='game_popularity_model.pkl'):
    """
    Train a game-popularity classifier and pickle it together with its metadata.

    Pipeline: load the CSV, drop rows with missing values, label each game as
    popular (1) when its 'Global_Sales' is at or above the dataset median
    (0 otherwise), label-encode the categorical columns, train a
    RandomForestClassifier on a stratified 80/20 split, evaluate it on the
    held-out 20%, and pickle a dict holding the model, the fitted encoders,
    the feature names, the encoder classes and the evaluation metrics.
    The original docstring describes this dict layout as the one required by
    a Django consumer; that consumer is not visible here.

    Required columns: 'Global_Sales' (numeric), 'Platform', 'Genre' and
    'Publisher'. 'Year' and the regional sales columns are optional; when
    absent or non-numeric they are left out of the feature set with a warning.

    NOTE(review): the regional sales columns are used as features while the
    target is derived from 'Global_Sales'. If 'Global_Sales' is their sum
    (TODO confirm against the dataset), the reported accuracy mostly reflects
    target leakage rather than predictive skill.

    Args:
        data_path: Path of the CSV dataset to read.
        model_output_path: Path of the pickle file to write.

    Returns:
        None. Progress and errors are reported with print(). All input
        validation happens before training, so a dataset that cannot yield a
        complete package is rejected without training and without touching
        model_output_path.
    """
    print("Starting process...")
    print(f"Loading dataset from: {data_path}")
    try:
        df = pd.read_csv(data_path)
    except FileNotFoundError:
        print(f"Error: The dataset file '{data_path}' was not found. Please ensure it's in the same directory as the notebook.")
        return

    print("Dataset loaded successfully.")
    print(f"Initial dataset shape: {df.shape}")

    # --- Preprocessing ---
    print("\n--- Starting Preprocessing ---")
    df.dropna(inplace=True)  # Drop rows with any missing values
    print(f"Dataset shape after dropping initial NaNs: {df.shape}")

    if 'Year' in df.columns:
        # Non-numeric years become NaN and their rows are dropped before the int cast.
        df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
        df.dropna(subset=['Year'], inplace=True)
        df['Year'] = df['Year'].astype(int)
    else:
        print("Warning: 'Year' column not found.")

    # Fail fast: with no rows left there is nothing to take a median of or to encode.
    if df.empty:
        print("Error: No rows remain after dropping missing values. Cannot proceed.")
        return

    if 'Global_Sales' not in df.columns or not pd.api.types.is_numeric_dtype(df['Global_Sales']):
        print("Error: 'Global_Sales' column is missing or not numeric. Cannot define 'Popularity'.")
        return

    # Binary target: 1 when sales are at or above the median, 0 otherwise.
    median_global_sales = df['Global_Sales'].median()
    df['Popularity'] = (df['Global_Sales'] >= median_global_sales).astype(int)
    print(f"\nPopularity threshold (median Global_Sales): {median_global_sales}")

    # --- Feature Engineering & Encoding ---
    print("\n--- Starting Feature Encoding ---")
    categorical_features = ['Platform', 'Genre', 'Publisher']

    # The saved package needs an encoder for every categorical column, so a
    # missing column is rejected here instead of after a full training run.
    missing_categoricals = [col for col in categorical_features if col not in df.columns]
    if missing_categoricals:
        print(f"Error: Required categorical column(s) not found: {missing_categoricals}. Cannot build the model package.")
        return

    encoders_dict = {}
    for col in categorical_features:
        le = LabelEncoder()
        df[col] = df[col].astype(str)
        df[f'{col}_Encoded'] = le.fit_transform(df[col])
        encoders_dict[col] = le
        print(f"Encoded '{col}'.")

    # --- Define Features (X) and Target (y) ---
    print("\n--- Defining Features (X) and Target (y) ---")
    sales_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
    for scol in sales_columns:
        if scol in df.columns:
            if not pd.api.types.is_numeric_dtype(df[scol]):
                # Unparseable values become NaN; those rows are dropped below.
                df[scol] = pd.to_numeric(df[scol], errors='coerce')
        else:
            print(f"Warning: Sales column '{scol}' not found.")

    feature_names_potential = [
        'Platform_Encoded', 'Genre_Encoded', 'Publisher_Encoded',
        'Year', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales'
    ]

    feature_names = []
    for fname in feature_names_potential:
        if fname in df.columns and pd.api.types.is_numeric_dtype(df[fname]):
            feature_names.append(fname)
        else:
            print(f"Warning: Feature '{fname}' not found or not numeric. Excluded.")

    if not feature_names:
        print("Error: No valid numeric features found for X. Cannot train model.")
        return

    # Coercing the sales columns may have introduced NaNs; drop those rows.
    df.dropna(subset=feature_names + ['Popularity'], inplace=True)
    if df.empty:
        print("Error: DataFrame or necessary columns for X/y are empty. Cannot proceed.")
        return

    X = df[feature_names]
    y = df['Popularity']
    print(f"\nSelected features for training: {feature_names}")
    print(f"X shape: {X.shape}, y shape: {y.shape}")

    # --- Train-Test Split ---
    print("\n--- Splitting Data ---")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

    # --- Model Training ---
    print("\n--- Starting Model Training ---")
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    print("Model training complete.")

    # --- Model Evaluation ---
    print("\n--- Starting Model Evaluation ---")
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nModel Accuracy on Test Set: {accuracy:.4f}")
    print("Classification Report on Test Set (for notebook viewing):")
    print(classification_report(y_test, y_pred, zero_division=0))
    # Dict form of the same report, stored in the pickle file.
    classification_report_dict = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    # --- Prepare and Save Model Package ---
    print(f"\n--- Saving model, encoders, feature names, and metrics to {model_output_path} ---")

    # All three encoders exist: missing categorical columns were rejected above.
    platform_le = encoders_dict['Platform']
    genre_le = encoders_dict['Genre']
    publisher_le = encoders_dict['Publisher']

    model_package = {
        'model': model,
        'encoders': {
            'Platform': platform_le,
            'Genre': genre_le,
            'Publisher': publisher_le
        },
        'feature_names': feature_names,
        'unique_platforms_classes': platform_le.classes_.tolist(),
        'unique_genres_classes': genre_le.classes_.tolist(),
        'unique_publishers_classes': publisher_le.classes_.tolist(),

        'accuracy': accuracy,
        'classification_report_dict': classification_report_dict
    }

    try:
        with open(model_output_path, 'wb') as f:
            pickle.dump(model_package, f)
        print(f"\nModel, encoders, feature names, and metrics successfully saved to {model_output_path}")
        print("Structure of saved package:")
        print(f"  Top-level keys: {list(model_package.keys())}")
        print(f"  Keys within 'encoders': {list(model_package['encoders'].keys())}")
        print(f"  Feature names saved: {model_package['feature_names']}")
        print(f"  Accuracy saved: {model_package['accuracy']}")
        print("  Classification report dict saved: Yes")

    except Exception as e:
        # Best effort: report the failure instead of raising out of the notebook cell.
        print(f"Error during model saving: {e}")

In [9]:
# CELL 3: Function Call
# Runs the pipeline with its default arguments: it reads 'video games sales.csv' (a relative
# path, so the file must be reachable from the kernel's working directory, normally the
# notebook's own directory) and writes the package to 'game_popularity_model.pkl'.
# NOTE: train_and_save_model() reports failures with print() and returns None, so the closing
# message below is printed whether or not a model file was actually saved.
print("Attempting to run the full model training and saving process...")
train_and_save_model() # Defined in Cell 2; must have been run first in this kernel
print("\nFull process finished.")

Attempting to run the full model training and saving process...
Starting process...
Loading dataset from: video games sales.csv
Dataset loaded successfully.
Initial dataset shape: (16598, 11)

--- Starting Preprocessing ---
Dataset shape after dropping initial NaNs: (16291, 11)

Popularity threshold (median Global_Sales): 0.17

--- Starting Feature Encoding ---
Encoded 'Platform'.
Encoded 'Genre'.
Encoded 'Publisher'.

--- Defining Features (X) and Target (y) ---

Selected features for training: ['Platform_Encoded', 'Genre_Encoded', 'Publisher_Encoded', 'Year', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
X shape: (16291, 8), y shape: (16291,)

--- Splitting Data ---
X_train shape: (13032, 8), y_train shape: (13032,)

--- Starting Model Training ---
Model training complete.

--- Starting Model Evaluation ---

Model Accuracy on Test Set: 0.9871
Classification Report on Test Set (for notebook viewing):
              precision    recall  f1-score   support

           0       0.98  