# Video Game Success Analysis

This notebook analyzes the extended video game dataset to understand factors contributing to game success.

In [1]:
# Import necessary libraries
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Plotting defaults: apply the matplotlib stylesheet first, then seaborn's theme.
plt.style.use('seaborn-v0_8')
sns.set_theme()

# Relax pandas' display limits so wide frames are printed without truncation.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

## Load and Explore the Data

In [None]:
# Load the extended games table from the CSV that sits next to the notebook.
DATA_PATH = 'games_expanded.csv'
df = pd.read_csv(DATA_PATH)

# Column names, dtypes and non-null counts at a glance.
print("Dataset Info:")
df.info()

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

## Data Quality Check

In [None]:
# Per-column missing-data summary: absolute count and share of all rows.
missing_values = df.isna().sum()
missing_percentage = missing_values / len(df) * 100

missing_data = pd.DataFrame(
    {'Missing Values': missing_values, 'Percentage': missing_percentage}
)

# Show only the columns that actually have gaps, worst first.
has_gaps = missing_data['Missing Values'] > 0
missing_data.loc[has_gaps].sort_values(by='Missing Values', ascending=False)

## Game Success Metrics Analysis

In [None]:
# Histogram of the rating column (30 bins).
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='rating', bins=30, ax=ax)
ax.set_title('Distribution of Game Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
plt.show()

# The ten rows with the largest rating, shown with their name and rating count.
print("\nTop 10 Highest Rated Games:")
top_rated = df.nlargest(n=10, columns='rating')
top_rated.loc[:, ['name', 'rating', 'rating_count']]

## Genre Analysis

In [None]:
# Analyze genres
#
# `genres` holds one comma-separated string per game. Build a long table with
# one row per (game, genre) pair so that counts and ratings are computed per
# individual genre. Exploding the raw string column would be a no-op (explode
# only unpacks list-likes), which would group by the whole string instead.
genre_long = (
    df[['genres', 'rating']]
    .assign(genres=df['genres'].str.split(','))
    .explode('genres', ignore_index=True)
)
# Strip padding so "Action, RPG" and "Action,RPG" produce the same labels.
genre_long['genres'] = genre_long['genres'].str.strip()

genre_counts = genre_long['genres'].value_counts()

plt.figure(figsize=(15, 8))
genre_counts.head(15).plot(kind='bar')
plt.title('Top 15 Most Common Game Genres')
plt.xlabel('Genre')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Average rating by genre ('count' is the number of rated games in the genre)
genre_ratings = genre_long.groupby('genres')['rating'].agg(['mean', 'count'])
genre_ratings = genre_ratings[genre_ratings['count'] > 50]  # Filter genres with sufficient data
genre_ratings.sort_values('mean', ascending=False).head(10)

## Platform Analysis

In [None]:
# Analyze platforms
#
# `platforms` holds one comma-separated string per game. Build a long table
# with one row per (game, platform) pair so that counts and ratings are
# computed per individual platform. Exploding the raw string column would be a
# no-op (explode only unpacks list-likes), which would group by the whole
# string instead.
platform_long = (
    df[['platforms', 'rating']]
    .assign(platforms=df['platforms'].str.split(','))
    .explode('platforms', ignore_index=True)
)
# Strip padding so "PC, Mac" and "PC,Mac" produce the same labels.
platform_long['platforms'] = platform_long['platforms'].str.strip()

platform_counts = platform_long['platforms'].value_counts()

plt.figure(figsize=(15, 8))
platform_counts.head(15).plot(kind='bar')
plt.title('Top 15 Most Common Platforms')
plt.xlabel('Platform')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Average rating by platform ('count' is the number of rated games on it)
platform_ratings = platform_long.groupby('platforms')['rating'].agg(['mean', 'count'])
platform_ratings = platform_ratings[platform_ratings['count'] > 50]  # Filter platforms with sufficient data
platform_ratings.sort_values('mean', ascending=False).head(10)

## Release Year Analysis

In [None]:
# Derive the release year; `first_release_date` is parsed as seconds since the epoch.
release_timestamps = pd.to_datetime(df['first_release_date'], unit='s')
df['release_year'] = release_timestamps.dt.year

# Number of games per release year, in chronological order.
games_per_year = df['release_year'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(15, 6))
games_per_year.plot(kind='line', ax=ax)
ax.set_title('Number of Games Released per Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Games')
ax.grid(True)
plt.show()

# Mean rating per year, keeping only years with more than 50 rated games.
yearly_ratings = df.groupby('release_year')['rating'].agg(['mean', 'count'])
enough_games = yearly_ratings['count'] > 50
yearly_ratings = yearly_ratings[enough_games]

fig, ax = plt.subplots(figsize=(15, 6))
yearly_ratings['mean'].plot(kind='line', ax=ax)
ax.set_title('Average Game Rating by Release Year')
ax.set_xlabel('Year')
ax.set_ylabel('Average Rating')
ax.grid(True)
plt.show()

## Correlation Analysis

In [None]:
# Restrict the correlation analysis to the numeric columns of the frame.
numerical_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df.loc[:, numerical_cols].corr()

# Annotated heatmap with the colour scale centred on zero correlation.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.show()

## Feature Engineering for Prediction

In [30]:
# --- Handle missing values ---------------------------------------------------
# Missing engagement counts are filled with 0. `assign` builds a new frame
# instead of writing into `df` in place, so re-running this cell on an already
# filtered frame does not trigger pandas' SettingWithCopyWarning.
df = df.assign(
    hypes=df['hypes'].fillna(0),
    rating_count=df['rating_count'].fillna(0),
    total_rating_count=df['total_rating_count'].fillna(0),
)
# Binary flag: 1 when the game has at least one hype, else 0.
df = df.assign(has_hypes=(df['hypes'] > 0).astype(int))

# Remove rows where rating (target variable) is NaN
df = df.dropna(subset=['rating'])

# --- Multi-label indicator features ------------------------------------------
# Whitespace around the comma separators is removed before splitting so that
# "Action, RPG" and "Action,RPG" map to the same indicator columns. The
# prefixes keep genre and platform columns distinct even when a genre and a
# platform share a label, so X never contains duplicate column names.
genre_dummies = (
    df['genres']
    .str.replace(r'\s*,\s*', ',', regex=True)
    .str.strip()
    .str.get_dummies(',')
    .add_prefix('genre_')
)
platform_dummies = (
    df['platforms']
    .str.replace(r'\s*,\s*', ',', regex=True)
    .str.strip()
    .str.get_dummies(',')
    .add_prefix('platform_')
)

# Print shapes to debug
print("Shape of df:", df.shape)
print("Shape of genre_dummies:", genre_dummies.shape)
print("Shape of platform_dummies:", platform_dummies.shape)

# --- Assemble design matrix and target ---------------------------------------
feature_cols = ['rating_count', 'hypes', 'has_hypes', 'total_rating_count']
X = pd.concat([
    df[feature_cols],
    genre_dummies,
    platform_dummies
], axis=1)

# Target variable
y = df['rating']

# Print final shapes
print("\nFinal shapes:")
print("X shape:", X.shape)
print("y shape:", y.shape)

# --- Baseline model ----------------------------------------------------------
# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model on the held-out rows
print(f'R2 Score: {r2_score(y_test, y_pred):.3f}')
print(f'Root Mean Squared Error: {np.sqrt(mean_squared_error(y_test, y_pred)):.3f}')

Shape of df: (25621, 23)
Shape of genre_dummies: (25621, 44)
Shape of platform_dummies: (25621, 134)

Final shapes:
X shape: (25621, 182)
y shape: (25621,)
R2 Score: 0.191
Root Mean Squared Error: 12.575


## Genre and Platform Impact Analysis

In [None]:
def _combination_key(raw_value):
    """Turn a comma-separated label string into an order-insensitive key.

    Each label is stripped of surrounding whitespace before sorting, so
    "RPG, Action" and "Action,RPG" both become ('Action', 'RPG').
    Non-string values (e.g. NaN for a missing entry) are returned unchanged;
    groupby then drops them as missing keys.
    """
    if not isinstance(raw_value, str):
        return raw_value
    return tuple(sorted(label.strip() for label in raw_value.split(',')))


# Analyze impact of genre combinations
genre_combinations = df['genres'].apply(_combination_key)
genre_comb_ratings = df.groupby(genre_combinations)['rating'].agg(['mean', 'count'])
genre_comb_ratings = genre_comb_ratings[genre_comb_ratings['count'] > 20]  # Filter for sufficient data
print("\nTop 10 Genre Combinations by Average Rating:")
print(genre_comb_ratings.sort_values('mean', ascending=False).head(10))
# TODO: explore which genres perform best for indie games.

# Analyze platform combinations
platform_combinations = df['platforms'].apply(_combination_key)
platform_comb_ratings = df.groupby(platform_combinations)['rating'].agg(['mean', 'count'])
platform_comb_ratings = platform_comb_ratings[platform_comb_ratings['count'] > 20]  # Filter for sufficient data
print("\nTop 10 Platform Combinations by Average Rating:")
print(platform_comb_ratings.sort_values('mean', ascending=False).head(10))

## Temporal Analysis of Rating Patterns

In [None]:
# Release year derived from `first_release_date`, parsed as seconds since the epoch.
df['release_year'] = pd.to_datetime(df['first_release_date'], unit='s').dt.year

# Per-year aggregates: rating mean/std/count plus mean engagement metrics.
yearly_stats = df.groupby('release_year').agg({
    'rating': ['mean', 'std', 'count'],
    'rating_count': 'mean',
    'hypes': 'mean'
})

# Mean rating per year with a band of one standard deviation on either side.
years = yearly_stats.index
mean_rating = yearly_stats[('rating', 'mean')]
rating_spread = yearly_stats[('rating', 'std')]

fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(years, mean_rating, label='Average Rating')
ax.fill_between(years, mean_rating - rating_spread, mean_rating + rating_spread, alpha=0.2)
ax.set_title('Game Rating Trends Over Time')
ax.set_xlabel('Release Year')
ax.set_ylabel('Average Rating')
ax.legend()
ax.grid(True)
plt.show()

# Pairwise correlation between the engagement metrics and the rating.
engagement_metrics = ['rating_count', 'hypes']
engagement_corr = df[[*engagement_metrics, 'rating']].corr()

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(engagement_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation between Engagement Metrics and Ratings')
plt.show()

## Advanced Feature Analysis

In [None]:
# Scatter the rating against each engagement feature on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# (axes, x column, panel title) for every populated panel.
scatter_panels = [
    (axes[0, 0], 'rating_count', 'Rating vs Number of Ratings'),
    (axes[1, 0], 'hypes', 'Rating vs Number of Hypes'),
    (axes[1, 1], 'total_rating_count', 'Rating vs Total Rating Count'),
]
for panel_ax, x_column, panel_title in scatter_panels:
    sns.scatterplot(data=df, x=x_column, y='rating', alpha=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)

# Only three panels are used; hide the fourth so the figure does not show an
# empty set of axes in the top-right corner.
axes[0, 1].set_visible(False)

plt.tight_layout()
plt.show()

# Calculate correlation coefficients and list each feature's correlation with
# the rating, strongest positive first.
correlation_features = ['rating', 'rating_count', 'hypes', 'total_rating_count']
correlation_matrix = df[correlation_features].corr()
print("\nCorrelation Matrix:")
print(correlation_matrix['rating'].sort_values(ascending=False))

## Advanced Modeling Techniques

In [24]:
# Import additional libraries for advanced modeling
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline

# Two-step pipeline: standardise the features, then fit a model. The 'model'
# step is a placeholder here; each grid search below supplies the estimator.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', None),
])

## Data Cleaning

In [None]:
# Data Cleaning and Preprocessing Cell
#
# Rebuilds the modelling inputs (X, y, train/test split, preprocessing
# pipeline) from `df`.

# 1. Handle missing values
# Missing engagement counts are filled with 0. `assign` builds a new frame
# instead of writing into `df` in place, so running this cell on an already
# filtered frame does not trigger pandas' SettingWithCopyWarning.
print("Handling missing values...")
df = df.assign(
    hypes=df['hypes'].fillna(0),
    rating_count=df['rating_count'].fillna(0),
    total_rating_count=df['total_rating_count'].fillna(0),
)
# Binary flag: 1 when the game has at least one hype, else 0.
df = df.assign(has_hypes=(df['hypes'] > 0).astype(int))

# 2. Remove rows where rating (target variable) is NaN
print("Removing rows with missing target values...")
df = df.dropna(subset=['rating'])

# 3. Prepare genre features
# Whitespace around the comma separators is removed before splitting so that
# "Action, RPG" and "Action,RPG" map to the same indicator columns. The prefix
# keeps genre columns distinct from platform columns that share a label.
print("Creating genre dummies...")
genre_dummies = (
    df['genres']
    .str.replace(r'\s*,\s*', ',', regex=True)
    .str.strip()
    .str.get_dummies(',')
    .add_prefix('genre_')
)

# 4. Prepare platform features (same normalisation as for genres)
print("Creating platform dummies...")
platform_dummies = (
    df['platforms']
    .str.replace(r'\s*,\s*', ',', regex=True)
    .str.strip()
    .str.get_dummies(',')
    .add_prefix('platform_')
)

# 5. Print shapes to debug
print("\nData shapes:")
print("Shape of df:", df.shape)
print("Shape of genre_dummies:", genre_dummies.shape)
print("Shape of platform_dummies:", platform_dummies.shape)

# 6. Combine features
print("\nCombining features...")
feature_cols = ['rating_count', 'hypes', 'has_hypes', 'total_rating_count']
X = pd.concat([
    df[feature_cols],
    genre_dummies,
    platform_dummies
], axis=1)

# 7. Check for duplicate features
# With prefixed dummy columns no duplicates are expected; the check is kept as
# a safety net and still drops repeats if any appear.
print("\nChecking for duplicate features...")
print("Number of features before removing duplicates:", X.shape[1])
duplicate_cols = X.columns[X.columns.duplicated()]
if len(duplicate_cols) > 0:
    print("Duplicate features found:", duplicate_cols.tolist())
    # Keep the first occurrence of each repeated column name
    X = X.loc[:, ~X.columns.duplicated()]
    print("Number of features after removing duplicates:", X.shape[1])

# 8. Target variable
y = df['rating']

# 9. Print final shapes
print("\nFinal data shapes:")
print("X shape:", X.shape)
print("y shape:", y.shape)

# 10. Split the data (20% held out; fixed seed for a repeatable split)
print("\nSplitting data into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

# 11. Create pipeline with scaling
# The 'model' step is a placeholder; each grid search supplies the estimator.
print("\nCreating preprocessing pipeline...")
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', None)  # Will be set for each model
])

print("\nData cleaning and preprocessing complete! Ready for model training.")

### XGBoost Model with Hyperparameter Tuning

In [None]:
# XGBoost Model Cell
print("Training XGBoost model...")

# The grid has a single candidate: one fixed XGBoost configuration plugged
# into the pipeline's 'model' step. The search therefore only provides a
# 3-fold cross-validated R2 for that configuration.
xgb_estimator = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
    n_jobs=-1
)
xgb_params = {'model': [xgb_estimator]}

xgb_grid = GridSearchCV(
    estimator=pipeline,
    param_grid=xgb_params,
    cv=3,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)
xgb_grid.fit(X_train, y_train)

print(f"Best XGBoost parameters: {xgb_grid.best_params_}")
print(f"Best XGBoost cross-validation score: {xgb_grid.best_score_:.3f}")

# Score the refit best pipeline on the held-out test split.
xgb_pred = xgb_grid.predict(X_test)
print(f"XGBoost test set R2 score: {r2_score(y_test, xgb_pred):.3f}")
print(f"XGBoost test set RMSE: {np.sqrt(mean_squared_error(y_test, xgb_pred)):.3f}")

# Per-feature importances from the fitted model step, largest first.
xgb_fitted_model = xgb_grid.best_estimator_.named_steps['model']
xgb_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': xgb_fitted_model.feature_importances_
}).sort_values(by='importance', ascending=False)

# Horizontal bar chart of the 20 most important features.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=xgb_importance.head(20), x='importance', y='feature', ax=ax)
ax.set_title('Top 20 Most Important Features (XGBoost)')
plt.tight_layout()
plt.show()

### LightGBM Model with Hyperparameter Tuning

In [None]:
# LightGBM Model Cell
print("Training LightGBM model...")

# The grid has a single candidate: one fixed LightGBM configuration plugged
# into the pipeline's 'model' step. The search therefore only provides a
# 3-fold cross-validated R2 for that configuration.
lgbm_params = {
    'model': [LGBMRegressor(
        n_estimators=100,
        learning_rate=0.05,
        max_depth=4,
        num_leaves=16,
        min_child_samples=20,
        subsample=0.8,
        # LightGBM only applies row subsampling when subsample_freq > 0
        # (its default of 0 disables bagging); resample every iteration so
        # that subsample=0.8 actually takes effect.
        subsample_freq=1,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )]
}

lgbm_grid = GridSearchCV(
    pipeline,
    lgbm_params,
    cv=3,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)

lgbm_grid.fit(X_train, y_train)
print(f"Best LightGBM parameters: {lgbm_grid.best_params_}")
print(f"Best LightGBM cross-validation score: {lgbm_grid.best_score_:.3f}")

# Evaluate the refit best pipeline on the held-out test split
lgbm_pred = lgbm_grid.predict(X_test)
print(f"LightGBM test set R2 score: {r2_score(y_test, lgbm_pred):.3f}")
print(f"LightGBM test set RMSE: {np.sqrt(mean_squared_error(y_test, lgbm_pred)):.3f}")

# Per-feature importances from the fitted model step, largest first
lgbm_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': lgbm_grid.best_estimator_.named_steps['model'].feature_importances_
})
lgbm_importance = lgbm_importance.sort_values('importance', ascending=False)

# Plot top 20 most important features
plt.figure(figsize=(12, 6))
sns.barplot(data=lgbm_importance.head(20), x='importance', y='feature')
plt.title('Top 20 Most Important Features (LightGBM)')
plt.tight_layout()
plt.show()

### Neural Network Model with Hyperparameter Tuning

In [None]:
# Neural Network Model Cell
print("Training Neural Network model...")

# The grid has a single candidate: one fixed MLP configuration (a single
# hidden layer of 100 units) plugged into the pipeline's 'model' step.
mlp_estimator = MLPRegressor(
    hidden_layer_sizes=(100,),
    learning_rate='constant',
    max_iter=200,
    activation='relu',
    solver='adam',
    random_state=42,
    verbose=True
)
mlp_params = {'model': [mlp_estimator]}

mlp_grid = GridSearchCV(
    estimator=pipeline,
    param_grid=mlp_params,
    cv=3,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)
mlp_grid.fit(X_train, y_train)

print(f"Best Neural Network parameters: {mlp_grid.best_params_}")
print(f"Best Neural Network cross-validation score: {mlp_grid.best_score_:.3f}")

# Score the refit best pipeline on the held-out test split.
mlp_pred = mlp_grid.predict(X_test)
print(f"Neural Network test set R2 score: {r2_score(y_test, mlp_pred):.3f}")
print(f"Neural Network test set RMSE: {np.sqrt(mean_squared_error(y_test, mlp_pred)):.3f}")

### Model Comparison and Ensemble

In [None]:
# Compare model performances
# Test-set predictions from the three model cells above, keyed by display name.
models = {
    'XGBoost': xgb_pred,
    'LightGBM': lgbm_pred,
    'Neural Network': mlp_pred
}

# Calculate metrics for each model. The records are collected first and the
# frame is built once (DataFrame.append no longer exists in pandas 2.x).
results = pd.DataFrame(
    [
        {
            'Model': name,
            'R2 Score': r2_score(y_test, pred),
            'RMSE': np.sqrt(mean_squared_error(y_test, pred)),
        }
        for name, pred in models.items()
    ],
    columns=['Model', 'R2 Score', 'RMSE'],
)

# Display results, best R2 first
print("Model Comparison:")
print(results.sort_values('R2 Score', ascending=False))

# Create ensemble prediction (simple element-wise average of the three models)
ensemble_pred = np.mean([xgb_pred, lgbm_pred, mlp_pred], axis=0)
print(f"\nEnsemble Model R2 Score: {r2_score(y_test, ensemble_pred):.3f}")
print(f"Ensemble Model RMSE: {np.sqrt(mean_squared_error(y_test, ensemble_pred)):.3f}")

# Plot actual vs predicted values for the best single model (highest test R2)
best_model = results.loc[results['R2 Score'].idxmax(), 'Model']
best_pred = models[best_model]

plt.figure(figsize=(10, 6))
plt.scatter(y_test, best_pred, alpha=0.5)
# Dashed diagonal marks perfect predictions (predicted == actual)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('Actual Ratings')
plt.ylabel('Predicted Ratings')
plt.title(f'Actual vs Predicted Ratings ({best_model})')
plt.tight_layout()
plt.show()

### Feature Importance Analysis

In [None]:
# Get feature importance from the best model.
# Only the two tree-based models are handled here, via the
# `feature_importances_` attribute of their fitted 'model' step.
if best_model in ['XGBoost', 'LightGBM']:
    best_model_obj = xgb_grid if best_model == 'XGBoost' else lgbm_grid
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model_obj.best_estimator_.named_steps['model'].feature_importances_
    })

    # Plot the 20 most important features, largest first
    plt.figure(figsize=(12, 6))
    sns.barplot(data=feature_importance.sort_values('importance', ascending=False).head(20),
                x='importance', y='feature')
    plt.title(f'Top 20 Most Important Features ({best_model})')
    plt.tight_layout()
    plt.show()
else:
    # Say so explicitly instead of leaving the cell silently empty.
    print(f"Feature importance is not shown for {best_model}.")