# Multiplayer Game Engagement Analysis

This notebook performs exploratory data analysis (EDA) and computes key performance indicators (KPIs) for the Kaggle "Predict Online Gaming Behavior" dataset.

- Load and clean data
- Compute KPIs (average session duration, sessions per week, a retention proxy based on the share of High-engagement players, and monetization)
- Segment and cohort-style views (GameGenre, Location, EngagementLevel)
- Visualizations (histogram, boxplot, heatmap, bar charts)



In [None]:
# Imports and settings
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Plot styling: whitegrid theme at notebook scale. `set_theme` is the preferred
# seaborn entry point; `sns.set` is only an alias for it.
sns.set_theme(style="whitegrid", context="notebook")
# Default figure size, set after the theme call so it is not overridden by it.
plt.rcParams["figure.figsize"] = (10, 6)

# The CSV location can be overridden through the ENGAGEMENT_DATA_PATH environment
# variable; otherwise a path relative to the notebook is used.
DATA_PATH = os.getenv("ENGAGEMENT_DATA_PATH", "../data/engagement_data.csv")

print(f"Using data path: {DATA_PATH}")


In [None]:
# Load data
# Read the CSV at DATA_PATH. If the file does not exist, fall back to a small
# synthetic frame with the same column names so the remaining cells still run.
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print("Data file not found. Proceeding with a small dummy dataframe for demo purposes.")
    # Seeded local generator: the demo frame, and every KPI and plot derived
    # from it, is identical on each Restart & Run All.
    demo_rng = np.random.default_rng(42)
    n_demo = 30
    df = pd.DataFrame({
        'PlayerID': range(1, n_demo + 1),
        'Age': demo_rng.integers(13, 50, n_demo),
        'Gender': demo_rng.choice(['Male','Female','Other'], n_demo),
        'Location': demo_rng.choice(['NA','EU','APAC','LATAM'], n_demo),
        'GameGenre': demo_rng.choice(['FPS','MOBA','RPG','Sports'], n_demo),
        'PlayTimeHours': demo_rng.gamma(5, 2, n_demo).round(2),
        'InGamePurchases': demo_rng.poisson(2, n_demo),
        'GameDifficulty': demo_rng.choice(['Easy','Normal','Hard'], n_demo),
        # Clipped so the synthetic values cannot go below 0 sessions / 5 minutes.
        'SessionsPerWeek': demo_rng.normal(5, 2, n_demo).clip(0).round(1),
        'AvgSessionDurationMinutes': demo_rng.normal(45, 15, n_demo).clip(5).round(1),
        'PlayerLevel': demo_rng.integers(1, 60, n_demo),
        'AchievementsUnlocked': demo_rng.integers(0, 50, n_demo),
        'EngagementLevel': demo_rng.choice(['Low','Medium','High'], n_demo, p=[0.3,0.5,0.2])
    })
    display(df.head())
else:
    # Only reached when the file was read: strip surrounding whitespace from the
    # header names so later column lookups match exactly.
    df.columns = [c.strip() for c in df.columns]
    print(df.shape)
    display(df.head())


In [None]:
# Basic cleaning
# Strip strings and coerce numerics
for col in ['GameGenre','Location','Gender','GameDifficulty','EngagementLevel']:
    if col in df.columns:
        # astype(str) alone would turn missing values into the literal text 'nan',
        # hiding them from the missing-value report and from the 'Unknown' fill
        # below. Remember which rows were null and restore them after stripping.
        was_missing = df[col].isna()
        df[col] = df[col].astype(str).str.strip().mask(was_missing)

for col in ['PlayTimeHours','InGamePurchases','SessionsPerWeek','AvgSessionDurationMinutes','PlayerLevel','AchievementsUnlocked','Age']:
    if col in df.columns:
        # Values that cannot be parsed as numbers become NaN (errors='coerce').
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Missing values overview
# Share of null cells per column, largest first; computed before any imputation.
missing = df.isna().mean().sort_values(ascending=False)
print("Missing ratio by column:\n", missing)

# Simple imputation for demo purposes
# Numeric/bool columns (dtype kind in 'biufc') get their median; every other
# column gets the placeholder label 'Unknown'.
for col in df.columns:
    if df[col].dtype.kind in 'biufc':
        df[col] = df[col].fillna(df[col].median())
    else:
        df[col] = df[col].fillna('Unknown')


In [None]:
# KPI computations
# The first three headline figures are plain column means over all rows of df.
avg_session, sessions_week, avg_purchases = (
    df[kpi_col].mean()
    for kpi_col in ('AvgSessionDurationMinutes', 'SessionsPerWeek', 'InGamePurchases')
)

# Retention proxy: fraction of rows whose title-cased EngagementLevel is 'High'.
is_high_engagement = df['EngagementLevel'].str.title().eq('High')
retention = is_high_engagement.mean()

# Assemble the report key by key, rounding for display only.
kpi_report = {}
kpi_report['average_session_duration_minutes'] = round(avg_session, 2)
kpi_report['average_sessions_per_week'] = round(sessions_week, 2)
kpi_report['average_purchases_per_user'] = round(avg_purchases, 2)
kpi_report['retention_rate_high_engagement'] = round(retention, 4)
print(kpi_report)


In [None]:
# Segment and cohort-style analysis
# One row per (GameGenre, Location, EngagementLevel) combination present in df.
seg_cols = ['GameGenre','Location','EngagementLevel']

# Named aggregation yields the flat '<column>_<stat>' names directly, with the
# PlayerID count exposed as 'num_players'.
cohort = (
    df.groupby(seg_cols)
    .agg(
        AvgSessionDurationMinutes_mean=('AvgSessionDurationMinutes', 'mean'),
        SessionsPerWeek_mean=('SessionsPerWeek', 'mean'),
        InGamePurchases_mean=('InGamePurchases', 'mean'),
        InGamePurchases_sum=('InGamePurchases', 'sum'),
        num_players=('PlayerID', 'count'),
    )
    .reset_index()
)
cohort.head()


In [None]:
# Visualizations
# Each chart is drawn on its own explicitly created axes so the cell does not
# depend on whatever figure happens to be "current".

# 1) Histogram of Avg Session Duration
fig, ax = plt.subplots()
sns.histplot(df['AvgSessionDurationMinutes'], bins=20, kde=True, ax=ax)
ax.set(title='Histogram: Avg Session Duration (minutes)', xlabel='Minutes', ylabel='Count')
plt.show()

# 2) Boxplot: Purchases by Engagement Level
# Without an explicit order the categories appear in row order, which can put
# 'High' before 'Low'. Show Low -> Medium -> High, followed by any other labels
# found in the data (sorted) so nothing is silently dropped from the chart.
engagement_title = df['EngagementLevel'].str.title()
ranked_levels = ['Low', 'Medium', 'High']
observed_levels = set(engagement_title.dropna())
level_order = [lvl for lvl in ranked_levels if lvl in observed_levels]
level_order += sorted(observed_levels - set(ranked_levels))

fig, ax = plt.subplots()
sns.boxplot(
    data=df,
    x=engagement_title,
    y='InGamePurchases',
    order=level_order or None,  # fall back to seaborn's default if no labels exist
    ax=ax,
)
ax.set(title='In-Game Purchases by Engagement Level', xlabel='Engagement Level', ylabel='Purchases')
plt.show()

# 3) Heatmap: Cohort retention proxy (share of High engagement)
# Pivot: rows=GameGenre, cols=Location, values=proportion High
pivot = df.assign(High=engagement_title.eq('High')).pivot_table(
    index='GameGenre', columns='Location', values='High', aggfunc='mean')
fig, ax = plt.subplots()
sns.heatmap(pivot, annot=True, fmt='.2f', cmap='YlGnBu', ax=ax)
ax.set_title('Retention Proxy Heatmap (High Engagement Share)')
plt.show()

# 4) Bar plots: Genre and Region-wise engagement (mean session duration)
# Same chart for two different segment columns, so draw it in a loop.
for segment_col, segment_label in (('GameGenre', 'Genre'), ('Location', 'Location')):
    fig, ax = plt.subplots()
    sns.barplot(data=df, x=segment_col, y='AvgSessionDurationMinutes', estimator=np.mean, ax=ax)
    ax.set_title(f'Avg Session Duration by {segment_label}')
    plt.show()


## Insights Summary

- High engagement users typically show higher session durations and purchase frequency.
- Certain genres (e.g., RPG/MOBA) may exhibit longer average sessions; regions differ in engagement.
- Heatmap helps spot genre-region combinations with stronger retention proxies.

Example statement (illustrative template — replace the figures with the values computed in this notebook): "Players with High EngagementLevel spent 25% more time and made 3x more purchases."


In [None]:
# Outlier detection (simple IQR method)
def _count_iqr_outliers(values):
    """Count entries lying more than 1.5 * IQR below Q1 or above Q3.

    Entries for which both comparisons are False are not counted.
    """
    first_quartile, third_quartile = values.quantile([0.25, 0.75])
    spread = third_quartile - first_quartile
    too_low = values < first_quartile - 1.5 * spread
    too_high = values > third_quartile + 1.5 * spread
    return int((too_low | too_high).sum())


num_cols = ['AvgSessionDurationMinutes','SessionsPerWeek','InGamePurchases']
# Columns absent from df are skipped rather than raising.
outlier_summary = {
    col: _count_iqr_outliers(df[col])
    for col in num_cols
    if col in df.columns
}

print('Potential outliers (IQR method):', outlier_summary)


In [None]:
# Monetization proxies: ARPU and ARPPU
# ARPU: average purchases per user
# ARPPU: average purchases per paying user (or high engagement proxy)
# NOTE(review): all three figures are means of the InGamePurchases column, not
# currency amounts — confirm the column's units against the dataset schema.

has_purchases = 'InGamePurchases' in df.columns

# Paying user proxy: InGamePurchases > 0
if has_purchases:
    arpu = df['InGamePurchases'].mean()
    paying = df[df['InGamePurchases'] > 0]
    arppu_paying = paying['InGamePurchases'].mean() if not paying.empty else np.nan
else:
    arpu = np.nan
    arppu_paying = np.nan

# High engagement proxy
# Needs BOTH columns: checking only EngagementLevel would raise KeyError on the
# purchases lookup when InGamePurchases is absent.
if has_purchases and 'EngagementLevel' in df.columns:
    high = df[df['EngagementLevel'].str.title() == 'High']
    arppu_high = high['InGamePurchases'].mean() if not high.empty else np.nan
else:
    arppu_high = np.nan

# Round for display; undefined metrics are reported as NaN.
print({
    'ARPU': round(arpu, 2) if pd.notna(arpu) else np.nan,
    'ARPPU_paying_users': round(arppu_paying, 2) if pd.notna(arppu_paying) else np.nan,
    'ARPPU_high_engagement': round(arppu_high, 2) if pd.notna(arppu_high) else np.nan
})
