In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the raw dataset from the local data folder
df = pd.read_csv(filepath_or_buffer="data/online_gaming_behavior_dataset.csv")

# Preview the first five rows to confirm the data has loaded
df.head(n=5)

Unnamed: 0,PlayerID,Age,Gender,Location,GameGenre,PlayTimeHours,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel
0,9000,43,Male,Other,Strategy,16.271119,0,Medium,6,108,79,25,Medium
1,9001,29,Female,USA,Strategy,5.525961,0,Medium,5,144,11,10,Medium
2,9002,22,Female,USA,Sports,8.223755,0,Easy,16,142,35,41,High
3,9003,35,Male,USA,Action,5.265351,1,Easy,9,85,57,47,Medium
4,9004,33,Male,Europe,Action,15.531945,0,Medium,2,131,95,37,Medium


In [3]:
# Feature engineering

# AvgPlayTimePerWeek - weekly play time in minutes, computed as the average
# session duration (minutes) multiplied by the number of sessions per week.
df["AvgPlayTimePerWeek"] = df["AvgSessionDurationMinutes"].mul(df["SessionsPerWeek"])

# isAddicted - boolean flag that is True when the weekly play time is strictly
# above 20 hours (20 * 60 = 1200 minutes).
df["isAddicted"] = df["AvgPlayTimePerWeek"].gt(1200)

# Show the two input columns next to the two derived columns for the first ten rows.
engineered_preview = ["AvgSessionDurationMinutes", "SessionsPerWeek", "AvgPlayTimePerWeek", "isAddicted"]
print(df.loc[:, engineered_preview].head(10))

   AvgSessionDurationMinutes  SessionsPerWeek  AvgPlayTimePerWeek  isAddicted
0                        108                6                 648       False
1                        144                5                 720       False
2                        142               16                2272        True
3                         85                9                 765       False
4                        131                2                 262       False
5                         81                2                 162       False
6                         50                1                  50       False
7                         48               10                 480       False
8                        101                5                 505       False
9                         95               13                1235        True


In [4]:
# Preview the first five rows of the frame, now including the engineered columns
df.head(n=5)

Unnamed: 0,PlayerID,Age,Gender,Location,GameGenre,PlayTimeHours,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel,AvgPlayTimePerWeek,isAddicted
0,9000,43,Male,Other,Strategy,16.271119,0,Medium,6,108,79,25,Medium,648,False
1,9001,29,Female,USA,Strategy,5.525961,0,Medium,5,144,11,10,Medium,720,False
2,9002,22,Female,USA,Sports,8.223755,0,Easy,16,142,35,41,High,2272,True
3,9003,35,Male,USA,Action,5.265351,1,Easy,9,85,57,47,Medium,765,False
4,9004,33,Male,Europe,Action,15.531945,0,Medium,2,131,95,37,Medium,262,False


In [5]:
# Encode categorical columns

# Label -> integer code for every column that is encoded by direct lookup.
#   * Gender and isAddicted have two labels each, so they become 0/1.
#   * GameDifficulty and EngagementLevel have a natural order between labels,
#     so their codes increase along that order (ordinal encoding).
label_codes = {
    "Gender": {"Male": 0, "Female": 1},
    "isAddicted": {True: 1, False: 0},
    "GameDifficulty": {"Easy": 0, "Medium": 1, "Hard": 2},
    "EngagementLevel": {"Low": 0, "Medium": 1, "High": 2},
}

# Series.map replaces every value that is missing from the dict with NaN, and
# it does so silently. Check all columns *before* changing any of them, so an
# unexpected label (or an accidental second run of this cell on data that is
# already encoded) stops with a clear error instead of corrupting a column.
# Values that are already missing are skipped; they stay NaN exactly as before.
for column, codes in label_codes.items():
    unexpected = set(df[column].dropna().unique()) - set(codes)
    if unexpected:
        raise ValueError(
            f"Column {column!r} contains labels with no encoding: {sorted(unexpected, key=str)}"
        )

for column, codes in label_codes.items():
    df[column] = df[column].map(codes)

# One-hot encoding for columns whose labels have no ordering between them.
# drop_first=True leaves out the first category of each column, which then acts
# as the baseline (all of that column's indicator columns are False for it).
df = pd.get_dummies(df, columns=["Location", "GameGenre"], drop_first=True, prefix=["Location", "Genre"])

# PlayTimeHours is dropped because it is not consistent with AvgPlayTimePerWeek
# or AvgSessionDurationMinutes, so it is unclear what exactly it measures.
df = df.drop(columns=["PlayTimeHours"])

In [6]:
# Preview the first five rows of the frame after encoding
df.head(n=5)

Unnamed: 0,PlayerID,Age,Gender,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel,AvgPlayTimePerWeek,isAddicted,Location_Europe,Location_Other,Location_USA,Genre_RPG,Genre_Simulation,Genre_Sports,Genre_Strategy
0,9000,43,0,0,1,6,108,79,25,1,648,0,False,True,False,False,False,False,True
1,9001,29,1,0,1,5,144,11,10,1,720,0,False,False,True,False,False,False,True
2,9002,22,1,0,0,16,142,35,41,2,2272,1,False,False,True,False,False,True,False
3,9003,35,0,1,0,9,85,57,47,1,765,0,False,False,True,False,False,False,False
4,9004,33,0,0,1,2,131,95,37,1,262,0,True,False,False,False,False,False,False


In [7]:
# Write the preprocessed frame to CSV; index=False keeps the row index out of the file
df.to_csv(path_or_buf="data/online_gaming_behavior_dataset_preprocessed.csv", index=False)