In [16]:
# Standard library
import os
from pathlib import Path

# Data handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Pandas display options: show every column and format floats with two decimals.
pd.options.display.max_columns = None
pd.options.display.float_format = "{:.2f}".format

# White-grid seaborn styling for the plots (cosmetic only).
sns.set(style="whitegrid")


In [17]:
# machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix
)

In [18]:
# Load dataset
# The project root defaults to the original location but can be redirected with the
# GAMING_HABITS_DIR environment variable, so the notebook also runs on other machines.
PROJECT_DIR = Path(os.environ.get("GAMING_HABITS_DIR", "/Users/DadaOlayemi/Gaming_Habits"))
df = pd.read_csv(PROJECT_DIR / "data" / "Gaming_Habits.csv")

# Preview the first five rows.
df.head()

Unnamed: 0,User_ID,Age,Gender,Occupation,Game_Type,Daily_Gaming_Hours,Weekly_Gaming_Hours,Primary_Gaming_Time,Sleep_Hours,Stress_Level,Focus_Level,Academic_or_Work_Score,Productivity_Level,Performance_Impact
0,U0001,21,Male,Working Professional,Action,4.0,28.0,Morning,4.6,6,4,69,66,Negative
1,U0002,35,Female,Student,Sports,1.0,7.0,Night,5.4,2,7,67,72,Neutral
2,U0003,26,Male,Student,Puzzle,2.0,14.0,Morning,8.0,4,8,82,82,Positive
3,U0004,32,Male,Working Professional,Action,1.0,7.0,Night,4.9,7,7,71,66,Neutral
4,U0005,19,Male,Working Professional,Action,2.1,14.7,Morning,7.0,7,7,67,63,Neutral


In [19]:
# Quick sanity check: dimensions, dtypes / non-null counts, and column names.
# Only the last expression of a cell is displayed automatically, so the shape
# is printed explicitly; a bare `df.shape` in the middle of the cell is discarded.
print(df.shape)
df.info()  # prints its report directly
df.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   User_ID                 1000 non-null   object 
 1   Age                     1000 non-null   int64  
 2   Gender                  1000 non-null   object 
 3   Occupation              1000 non-null   object 
 4   Game_Type               1000 non-null   object 
 5   Daily_Gaming_Hours      1000 non-null   float64
 6   Weekly_Gaming_Hours     1000 non-null   float64
 7   Primary_Gaming_Time     1000 non-null   object 
 8   Sleep_Hours             1000 non-null   float64
 9   Stress_Level            1000 non-null   int64  
 10  Focus_Level             1000 non-null   int64  
 11  Academic_or_Work_Score  1000 non-null   int64  
 12  Productivity_Level      1000 non-null   int64  
 13  Performance_Impact      1000 non-null   object 
dtypes: float64(3), int64(5), object(6)
memory

Index(['User_ID', 'Age', 'Gender', 'Occupation', 'Game_Type',
       'Daily_Gaming_Hours', 'Weekly_Gaming_Hours', 'Primary_Gaming_Time',
       'Sleep_Hours', 'Stress_Level', 'Focus_Level', 'Academic_or_Work_Score',
       'Productivity_Level', 'Performance_Impact'],
      dtype='object')

In [20]:
# Count the missing entries in each column.
missing_values = df.isna().sum()

# Report only the columns that contain at least one missing entry.
print("Columns with missing values")
print(missing_values.loc[missing_values.gt(0)])

Columns with missing values
Series([], dtype: int64)


# General overview
The dataset contains 1,000 observations and 14 variables, which is a solid sample size for exploratory analysis and basic machine learning.

No column contains missing values, so the data is already complete and no imputation or row removal is needed.

In [21]:
# List the column names (the same Index already shown by the sanity-check cell).
df.columns

Index(['User_ID', 'Age', 'Gender', 'Occupation', 'Game_Type',
       'Daily_Gaming_Hours', 'Weekly_Gaming_Hours', 'Primary_Gaming_Time',
       'Sleep_Hours', 'Stress_Level', 'Focus_Level', 'Academic_or_Work_Score',
       'Productivity_Level', 'Performance_Impact'],
      dtype='object')

In [22]:
# Basic descriptive statistics for the numeric columns:
# count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,Age,Daily_Gaming_Hours,Weekly_Gaming_Hours,Sleep_Hours,Stress_Level,Focus_Level,Academic_or_Work_Score,Productivity_Level
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,26.6,3.32,23.24,6.48,5.45,6.07,75.08,75.28
std,5.21,1.61,11.29,1.16,2.28,2.01,11.67,12.06
min,18.0,0.5,3.5,4.5,2.0,3.0,55.0,50.0
25%,22.0,2.0,14.0,5.47,3.0,4.0,65.0,65.0
50%,27.0,3.3,23.1,6.5,5.0,6.0,75.0,75.0
75%,31.0,4.8,33.6,7.5,7.0,8.0,85.0,86.0
max,35.0,6.0,42.0,8.5,9.0,9.0,95.0,100.0


In [23]:
# Summary of the object-dtype (categorical) columns:
# count, number of unique values, most frequent value and its frequency.
df.describe(include="object")

Unnamed: 0,User_ID,Gender,Occupation,Game_Type,Primary_Gaming_Time,Performance_Impact
count,1000,1000,1000,1000,1000,1000
unique,1000,2,2,6,3,3
top,U0001,Female,Student,Simulation,Morning,Neutral
freq,1,513,524,192,341,762


In [27]:
# Drop the row identifier: User_ID is unique for every row, so it carries no
# information for analysis or modelling.
df_processed = df.drop(columns=["User_ID"])

# Resolve the output file under the project root. The root defaults to the original
# location and can be redirected with the GAMING_HABITS_DIR environment variable.
output_root = Path(os.environ.get("GAMING_HABITS_DIR", "/Users/DadaOlayemi/Gaming_Habits"))
processed_csv = output_root / "processed" / "Gaming_Habits_processed.csv"

# to_csv does not create missing folders, so make sure the target directory exists.
processed_csv.parent.mkdir(parents=True, exist_ok=True)
df_processed.to_csv(processed_csv, index=False)

print("Processed dataset saved successfully!")

Processed dataset saved successfully!
