In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import os

# 1.0 ETL (Extract, Transform, Load)
The data will be cleaned and prepared for statistical analysis, visualisation, and predictive modelling (linear regression).

## 1.1 Load the data

In [3]:
# Load the raw dataset into `df`.
# The path is relative to the notebook's working directory.
data_path = '../data/raw_data/student_social_media_addiction.csv'

try:
    df = pd.read_csv(data_path)
    print("✅ Dataset loaded successfully!")
except FileNotFoundError:
    # Report the path that was actually tried so the hint matches this notebook,
    # then re-raise so a "Run All" stops here instead of failing later on a missing `df`.
    print(f"❌ Dataset not found at '{data_path}'. Please download student_social_media_addiction.csv from Kaggle and place it at that path.")
    raise

✅ Dataset loaded successfully!


## 1.2 Inspect the dataset
Before proceeding with further analysis, begin by inspecting the structure and basic properties of the dataset.

In [None]:
# Peek at the first five records of the loaded dataset
df.head(n=5)

Unnamed: 0,Student_ID,Age,Gender,Academic_Level,Country,Avg_Daily_Usage_Hours,Most_Used_Platform,Affects_Academic_Performance,Sleep_Hours_Per_Night,Mental_Health_Score,Relationship_Status,Conflicts_Over_Social_Media,Addicted_Score
0,1,19,Female,Undergraduate,Bangladesh,5.2,Instagram,Yes,6.5,6,In Relationship,3,8
1,2,22,Male,Graduate,India,2.1,Twitter,No,7.5,8,Single,0,3
2,3,20,Female,Undergraduate,USA,6.0,TikTok,Yes,5.0,5,Complicated,4,9
3,4,18,Male,High School,UK,3.0,YouTube,No,7.0,7,Single,1,4
4,5,21,Male,Graduate,Canada,4.5,Facebook,Yes,6.0,6,In Relationship,2,7


In [8]:
# Show the DataFrame's column labels (displayed as a pandas Index)
df.columns

Index(['Student_ID', 'Age', 'Gender', 'Academic_Level', 'Country',
       'Avg_Daily_Usage_Hours', 'Most_Used_Platform',
       'Affects_Academic_Performance', 'Sleep_Hours_Per_Night',
       'Mental_Health_Score', 'Relationship_Status',
       'Conflicts_Over_Social_Media', 'Addicted_Score'],
      dtype='object')

In [None]:
# Dataset dimensions as a (rows, columns) tuple
df.shape

(705, 13)

In [7]:
# Print a per-column summary: column name, non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 705 entries, 0 to 704
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Student_ID                    705 non-null    int64  
 1   Age                           705 non-null    int64  
 2   Gender                        705 non-null    object 
 3   Academic_Level                705 non-null    object 
 4   Country                       705 non-null    object 
 5   Avg_Daily_Usage_Hours         705 non-null    float64
 6   Most_Used_Platform            705 non-null    object 
 7   Affects_Academic_Performance  705 non-null    object 
 8   Sleep_Hours_Per_Night         705 non-null    float64
 9   Mental_Health_Score           705 non-null    int64  
 10  Relationship_Status           705 non-null    object 
 11  Conflicts_Over_Social_Media   705 non-null    int64  
 12  Addicted_Score                705 non-null    int64  
dtypes: fl

In [None]:
# Count how many columns there are of each dtype
df.dtypes.value_counts()

object     6
int64      5
float64    2
Name: count, dtype: int64

We will rely heavily on statistical analysis of the dataset to spot patterns in the historical data, in particular relationships between potential causes and outcomes.

From initial inspection there appear to be signs of outliers; these will be confirmed with box-and-whisker plots and histograms.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Student_ID,Age,Avg_Daily_Usage_Hours,Sleep_Hours_Per_Night,Mental_Health_Score,Conflicts_Over_Social_Media,Addicted_Score
count,705.0,705.0,705.0,705.0,705.0,705.0,705.0
mean,353.0,20.659574,4.918723,6.868936,6.22695,2.849645,6.436879
std,203.660256,1.399217,1.257395,1.126848,1.105055,0.957968,1.587165
min,1.0,18.0,1.5,3.8,4.0,0.0,2.0
25%,177.0,19.0,4.1,6.0,5.0,2.0,5.0
50%,353.0,21.0,4.8,6.9,6.0,3.0,7.0
75%,529.0,22.0,5.8,7.7,7.0,4.0,8.0
max,705.0,24.0,8.5,9.6,9.0,5.0,9.0


In [None]:
# Numeric feature columns to analyse.
# Student_ID is deliberately left out: although it is numeric, it is only an identifier.
numeric = [
    'Age',
    'Avg_Daily_Usage_Hours',
    'Sleep_Hours_Per_Night',
    'Mental_Health_Score',
    'Conflicts_Over_Social_Media',
    'Addicted_Score',
]

In [23]:
# Summary statistics restricted to the columns listed in `numeric`
df.loc[:, numeric].describe()

Unnamed: 0,Age,Avg_Daily_Usage_Hours,Sleep_Hours_Per_Night,Mental_Health_Score,Conflicts_Over_Social_Media,Addicted_Score
count,705.0,705.0,705.0,705.0,705.0,705.0
mean,20.659574,4.918723,6.868936,6.22695,2.849645,6.436879
std,1.399217,1.257395,1.126848,1.105055,0.957968,1.587165
min,18.0,1.5,3.8,4.0,0.0,2.0
25%,19.0,4.1,6.0,5.0,2.0,5.0
50%,21.0,4.8,6.9,6.0,3.0,7.0
75%,22.0,5.8,7.7,7.0,4.0,8.0
max,24.0,8.5,9.6,9.0,5.0,9.0
