In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

# Location of the prepared ML dataset (a relative path, so it is resolved
# against the notebook's working directory).
DATA_PATH = Path("BF_ML_ready.csv")

# Columns that must be present in the loaded file; verified right after the load.
required_cols = [
    "Age",
    "Class",
    "Sex_binary",
    "Has_phone",
    "Phone_total_months",
    "PhoneUse_Score",
    "Attention_Score",
    "Happiness_Score",
]

# Read the CSV and report what was found before any cleaning is applied.
df = pd.read_csv(DATA_PATH)
print("Raw shape:", df.shape)
print("Columns:", list(df.columns))

# Fail fast, naming every absent column at once, rather than failing on the
# first column lookup further down.
missing_required = [
    column_name
    for column_name in required_cols
    if column_name not in df.columns
]
if missing_required:
    raise KeyError(f"Your BF_ML_ready.csv is missing these columns: {missing_required}")

# Remove rows that are identical in every column and renumber the index from 0.
df = df.drop_duplicates(ignore_index=True)

# Preview the first rows (kept as the last expression so the notebook renders it).
df.head()


Raw shape: (672, 16)
Columns: ['ID', 'Age', 'Class', 'Sex_binary', 'Attention_Score', 'Happiness_Score', 'PhoneUse_Score', 'T13', 'T14', 'school every day', 'share phone family', 'electricity at home', 'parents rules phone', 'Phone purpose', 'Phone_total_months', 'Has_phone']


Unnamed: 0,ID,Age,Class,Sex_binary,Attention_Score,Happiness_Score,PhoneUse_Score,T13,T14,school every day,share phone family,electricity at home,parents rules phone,Phone purpose,Phone_total_months,Has_phone
0,1,15,10,1,2.266667,3.166667,4.583333,0.0,4.0,0,0,1,0,Communication,4.0,1.0
1,2,14,10,0,3.0,3.583333,4.25,1.0,0.0,0,0,0,0,"School, Games",12.0,1.0
2,3,14,10,1,1.333333,3.0,1.083333,0.0,0.0,0,0,0,0,Communication,0.0,0.0
3,4,15,10,1,2.0,2.916667,3.916667,1.0,0.0,0,1,0,0,School,12.0,1.0
4,5,16,10,0,2.066667,2.666667,3.833333,16.0,0.0,0,0,1,0,Communication,192.0,1.0


In [3]:
def _print_section(heading, column, counts=False):
    """Print one descriptive section for a column of the global ``df``.

    Emits ``heading`` on its own line, then the column's summary followed by
    an empty separator line.

    Args:
        heading: Text printed above the summary.
        column: Name of the ``df`` column to summarise.
        counts: If true, print ``value_counts(dropna=False)`` so missing
            values get their own row; otherwise print ``describe()``.
    """
    print(heading)
    values = df[column]
    summary = values.value_counts(dropna=False) if counts else values.describe()
    # The extra "\n" argument leaves an empty line after each section.
    print(summary, "\n")


# Banner and number of rows in the frame.
print("=== Descriptives ===")
print("N =", len(df))
print()

# One section per variable, in the order they should appear in the output.
_print_section("Age (years):", "Age")
_print_section("Sex_binary (1=Female,0=Male) counts:", "Sex_binary", counts=True)
_print_section("Has_phone (1=yes,0=no) counts:", "Has_phone", counts=True)
_print_section("Phone_total_months (exposure duration):", "Phone_total_months")
_print_section("Attention_Score (1-4, higher = better attention):", "Attention_Score")
_print_section("Happiness_Score (1-4, higher = more positive mood):", "Happiness_Score")
_print_section("PhoneUse_Score (1-7, higher = more/intense use):", "PhoneUse_Score")

# Distinct non-missing class values, in ascending order.
class_values = df["Class"].dropna().unique().tolist()
class_values.sort()
print("Unique classes (grades):", class_values)


=== Descriptives ===
N = 672

Age (years):
count    672.000000
mean      15.135417
std        2.467275
min       10.000000
25%       13.000000
50%       15.000000
75%       17.000000
max       23.000000
Name: Age, dtype: float64 

Sex_binary (1=Female,0=Male) counts:
Sex_binary
1    401
0    271
Name: count, dtype: int64 

Has_phone (1=yes,0=no) counts:
Has_phone
1.0    400
0.0    272
Name: count, dtype: int64 

Phone_total_months (exposure duration):
count    666.000000
mean      24.993994
std       42.552647
min        0.000000
25%        0.000000
50%        4.000000
75%       32.750000
max      192.000000
Name: Phone_total_months, dtype: float64 

Attention_Score (1-4, higher = better attention):
count    672.000000
mean       2.423710
std        0.521402
min        1.066667
25%        2.066667
50%        2.400000
75%        2.666667
max        4.000000
Name: Attention_Score, dtype: float64 

Happiness_Score (1-4, higher = more positive mood):
count    672.000000
mean       2.776662