In [1]:
import altair as alt
import pandas as pd

# Load the full dataset, then keep an independent copy of the rows whose
# Parental_Involvement is "High"; the .copy() lets columns be added to jane_df
# without touching (or warning about) the original frame.
student_performance = pd.read_csv("../data/raw/StudentPerformanceFactors.csv")

jane_df = student_performance.loc[
    lambda frame: frame["Parental_Involvement"].eq("High")
].copy()

# Bucket sleep into an ordered categorical so the comparison (> 8 vs < 6 hours)
# is explicit. The boundaries are written out as masks so that every label is
# literally true: pd.cut's default right-closed bins would put a student who
# sleeps exactly 6 hours into "< 6 hours".
#   "< 6 hours" : Sleep_Hours < 6
#   "6-8 hours" : 6 <= Sleep_Hours <= 8
#   "> 8 hours" : Sleep_Hours > 8
# Missing Sleep_Hours values match no mask and stay NaN.
sleep_hours = jane_df["Sleep_Hours"]
sleep_labels = ["< 6 hours", "6-8 hours", "> 8 hours"]

sleep_bucket = pd.Series(index=jane_df.index, dtype="object")  # starts all-NaN
sleep_bucket[sleep_hours < 6] = sleep_labels[0]
sleep_bucket[sleep_hours.between(6, 8)] = sleep_labels[1]  # inclusive on both ends
sleep_bucket[sleep_hours > 8] = sleep_labels[2]

# Ordered categorical with the same categories pd.cut produced.
jane_df["sleep_category"] = pd.Categorical(
    sleep_bucket, categories=sleep_labels, ordered=True
)

In [2]:
# Scatter of exam score against hours of sleep for jane_df, coloured by
# resource access, with one fitted trend line per resource-access level.

# Base chart: encodings shared by the point layer and the trend-line layer.
base = alt.Chart(jane_df).encode(
    x=alt.X('Sleep_Hours',
            title='Hours of Sleep',
            scale=alt.Scale(domain=[3, 10])),
    y=alt.Y('Exam_Score',
            title='Exam Score',
            scale=alt.Scale(domain=[50, 100])),
    # Explicit sort so the legend reads Low -> Medium -> High, matching the
    # facet order used in chart2; left unsorted, the levels fall back to the
    # default (alphabetical) order, which puts High first.
    color=alt.Color('Access_to_Resources',
                    title='Resource Access',
                    sort=['Low', 'Medium', 'High'])
)

# Scatter points (semi-transparent so overlapping students remain visible)
points = base.mark_circle(size=60, opacity=0.6)

# Trend lines: one regression of Exam_Score on Sleep_Hours per resource level;
# the groupby field is carried through, so each line picks up its group colour.
lines = base.transform_regression(
    'Sleep_Hours', 'Exam_Score', groupby=['Access_to_Resources']
).mark_line(size=3)

# Layer the points and trend lines into a single chart.
chart1 = (points + lines).properties(
    title='Sleep Duration vs. Exam Score (High Parental Involvement)',
    width=400,
    height=300
)

chart1

In [3]:
# Mean exam score per sleep category, drawn as one bar panel per
# resource-access level (panels ordered Low -> Medium -> High).
chart2 = alt.Chart(jane_df).mark_bar().encode(
    # The sort entries must match the sleep_category labels character for
    # character ('> 8 hours' with a space); an entry that matches no value
    # does not pin that category's position.
    x=alt.X('sleep_category',
            title='Sleep Category',
            sort=['< 6 hours', '6-8 hours', '> 8 hours']),
    y=alt.Y('mean(Exam_Score)', title='Average Exam Score'),
    # Colour repeats the x encoding purely for visual pop, so the legend is hidden.
    color=alt.Color('sleep_category', legend=None),
    column=alt.Column('Access_to_Resources',
                      title='Access to Resources',
                      sort=["Low", "Medium", "High"],
                      header=alt.Header(titleOrient="top"))
).properties(
    title='Average Scores by Sleep Duration across Resource Levels',
    width=150,   # width of each facet panel
    height=300
)

chart2