In [57]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("jayaantanaath/student-habits-vs-academic-performance")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/student-habits-vs-academic-performance


In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import statsmodels

In [59]:
df = pd.read_csv(path + "/student_habits_performance.csv")

In [60]:
for column in df.select_dtypes(object):
  if column == 'student_id':
    continue
  print(f"{column} | {df[column].unique()}")

gender | ['Female' 'Male' 'Other']
part_time_job | ['No' 'Yes']
diet_quality | ['Fair' 'Good' 'Poor']
parental_education_level | ['Master' 'High School' 'Bachelor' nan]
internet_quality | ['Average' 'Poor' 'Good']
extracurricular_participation | ['Yes' 'No']


In [61]:
df['part_time_job'] = df['part_time_job'].map({
    'No': 0,
    'Yes': 1
})

df['diet_quality'] = df['diet_quality'].map({
    'Poor': 0,
    'Fair': 1,
    'Good': 2
})

df['internet_quality'] = df['internet_quality'].map({
    'Poor': 0,
    'Average': 1,
    'Good': 2
})

df['extracurricular_participation'] = df['extracurricular_participation'].map({
    'No': 0,
    'Yes': 1,
})

df['parental_education_level'] = df['parental_education_level'].map({
    np.nan: 0,
    'High School': 1,
    'Bachelor': 2,
    'Master': 3,
})

In [62]:
df = df.drop('student_id', axis=1)

In [63]:
features = [
  'gender',
  'study_hours_per_day',
  'social_media_hours',
  'netflix_hours',
  'part_time_job',
  'attendance_percentage',
  'sleep_hours',
  'diet_quality',
  'exercise_frequency',
  'parental_education_level',
  'internet_quality',
  'mental_health_rating',
  'extracurricular_participation',
]

In [64]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

categorical_features = ['gender']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features)
    ],
    remainder='passthrough'  # Keep other columns (like 'Size') as-is
)

In [65]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

X = df[features]
y = df['exam_score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


model.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [66]:
from sklearn.metrics import r2_score, mean_squared_error

y_true = y_test
y_pred = model.predict(X_test)

r2_score(y_true, y_pred), mean_squared_error(y_true, y_pred)

(0.8756548101605922, 30.555494081262577)