# Librerías y datos del programa

In [63]:
# Python 3.12.10 - author: Carlos Brandon Cortes Cortina 
import kagglehub
import pandas as pd
import numpy as np 
import scipy.stats as stats
import os
import shutil
from sklearn.model_selection import train_test_split, KFold, GridSearchCV,cross_val_score
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from  xgboost import XGBRegressor

# Ajustes de la Data

In [64]:
# Descargar el dataset
pathd = kagglehub.dataset_download("jayaantanaath/student-habits-vs-academic-performance")
print("Path to dataset files:", pathd)

# Normalizar path
pathd = pathd.replace("\\", "/")

# Ruta destino
dest_folder = "C:/DB/Student-Habits-vs-Academic-Performance"
dest_file = os.path.join(dest_folder, "student_habits_performance.csv")

# Crear carpeta de destino si no existe
os.makedirs(dest_folder, exist_ok=True)

# Ruta del archivo descargado
source_file = os.path.join(pathd, "student_habits_performance.csv")

# Mover archivo si no existe ya en destino
if not os.path.exists(dest_file):
    shutil.move(source_file, dest_file)
    print("Archivo movido exitosamente.")
else:
    print("El archivo ya existe en la carpeta destino.")

Path to dataset files: C:\Users\brand\.cache\kagglehub\datasets\jayaantanaath\student-habits-vs-academic-performance\versions\1
El archivo ya existe en la carpeta destino.


In [65]:
data = pd.read_csv("C:/DB/Student-Habits-vs-Academic-Performance/student_habits_performance.csv")
data.head()

Unnamed: 0,student_id,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score
0,S1000,23,Female,0.0,1.2,1.1,No,85.0,8.0,Fair,6,Master,Average,8,Yes,56.2
1,S1001,20,Female,6.9,2.8,2.3,No,97.3,4.6,Good,6,High School,Average,8,No,100.0
2,S1002,21,Male,1.4,3.1,1.3,No,94.8,8.0,Poor,1,High School,Poor,1,No,34.3
3,S1003,23,Female,1.0,3.9,1.0,No,71.0,9.2,Poor,4,Master,Good,1,Yes,26.8
4,S1004,19,Female,5.0,4.4,0.5,No,90.9,4.9,Fair,3,Master,Good,1,No,66.4


In [66]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   student_id                     1000 non-null   object 
 1   age                            1000 non-null   int64  
 2   gender                         1000 non-null   object 
 3   study_hours_per_day            1000 non-null   float64
 4   social_media_hours             1000 non-null   float64
 5   netflix_hours                  1000 non-null   float64
 6   part_time_job                  1000 non-null   object 
 7   attendance_percentage          1000 non-null   float64
 8   sleep_hours                    1000 non-null   float64
 9   diet_quality                   1000 non-null   object 
 10  exercise_frequency             1000 non-null   int64  
 11  parental_education_level       909 non-null    object 
 12  internet_quality               1000 non-null   ob

# Selección de características e hiperparámetros

In [67]:
seed = 42
kf = KFold(n_splits = 10, shuffle = True, random_state = seed)
model = XGBRegressor(random_state = seed)

#Identificación de variables dependientes y la target
x_train = data.drop(columns=['exam_score','student_id'])
x_train = pd.get_dummies(x_train,drop_first = True, dtype = int)
y_train = data['exam_score']

#Partimos la data para predecir más adelante el 20% de los datos y comparar con los registros reales 
X_train,X_test,Y_train,Y_test = train_test_split(x_train,y_train,test_size = 0.2,random_state = seed)


In [68]:
#Selección de características
selector = RFECV(estimator=model, step=1, cv=kf, scoring = 'neg_mean_squared_error')
selector.fit(X_train,Y_train)
features = selector.get_feature_names_out()
print(f'Las variables elegidas son las siguientes:{features}')
x_train = x_train[features]

Las variables elegidas son las siguientes:['study_hours_per_day' 'social_media_hours' 'netflix_hours'
 'attendance_percentage' 'sleep_hours' 'exercise_frequency'
 'mental_health_rating' 'gender_Male' 'part_time_job_Yes'
 'parental_education_level_Master' 'internet_quality_Good'
 'internet_quality_Poor' 'extracurricular_participation_Yes']


In [69]:
# Una vez seleccionadas as características, ajustaremos el modelo con estas características para calcular los 
# hiperparámetros

search_space = {
    "n_estimators" : [100, 200, 500],
    "max_depth" : [3, 6, 9],
    "gamma" : [0.01, 0.1],
    "learning_rate" : [0.001, 0.01, 0.1, 1]
}

GS = GridSearchCV(estimator = model,
            param_grid = search_space,
            scoring = ["r2", "neg_root_mean_squared_error"], 
            refit = "r2",
            cv = kf,
            verbose = 4)
GS.fit(X_train,Y_train)

best_estimator = GS.best_estimator_
best_params = GS.best_params_
best_score = GS.best_score_
results = pd.DataFrame(GS.cv_results_)
results = results.sort_values('rank_test_r2')
results = results.to_csv()
print(f'Hiperparámetros:\n{best_params}\nbest_score:{best_score}')

Fitting 10 folds for each of 72 candidates, totalling 720 fits
[CV 1/10] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-16.043) r2: (test=0.130) total time=   0.0s
[CV 2/10] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-16.035) r2: (test=0.131) total time=   0.0s
[CV 3/10] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-15.130) r2: (test=0.133) total time=   0.0s
[CV 4/10] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-15.001) r2: (test=0.133) total time=   0.0s
[CV 5/10] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-17.675) r2: (test=0.118) total time=   0.0s
[CV 6/10] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-16.444) r2: (test=0.129) total ti

# Ajuste del modelo y análsis de las predicciones

In [70]:
# Ajustamos el modelo con los hiperparámetros y las característcias óptimas detectadas 
model_final = XGBRegressor(objective= 'reg:squarederror',gamma = 0.01, learning_rate = 0.1, max_depth = 3, n_estimators = 200,random_state = seed)

In [71]:
model_final.fit(X_train,Y_train)

In [72]:
scores = cross_val_score(estimator = model_final, X = X_train, y = Y_train, cv = kf)
np.mean(scores)

0.8790058959592757

In [73]:
predicts = model_final.predict(X_test)
mae = mean_absolute_error(Y_test,predicts)
rmse = np.sqrt(mean_squared_error(Y_test,predicts))
r2 = r2_score(Y_test,predicts)
print(f'mae:{mae}\nrmse:{rmse}\nr2:{r2}')

mae:4.5661974601745605
rmse:5.570508455392681
r2:0.8789896897986905


##### Conclusión: Observamos que las métricas de rendimiento reflejan que el modelo predice con bastante exactitud el desempeño de los alumnos basándase en los hábitos encontrados: 
##### ['study_hours_per_day' 'social_media_hours' 'netflix_hours''attendance_percentage' 'sleep_hours' 'exercise_frequency''mental_health_rating' 'gender_Male' 'part_time_job_Yes''parental_education_level_Master' 'internet_quality_Good''internet_quality_Poor' 'extracurricular_participation_Yes']