## Get dataset

In [1]:
import os
import sys
import pandas as pd
import torch

# Get the project root
project_root = os.path.abspath("..")    # Go up one level from "notebooks/"
sys.path.append(project_root)           # Add the root to the path

from src.models.train_model import RegressionModel, RMSELoss

In [2]:
dataset_path = os.path.join(os.getcwd(), '../', 'data', 'processed', 'processed_data_cleaned.csv')
df = pd.read_csv(dataset_path)

## Main

In [3]:
selected_features = ['escuela', 'materias_reprobadas', 'apoyo_familiar', 'apoyos_economicos', 'ejercer_carrera']
X = df[selected_features]
y = df['promedio_global']

X.describe()

Unnamed: 0,escuela,materias_reprobadas,apoyo_familiar,apoyos_economicos,ejercer_carrera
count,1770.0,1770.0,1770.0,1770.0,1770.0
mean,3.213559,1.695537,4.250847,0.688701,1.847458
std,2.362233,1.817031,1.046306,0.463156,0.413736
min,0.0,0.0,1.0,0.0,0.0
25%,1.0,0.0,4.0,0.0,2.0
50%,3.0,1.0,5.0,1.0,2.0
75%,5.0,3.0,5.0,1.0,2.0
max,8.0,7.0,5.0,1.0,2.0


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Transform to tensors
X_train = torch.tensor(X_train.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)

X_val = torch.tensor(X_val.values, dtype=torch.float32)
y_val = torch.tensor(y_val.values, dtype=torch.float32).view(-1, 1)

X_test = torch.tensor(X_test.values, dtype=torch.float32)
y_test = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

In [5]:
input_size = X_train.shape[1]  # Number of features
model = RegressionModel(input_size)

In [6]:
from torch import optim

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = RMSELoss()

In [7]:
epochs = 3000

best_model = 0
best_model_path = os.path.join(os.getcwd(), '../', 'src', 'models', 'avg_without_outliers.pth')

for epoch in range(epochs):
    model.train()
    outputs = model(X_train)
    train_loss = criterion(outputs, y_train)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val)
        val_loss = criterion(val_outputs, y_val)
        if epoch == 1:
            best_model = val_loss.item()
        else:
            if val_loss.item() < best_model:
                torch.save(model.state_dict(), best_model_path)
                best_model = val_loss.item()
                print(f"[+] Model saved in epoch {epoch} with loss: {best_model:.5f}")


[+] Model saved in epoch 2 with loss: 8.14501
[+] Model saved in epoch 3 with loss: 8.10865
[+] Model saved in epoch 4 with loss: 8.07309
[+] Model saved in epoch 5 with loss: 8.03752
[+] Model saved in epoch 6 with loss: 8.00211
[+] Model saved in epoch 7 with loss: 7.96725
[+] Model saved in epoch 8 with loss: 7.93266
[+] Model saved in epoch 9 with loss: 7.89879
[+] Model saved in epoch 10 with loss: 7.86602
[+] Model saved in epoch 11 with loss: 7.83427
[+] Model saved in epoch 12 with loss: 7.80317
[+] Model saved in epoch 13 with loss: 7.77214
[+] Model saved in epoch 14 with loss: 7.74140
[+] Model saved in epoch 15 with loss: 7.71016
[+] Model saved in epoch 16 with loss: 7.67797
[+] Model saved in epoch 17 with loss: 7.64462
[+] Model saved in epoch 18 with loss: 7.60988
[+] Model saved in epoch 19 with loss: 7.57378
[+] Model saved in epoch 20 with loss: 7.53630
[+] Model saved in epoch 21 with loss: 7.49744
[+] Model saved in epoch 22 with loss: 7.45704
[+] Model saved in ep

## Export test data

In [8]:
X_test_df = pd.DataFrame(X_test.numpy(), columns=selected_features)
y_test_df = pd.DataFrame(y_test.numpy(), columns=['promedio_global'])

X_test_path = os.path.join(os.getcwd(), '../', 'data', 'processed', 'X_test_wihtout_outliers.csv')
y_test_path = os.path.join(os.getcwd(), '../', 'data', 'processed', 'y_test_wihtout_outliers.csv')

X_test_df.to_csv(X_test_path, index=False)
y_test_df.to_csv(y_test_path, index=False)