In [3]:
import pandas as pd

def _features_and_target(frame):
    """Split a raw frame into (features, target).

    Features are every column except 'GPA' and 'StudentID'; the target is the
    'GPA' column.
    """
    return frame.drop(columns=['GPA', 'StudentID']), frame['GPA']


# Read both splits from CSV files in the current working directory.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Apply the same feature/target separation to each split.
X_train, y_train = _features_and_target(train_data)
X_test, y_test = _features_and_target(test_data)


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Column groups routed through the ColumnTransformer below. Columns not listed
# in either group are dropped (ColumnTransformer's default remainder='drop').
numerical_cols = ['Age', 'StudyTimeWeekly', 'Absences']
categorical_cols = ['Gender', 'Ethnicity', 'ParentalEducation', 'Tutoring', 'ParentalSupport', 'Extracurricular', 'Sports', 'Music', 'Volunteering']

# Numeric features: fill missing values with the median, then standardize.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit encode as all zeros
# instead of raising (handle_unknown='ignore').
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    # Always return a dense array. OneHotEncoder emits sparse output, and the
    # result is fed straight to the Keras model in later cells.
    sparse_threshold=0,
)

# Split BEFORE fitting the preprocessor. Fitting on the full training frame
# would let the validation rows influence the imputation values, scaling
# statistics and learned categories, which makes the validation loss optimistic.
# The row assignment depends only on the number of rows and random_state, so
# the same rows land in the validation split as when splitting processed data.
X_train_raw, X_val_raw, y_train_final, y_val_final = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Fit on the training portion only; validation and test are transform-only.
X_train_final = preprocessor.fit_transform(X_train_raw)
X_val_final = preprocessor.transform(X_val_raw)
X_test_processed = preprocessor.transform(X_test)

# Full training frame through the (train-split-fitted) preprocessor; kept so
# any code that still refers to X_train_processed continues to work.
X_train_processed = preprocessor.transform(X_train)

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialization (and therefore the training curves and final score) is
# repeatable across Restart & Run All. Bit-exact results on GPU are not
# guaranteed by seeding alone.
tf.keras.utils.set_random_seed(42)

# Small fully-connected regressor: two ReLU hidden layers and a single linear
# output unit. The input width is taken from the preprocessed feature matrix.
model = tf.keras.Sequential([
    # tf.keras.Input(shape=...) replaces InputLayer(input_shape=...), whose
    # `input_shape` argument is deprecated in Keras 3.
    tf.keras.Input(shape=(X_train_final.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)  # Output layer for regression
])
model.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
# Train for 50 epochs, evaluating on the held-out validation split after each
# epoch; the returned History object records the per-epoch losses.
history = model.fit(
    x=X_train_final,
    y=y_train_final,
    epochs=50,
    validation_data=(X_val_final, y_val_final),
)

In [None]:
import matplotlib.pyplot as plt

# Plot the per-epoch training and validation loss recorded by model.fit.
fig, ax = plt.subplots()
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    ax.plot(history.history[history_key], label=curve_label)
ax.set_title('Training and Validation Loss over Epochs')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

In [None]:
from sklearn.metrics import r2_score

# Predict GPA for the preprocessed test features and score against the true
# test targets with the coefficient of determination.
y_pred = model.predict(X_test_processed)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f'R2 score on test set: {r2}')