In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import pickle

In [2]:
# Read the student records from CSV into a DataFrame.
csv_path = 'student_data_with_gpa.csv'
data = pd.read_csv(csv_path)

In [3]:
# Separate the regression target (the GPA column) from the feature columns.
y = data['GPA']
X = data.drop(columns='GPA')

In [4]:
# Define preprocessing pipeline
numeric_features = ['Age', 'CognitiveScore', 'LearningStrategyScore']
categorical_features = ['Gender', 'Discipline']

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [5]:
# Preprocess data
X_preprocessed = preprocessor.fit_transform(X)

In [6]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

In [24]:
# Train a random forest of depth-1 trees on the preprocessed training split.
model = RandomForestRegressor(
    n_estimators=410,   # number of trees in the forest
    max_depth=1,        # each tree is limited to a single split
    random_state=42,    # fixed seed (same value as the train/test split) so refits are reproducible
)
model.fit(X_train, y_train)

In [25]:
# Score the fitted model on the held-out split using mean absolute error.
test_predictions = model.predict(X_test)
absolute_errors = np.abs(test_predictions - y_test)
mae = np.mean(absolute_errors)
print(f'Test MAE: {mae:.2f}')

Test MAE: 0.50


In [9]:
# Persist the fitted model and preprocessor so the Streamlit app can load them.
artifacts = {
    'student_gpa_model_rf.pkl': model,
    'preprocessor.pkl': preprocessor,
}
for filename, artifact in artifacts.items():
    with open(filename, 'wb') as out_file:
        pickle.dump(artifact, out_file)