In [1]:
#########################################################################
#Title  : Preprocessing and Machine Learning 
#Author : Kevin Ryan Noronha

#Editors: Please enter your name, the date, and a description of your edit below




##########################################################################

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import joblib
#import xgboost as xgb

from sklearn.preprocessing    import StandardScaler
from sklearn.model_selection  import train_test_split
from sklearn.linear_model     import LogisticRegression, LinearRegression
from sklearn.cluster          import KMeans
from sklearn.metrics          import accuracy_score, r2_score

# --- 1. LOAD & BASIC CLEANING ---
file_path = 'student_records.xlsx'

# Only these columns are used by the rest of the notebook.
cols = ['Emplid', 'Name', 'Program Status',
        'Course', 'Mark', 'Unit Value', 'Credits']

# Read the workbook, narrow it to the essential columns and drop exact repeats.
df = pd.read_excel(file_path)
df = df.loc[:, cols].drop_duplicates()

# Enforce dtypes: the student id is handled as text, and any numeric cell
# that cannot be parsed becomes NaN (errors='coerce') instead of raising.
df['Emplid'] = df['Emplid'].astype(str)
for numeric_col in ('Mark', 'Unit Value', 'Credits'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')


# --- 2. PIVOT TO ONE ROW PER STUDENT ---
# One output row per (Emplid, Name, Program Status) combination and one column
# per course holding the mark. When the same combination has several rows for
# a course, 'first' keeps the first non-missing mark.
student_keys = ['Emplid', 'Name', 'Program Status']
pivot = df.pivot_table(
    values='Mark',
    index=student_keys,
    columns='Course',
    aggfunc='first',
).reset_index()

# Per-student totals of unit value and credits across all of their course rows.
totals = (
    df
    .groupby(['Emplid', 'Name'])
    .agg(TotalUnits=('Unit Value', 'sum'), TotalCredits=('Credits', 'sum'))
    .reset_index()
)

# Attach the totals to the wide marks table (inner join on student id and name).
data = pd.merge(pivot, totals, on=['Emplid', 'Name'])

In [3]:
# Identify subject columns: every column that is not an identifier, a total,
# or the derived GPA. 'GPA' is excluded (matching the later cells) so that
# re-running this cell after GPA already exists does not treat the previous
# GPA as if it were a course mark.
NON_SUBJECT_COLS = ['Emplid', 'Name', 'Program Status',
                    'TotalUnits', 'TotalCredits', 'GPA']
subject_cols = [c for c in data.columns if c not in NON_SUBJECT_COLS]

# 1. Compute GPA before filling NaNs, so that only marks actually recorded for
#    a student count towards their average.
#    GPA = (sum of recorded marks) / (number of recorded marks * 25),
#    i.e. the mean recorded mark divided by 25.
#    NOTE(review): 25 presumably maps 0-100 marks onto a 0-4 scale - confirm.
#    A row with no recorded marks gives 0/0, i.e. NaN.
MARKS_PER_GPA_POINT = 25
marks = data[subject_cols]
data['GPA'] = (
    marks.sum(axis=1, skipna=True)
    / (marks.notna().sum(axis=1) * MARKS_PER_GPA_POINT)
)

In [4]:
# Preview the first rows of the merged table (ids, per-course marks, totals, GPA).
print(data.head())

  Emplid       Name Program Status  COMM2301  COMM2583  COMM2585  COMM2587  \
0   1001  Student 1        Pending      62.0       NaN      50.0       9.0   
1   1002  Student 2      Completed       NaN       NaN      82.0      76.0   
2   1003  Student 3      Completed      52.0      67.0       NaN      64.0   
3   1004  Student 4      Completed      53.0      60.0       NaN      73.0   
4   1005  Student 5        Pending      62.0      55.0       NaN      79.0   

   COMM2589  COMM2591  COMM2595  COMM2598  COMM2745  COMM2747  COMM2751  \
0      53.0       NaN      69.0       NaN      71.0      67.0      70.0   
1      60.0      78.0       NaN       NaN      52.0      71.0       NaN   
2      79.0      50.0       NaN       NaN      66.0      55.0       NaN   
3      64.0      67.0       NaN      67.0       NaN      54.0       NaN   
4      50.0      63.0       NaN       NaN       NaN      61.0      29.0   

   COMM2753  COMM2755  OART1013  TotalUnits  TotalCredits       GPA  
0       Na

In [5]:
# Summarise each column's dtype and non-null count (shows how sparse each course column is).
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000 entries, 0 to 2999
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Emplid          3000 non-null   object 
 1   Name            3000 non-null   object 
 2   Program Status  3000 non-null   object 
 3   COMM2301        1305 non-null   float64
 4   COMM2583        1499 non-null   float64
 5   COMM2585        1501 non-null   float64
 6   COMM2587        3000 non-null   float64
 7   COMM2589        3000 non-null   float64
 8   COMM2591        2262 non-null   float64
 9   COMM2595        1673 non-null   float64
 10  COMM2598        1307 non-null   float64
 11  COMM2745        1353 non-null   float64
 12  COMM2747        2276 non-null   float64
 13  COMM2751        1672 non-null   float64
 14  COMM2753        1636 non-null   float64
 15  COMM2755        1318 non-null   float64
 16  OART1013        1660 non-null   float64
 17  TotalUnits      3000 non-null   i

In [6]:
# Identify subject columns (everything except identifiers, totals and the target).
subject_cols = [
    c for c in data.columns
    if c not in ['Emplid', 'Name', 'Program Status', 'TotalUnits', 'TotalCredits', 'GPA']
]

# Apply custom normalization per subject: x -> (x - μ) / μ, where μ is the mean
# of that subject's recorded non-zero marks. Missing marks are imputed with μ
# first, so a course the student did not take ends up as exactly 0.
#
# NOTE: this overwrites the raw marks held in `data`; running the cell a second
# time would normalise already-normalised values. Rebuild `data` before re-running.
# NOTE(review): μ is computed over all rows, i.e. before the train/test split
# performed later in the notebook - confirm that this is intended.
subject_means = {}  # μ per subject, kept so the same transform can be re-applied later
for col in subject_cols:
    # Compute μ using non-zero marks only (zeros are masked out of the mean,
    # but a zero mark itself is kept and normalises to -1).
    nonzero = data[col].replace(0, np.nan).dropna()
    mu = nonzero.mean()
    subject_means[col] = mu

    # Replace NaN with μ and assign the result back to the frame. The previous
    # `data[col].fillna(mu, inplace=True)` was a chained in-place update, which
    # pandas warns about and which does not modify `data` under Copy-on-Write.
    filled = data[col].fillna(mu)

    # Compute normalized value
    data[col] = (filled - mu) / mu

# View a sample
print(data.head())


  Emplid       Name Program Status  COMM2301  COMM2583  COMM2585  COMM2587  \
0   1001  Student 1        Pending -0.019035  0.000000 -0.209367 -0.857992   
1   1002  Student 2      Completed  0.000000  0.000000  0.296637  0.199177   
2   1003  Student 3      Completed -0.177255  0.049735  0.000000  0.009833   
3   1004  Student 4      Completed -0.161433 -0.059939  0.000000  0.151841   
4   1005  Student 5        Pending -0.019035 -0.138277  0.000000  0.246513   

   COMM2589  COMM2591  COMM2595  COMM2598  COMM2745  COMM2747  COMM2751  \
0 -0.160979  0.000000  0.092574  0.000000  0.126098  0.057068  0.111153   
1 -0.050165  0.234258  0.000000  0.000000 -0.175252  0.120177  0.000000   
2  0.250617 -0.208809  0.000000  0.000000  0.046795 -0.132257  0.000000   
3  0.013158  0.060196  0.000000  0.048569  0.000000 -0.148035  0.000000   
4 -0.208470 -0.003099  0.000000  0.000000  0.000000 -0.037595 -0.539665   

   COMM2753  COMM2755  OART1013  TotalUnits  TotalCredits       GPA  
0  0.00000

In [7]:
# Train/test split
# Features are the per-subject columns; identifiers, totals and the target
# itself are left out of the feature matrix.
non_feature_cols = ('Emplid', 'Name', 'Program Status',
                    'TotalUnits', 'TotalCredits', 'GPA')
subject_cols = [c for c in data.columns if c not in non_feature_cols]

# X: subject columns, y: GPA target.
X, y = data[subject_cols], data['GPA']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [8]:
# Display the subject columns that make up the feature matrix X.
data[subject_cols]

Unnamed: 0,COMM2301,COMM2583,COMM2585,COMM2587,COMM2589,COMM2591,COMM2595,COMM2598,COMM2745,COMM2747,COMM2751,COMM2753,COMM2755,OART1013
0,-0.019035,0.000000,-0.209367,-0.857992,-0.160979,0.000000,0.092574,0.000000,0.126098,0.057068,0.111153,0.000000,0.000000,0.564558
1,0.000000,0.000000,0.296637,0.199177,-0.050165,0.234258,0.000000,0.000000,-0.175252,0.120177,0.000000,0.009183,0.000000,0.043038
2,-0.177255,0.049735,0.000000,0.009833,0.250617,-0.208809,0.000000,0.000000,0.046795,-0.132257,0.000000,0.000000,-0.076282,0.000000
3,-0.161433,-0.059939,0.000000,0.151841,0.013158,0.060196,0.000000,0.048569,0.000000,-0.148035,0.000000,0.000000,0.000000,0.580361
4,-0.019035,-0.138277,0.000000,0.246513,-0.208470,-0.003099,0.000000,0.000000,0.000000,-0.037595,-0.539665,0.000000,0.000000,0.216878
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.000000,0.065402,0.000000,0.167620,0.076480,-0.208809,0.000000,0.000000,0.030934,0.104400,-0.047583,0.000000,-0.154564,0.000000
2996,-0.129789,0.000000,0.027822,0.041391,-0.034334,0.186787,0.000000,0.000000,-0.048368,0.000000,0.206395,0.088025,0.000000,-0.826160
2997,0.075897,0.000000,0.075260,0.072948,-0.208470,0.000000,-0.018267,0.000000,0.030934,-0.100703,0.000000,0.040720,0.000000,0.074646
2998,0.060075,-0.059939,0.000000,0.057169,0.171464,-0.034747,-0.018267,-0.217486,0.000000,0.000000,0.000000,-0.132733,0.000000,-0.083390


In [9]:
# RandomForestRegressor
# Fit a 100-tree forest (fixed seed) on the training split, predict the
# held-out split and report the R^2 of those predictions.
reg_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
RF_pred = reg_model.predict(X_test)
rf_r2 = r2_score(y_test, RF_pred)
print("Regression score:", rf_r2)

Regression score: 0.7293867021456205


In [10]:
# --- 3. Export model to a .pkl file ---
# The bundle stores the fitted random forest together with the ordered list of
# feature columns it was trained on.
# NOTE(review): the per-subject means used to normalise the features are not
# part of this bundle - confirm how code that loads it prepares new inputs.
rf_bundle = {'model': reg_model, 'subject_cols': subject_cols}
joblib.dump(rf_bundle, 'gpa_reg_model.pkl')

print("Model and metadata saved to gpa_reg_model.pkl")

Model and metadata saved to gpa_reg_model.pkl


In [11]:
import cloudpickle
from sklearn.ensemble import HistGradientBoostingRegressor


# Histogram-based gradient boosting (max_iter=100, fixed seed): fit on the
# training split, predict the held-out split and report the R^2.
hist_model = HistGradientBoostingRegressor(max_iter=100, random_state=42).fit(X_train, y_train)
pred_HGBR = hist_model.predict(X_test)
hgbr_r2 = r2_score(y_test, pred_HGBR)
print("Regression score:", hgbr_r2)

Regression score: 0.9121617009285024


In [12]:
# Persist the fitted `hist_model` (trained on X_train / y_train) together with
# the feature column list `subject_cols`, serialised with cloudpickle.
hist_bundle = {'model': hist_model, 'subject_cols': subject_cols}
with open('gpa_hist_model_cp.pkl', 'wb') as f:
    cloudpickle.dump(hist_bundle, f)
print("Saved with cloudpickle to gpa_hist_model_cp.pkl")


Saved with cloudpickle to gpa_hist_model_cp.pkl


In [13]:
from sklearn.neighbors import KNeighborsRegressor


# k-nearest-neighbours regressor with k fixed at 5: fit on the training split,
# predict the held-out split and report the R^2.
# NOTE(review): k is hard-coded here; tuning it by cross-validation is not done
# in this cell.
knn = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
pred_KNN = knn.predict(X_test)
knn_r2 = r2_score(y_test, pred_KNN)
print("Regression score:", knn_r2)


Regression score: 0.7854169828418933


In [14]:
# Save the fitted KNN regressor alongside the ordered feature column list.
knn_bundle = {'model': knn, 'subject_cols': subject_cols}
joblib.dump(knn_bundle, 'gpa_knnreg_model.pkl')

print("Model and metadata saved to gpa_knnreg_model.pkl")

Model and metadata saved to gpa_knnreg_model.pkl
