In [19]:
import pandas as pd

In [20]:
df =  pd.read_csv('student.csv')

In [21]:
df.head()

Unnamed: 0,StudentID,Name,Gender,AttendanceRate,StudyHoursPerWeek,PreviousGrade,ExtracurricularActivities,ParentalSupport,FinalGrade,Study Hours,Attendance (%),Online Classes Taken
0,1.0,John,Male,85.0,15.0,78.0,1.0,High,80.0,4.8,59.0,False
1,2.0,Sarah,Female,90.0,20.0,85.0,2.0,Medium,87.0,2.2,70.0,True
2,3.0,Alex,Male,78.0,10.0,65.0,0.0,Low,68.0,4.6,92.0,False
3,4.0,Michael,Male,92.0,25.0,90.0,3.0,High,92.0,2.9,96.0,False
4,5.0,Emma,Female,,18.0,82.0,2.0,Medium,85.0,4.1,97.0,True


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   StudentID                  960 non-null    float64
 1   Name                       966 non-null    object 
 2   Gender                     952 non-null    object 
 3   AttendanceRate             960 non-null    float64
 4   StudyHoursPerWeek          950 non-null    float64
 5   PreviousGrade              967 non-null    float64
 6   ExtracurricularActivities  957 non-null    float64
 7   ParentalSupport            978 non-null    object 
 8   FinalGrade                 960 non-null    float64
 9   Study Hours                976 non-null    float64
 10  Attendance (%)             959 non-null    float64
 11  Online Classes Taken       975 non-null    object 
dtypes: float64(8), object(4)
memory usage: 93.9+ KB


In [23]:
df.isnull().sum()

StudentID                    40
Name                         34
Gender                       48
AttendanceRate               40
StudyHoursPerWeek            50
PreviousGrade                33
ExtracurricularActivities    43
ParentalSupport              22
FinalGrade                   40
Study Hours                  24
Attendance (%)               41
Online Classes Taken         25
dtype: int64

In [24]:
df = df.drop(columns = ['Name','StudentID'])

In [25]:
num_cols = df.select_dtypes(include='number').columns
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())


In [26]:
nan_idx = df[df['Gender'].isnull()].index
alt = ['Male', 'Female']*(len(nan_idx)//2+1)
alt = alt[:len(nan_idx)]
df.loc[nan_idx, 'Gender'] = alt

In [27]:
nan_idx = df[df['Online Classes Taken'].isnull()].index
empty = [True, False]*(len(nan_idx)//2+1)
empty = empty[:len(nan_idx)]
df.loc[nan_idx,'Online Classes Taken'] = empty

In [28]:
nan_idx = df[df['ParentalSupport'].isnull()].index
empty = ['High', 'Medium', 'Low'] * (len(nan_idx)//3+1)
empty = empty[:len(nan_idx)]
df.loc[nan_idx,'ParentalSupport'] = empty

In [29]:
df['Online Classes Taken'] = df['Online Classes Taken'].astype(int)

In [30]:
df.isnull().sum()

Gender                       0
AttendanceRate               0
StudyHoursPerWeek            0
PreviousGrade                0
ExtracurricularActivities    0
ParentalSupport              0
FinalGrade                   0
Study Hours                  0
Attendance (%)               0
Online Classes Taken         0
dtype: int64

In [31]:
df.describe()

Unnamed: 0,AttendanceRate,StudyHoursPerWeek,PreviousGrade,ExtracurricularActivities,FinalGrade,Study Hours,Attendance (%),Online Classes Taken
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,85.61,17.649,77.612,1.498,80.029,2.4092,77.197,0.508
std,7.200399,6.113688,9.840238,1.029104,9.301649,1.600749,18.899613,0.500186
min,70.0,8.0,60.0,0.0,62.0,-5.0,50.0,0.0
25%,82.0,12.0,70.0,1.0,72.0,1.2,64.0,0.0
50%,88.0,18.0,78.0,1.0,80.0,2.5,76.0,1.0
75%,91.0,22.0,86.0,2.0,88.0,3.7,89.0,1.0
max,95.0,30.0,90.0,3.0,92.0,5.0,200.0,1.0


# ML part

In [32]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

MODEL_FILE = "model.pkl"
PIPEPLINE_FILE = "pipeline.pkl"

In [33]:
def build_pipeline(num_attribs, cat_attribs):
      
     num_pipeline = Pipeline([
          ('scaler', StandardScaler())
        
     ])

     cat_pipeline = Pipeline([
          ('ordinal', OrdinalEncoder())
     ])

     full_pipeline = ColumnTransformer([
          ('num', num_pipeline, num_attribs),
          ('cat', cat_pipeline, cat_attribs)
     ])

     return full_pipeline

In [37]:
if not os.path.exists(MODEL_FILE):
    student = df

    split =  StratifiedShuffleSplit(n_splits=1, test_size= 0.2, random_state=42)
    for train_index, test_index in split.split(student, student['ParentalSupport']):
        student.loc[test_index].to_csv('input.csv', index = False)
        train_sec = student.loc[train_index]

    train_sec_labels = student['FinalGrade'].copy()
    train_sec_feature = student.drop('FinalGrade', axis = 1)

    num_attribs = train_sec_feature.drop(['Gender', 'ParentalSupport'], axis = 1).columns.tolist()
    cat_attribs = ['Gender','ParentalSupport']

    pipeline = build_pipeline(num_attribs, cat_attribs)
    student_prepared = pipeline.fit_transform(train_sec_feature)

    model = LinearRegression()
    model.fit(student_prepared, train_sec_labels)

    joblib.dump(model, MODEL_FILE)
    joblib.dump(pipeline, PIPEPLINE_FILE)

    print("model is trained, cong")
    
else:
    model = joblib.load(MODEL_FILE)
    pipeline = joblib.load(PIPEPLINE_FILE)
    input_data = pd.read_csv('input.csv')
    tranformed_data = pipeline.transform(input_data)
    predications = model.predict(tranformed_data)
    input_data['finalgrade'] = predications
    input_data.to_csv("output.csv", index = False)
    print("Innference is complete, results saved to output.csv Enjoy")

    



Innference is complete, results saved to output.csv Enjoy


In [38]:
input_data = pd.read_csv('input.csv')
true_values = input_data['FinalGrade']

mae = mean_absolute_error(true_values, predications)
mse = mean_squared_error(true_values, predications)

print("Inference MAE:", mae)
print("Inference MSE:", mse)

Inference MAE: 8.117725725202732
Inference MSE: 90.804513134878
