In [0]:
import joblib
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [0]:
# Load the raw student-performance CSV into a pandas DataFrame.
# NOTE(review): the "/Volumes/..." prefix looks like a Databricks Unity Catalog
# volume path — confirm this notebook is only expected to run in that workspace.
df = pd.read_csv('/Volumes/codtechitsolutionscatalog/codtechitsolutionsschema/codtechitsolutionsvolume/student-performance.csv')

In [0]:
# Render the raw frame for a quick visual check of its columns and values.
display(df)


school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,6,5,6,6
GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,4,5,5,6
GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,3,yes,no,yes,no,yes,yes,yes,no,4,3,2,2,3,3,10,7,8,10
GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,3,2,2,1,1,5,2,15,14,15
GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,2,5,4,6,10,10
GP,M,16,U,LE3,T,4,3,services,other,reputation,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,5,4,2,1,2,5,10,15,15,15
GP,M,16,U,LE3,T,2,2,other,other,home,mother,1,2,0,no,no,no,no,yes,yes,yes,no,4,4,4,1,1,3,0,12,12,11
GP,F,17,U,GT3,A,4,4,other,teacher,home,mother,2,2,0,yes,yes,no,no,yes,yes,no,no,4,1,4,1,1,1,6,6,5,6
GP,M,15,U,LE3,A,3,2,services,other,home,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,2,2,1,1,1,0,16,18,19
GP,M,15,U,GT3,T,3,4,other,other,home,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,5,5,1,1,1,5,0,14,15,15


In [0]:
# Name of the column to predict; every other column is treated as a feature.
target = "G3"

# Take the label series first, then build the feature frame without it.
# Neither statement modifies `df`.
y = df[target]
X = df.drop(columns=target)


In [0]:
# Split the feature columns by dtype so each group gets its own preprocessing.
#
# "number" matches every numeric dtype (e.g. int32 or float32 as well as
# int64/float64), and every remaining column is routed to the categorical
# branch. The two lists therefore always partition X: a column can never fall
# outside both lists and be dropped by ColumnTransformer's default
# remainder="drop".
# NOTE(review): non-numeric columns that are not really categorical (e.g.
# datetimes) would also land in `categorical_features`; none exist in this
# dataset, but revisit if the schema grows.
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(exclude="number").columns.tolist()

print("Numerical columns:", numeric_features)
print("Categorical columns:", categorical_features)


Numerical columns: ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2']
Categorical columns: ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']


In [0]:
# Numeric branch: fill missing values with the column median, then apply
# StandardScaler so the numeric features end up on a comparable scale.
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])


In [0]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array.
#   * drop="if_binary"        -> two-level columns become a single 0/1 column
#   * handle_unknown="ignore" -> categories unseen during fit do not raise
# NOTE(review): with both options set, an unseen value in a binary column
# presumably encodes the same as the dropped category — confirm that is
# acceptable for downstream use.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    (
        "encoder",
        OneHotEncoder(
            sparse_output=False,
            handle_unknown="ignore",
            drop="if_binary",
        ),
    ),
])


In [0]:
# Route each column group through its own sub-pipeline. The transformer names
# ("num", "cat") become the prefixes of the generated feature names.
preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, numeric_features),
    ("cat", categorical_pipeline, categorical_features),
])


In [0]:
# Wrap the column transformer in a single-step Pipeline; this is the object
# that is fitted, saved and reloaded further down.
pipeline = Pipeline([("preprocessing", preprocessor)])


In [0]:
# Fit every preprocessing step on X and transform X in the same call.
# NOTE(review): the pipeline is fitted on the full dataset; if a model is later
# evaluated on a hold-out split, fit on the training rows only — confirm intent.
X_processed = pipeline.fit_transform(X)

# Sanity check: the output should have one row per input row.
print("Processed data shape:", X_processed.shape)


Processed data shape: (395, 45)


In [0]:
# Recover the generated column names ("num__..." / "cat__...") from the fitted
# column transformer so the transformed array can be labelled.
feature_names = pipeline.named_steps["preprocessing"].get_feature_names_out()

# Carry over X's index so the processed rows stay aligned with X and y; without
# it the frame gets a fresh RangeIndex, which would misalign the rows if the
# source index were ever filtered, shuffled or otherwise non-default.
X_processed_df = pd.DataFrame(X_processed, columns=feature_names, index=X.index)

X_processed_df.head()


Unnamed: 0,num__age,num__Medu,num__Fedu,num__traveltime,num__studytime,num__failures,num__famrel,num__freetime,num__goout,num__Dalc,num__Walc,num__health,num__absences,num__G1,num__G2,cat__school_MS,cat__sex_M,cat__address_U,cat__famsize_LE3,cat__Pstatus_T,cat__Mjob_at_home,cat__Mjob_health,cat__Mjob_other,cat__Mjob_services,cat__Mjob_teacher,cat__Fjob_at_home,cat__Fjob_health,cat__Fjob_other,cat__Fjob_services,cat__Fjob_teacher,cat__reason_course,cat__reason_home,cat__reason_other,cat__reason_reputation,cat__guardian_father,cat__guardian_mother,cat__guardian_other,cat__schoolsup_yes,cat__famsup_yes,cat__paid_yes,cat__activities_yes,cat__nursery_yes,cat__higher_yes,cat__internet_yes,cat__romantic_yes
0,1.023046,1.143856,1.360371,0.792251,-0.042286,-0.449944,0.062194,-0.23601,0.801479,-0.540699,-1.003789,-0.399289,0.036424,-1.782467,-1.254791,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,0.23838,-1.600009,-1.39997,-0.643249,-0.042286,-0.449944,1.17886,-0.23601,-0.097908,-0.540699,-1.003789,-0.399289,-0.213796,-1.782467,-1.520979,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
2,-1.330954,-1.600009,-1.39997,-0.643249,-0.042286,3.589323,0.062194,-0.23601,-0.997295,0.583385,0.5511,-0.399289,0.536865,-1.179147,-0.722415,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
3,-1.330954,1.143856,-0.479857,-0.643249,1.150779,-0.449944,-1.054472,-1.238419,-0.997295,-0.540699,-1.003789,1.04107,-0.464016,1.234133,0.874715,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,-0.546287,0.229234,0.440257,-0.643249,-0.042286,-0.449944,0.062194,-0.23601,-0.997295,-0.540699,-0.226345,1.04107,-0.213796,-1.480807,-0.190038,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0


In [0]:
# Render the processed feature frame to inspect the scaled and encoded columns.
display(X_processed_df)

num__age,num__Medu,num__Fedu,num__traveltime,num__studytime,num__failures,num__famrel,num__freetime,num__goout,num__Dalc,num__Walc,num__health,num__absences,num__G1,num__G2,cat__school_MS,cat__sex_M,cat__address_U,cat__famsize_LE3,cat__Pstatus_T,cat__Mjob_at_home,cat__Mjob_health,cat__Mjob_other,cat__Mjob_services,cat__Mjob_teacher,cat__Fjob_at_home,cat__Fjob_health,cat__Fjob_other,cat__Fjob_services,cat__Fjob_teacher,cat__reason_course,cat__reason_home,cat__reason_other,cat__reason_reputation,cat__guardian_father,cat__guardian_mother,cat__guardian_other,cat__schoolsup_yes,cat__famsup_yes,cat__paid_yes,cat__activities_yes,cat__nursery_yes,cat__higher_yes,cat__internet_yes,cat__romantic_yes
1.023046454461126,1.1438556741642336,1.3603706359757963,0.7922507560739607,-0.0422858458810214,-0.4499436443692835,0.062194058833928,-0.2360102044624822,0.8014792996325567,-0.5406986566036541,-1.0037892104979562,-0.3992894854413955,0.0364244642084466,-1.7824668774646413,-1.254791054106298,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
0.2383797563792924,-1.6000086454402538,-1.3999704661326269,-0.6432494670875735,-0.0422858458810214,-0.4499436443692835,1.178860115170367,-0.2360102044624822,-0.0979079826255681,-0.5406986566036541,-1.0037892104979562,-0.3992894854413955,-0.2137957681800131,-1.7824668774646413,-1.520979274499417,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
-1.3309536397843746,-1.6000086454402538,-1.3999704661326269,-0.6432494670875735,-0.0422858458810214,3.5893231630367843,0.062194058833928,-0.2360102044624822,-0.997295264883693,0.5833853926513113,0.55109995870476,-0.3992894854413955,0.5368649289853663,-1.179146897517312,-0.7224146133200599,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
-1.3309536397843746,1.1438556741642336,-0.4798567654298188,-0.6432494670875735,1.1507790914763645,-0.4499436443692835,-1.0544719975025107,-1.238419137394531,-0.997295264883693,-0.5406986566036541,-1.0037892104979562,1.0410698456029082,-0.464016000568473,1.2341330222720055,0.8747147090386544,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
-0.5462869417025411,0.2292342342960711,0.4402569352729889,-0.6432494670875735,-0.0422858458810214,-0.4499436443692835,0.062194058833928,-0.2360102044624822,-0.997295264883693,-0.5406986566036541,-0.2263446258965981,1.0410698456029082,-0.2137957681800131,-1.4808068874909766,-0.1900381725338217,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
-0.5462869417025411,1.1438556741642336,0.4402569352729889,-0.6432494670875735,-0.0422858458810214,-0.4499436443692835,1.178860115170367,0.7663987284695665,-0.997295264883693,-0.5406986566036541,-0.2263446258965981,1.0410698456029082,0.5368649289853663,1.2341330222720055,1.1409029294317736,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
-0.5462869417025411,-0.6853872055720913,-0.4798567654298188,-0.6432494670875735,-0.0422858458810214,-0.4499436443692835,0.062194058833928,0.7663987284695665,0.8014792996325567,-0.5406986566036541,-1.0037892104979562,-0.3992894854413955,-0.7142362329569328,0.3291530523510115,0.3423382682524163,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
0.2383797563792924,1.1438556741642336,1.3603706359757963,0.7922507560739607,-0.0422858458810214,-0.4499436443692835,0.062194058833928,-2.24082807032658,0.8014792996325567,-0.5406986566036541,-1.0037892104979562,-1.8396488164856992,0.0364244642084466,-1.4808068874909766,-1.520979274499417,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
-1.3309536397843746,0.2292342342960711,-0.4798567654298188,-0.6432494670875735,-0.0422858458810214,-0.4499436443692835,0.062194058833928,-1.238419137394531,-0.997295264883693,-0.5406986566036541,-1.0037892104979562,-1.8396488164856992,-0.7142362329569328,1.5357930122456702,1.9394675906111307,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
-1.3309536397843746,0.2292342342960711,1.3603706359757963,-0.6432494670875735,-0.0422858458810214,-0.4499436443692835,1.178860115170367,1.7688076614016153,-1.896682547141818,-0.5406986566036541,-1.0037892104979562,1.0410698456029082,-0.7142362329569328,0.9324730322983408,1.1409029294317736,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [0]:
# Persist the fitted pipeline so the exact same transformations can be
# re-applied later without refitting. `joblib` is imported in the notebook's
# top import cell. The file is written relative to the current working
# directory.
joblib.dump(pipeline, "student_preprocessing_pipeline.pkl")


['student_preprocessing_pipeline.pkl']

In [0]:
# Re-read the CSV from disk to stand in for a fresh batch of incoming data.
df2 = pd.read_csv('/Volumes/codtechitsolutionscatalog/codtechitsolutionsschema/codtechitsolutionsvolume/student-performance.csv')

# Drop the label via the shared `target` name rather than a second hard-coded
# "G3", so this feature frame always matches the one the pipeline was fit on.
X2 = df2.drop(columns=[target])


In [0]:
# Render the re-loaded raw frame before it goes through the saved pipeline.
display(df2)

school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,6,5,6,6
GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,4,5,5,6
GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,3,yes,no,yes,no,yes,yes,yes,no,4,3,2,2,3,3,10,7,8,10
GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,3,2,2,1,1,5,2,15,14,15
GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,2,5,4,6,10,10
GP,M,16,U,LE3,T,4,3,services,other,reputation,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,5,4,2,1,2,5,10,15,15,15
GP,M,16,U,LE3,T,2,2,other,other,home,mother,1,2,0,no,no,no,no,yes,yes,yes,no,4,4,4,1,1,3,0,12,12,11
GP,F,17,U,GT3,A,4,4,other,teacher,home,mother,2,2,0,yes,yes,no,no,yes,yes,no,no,4,1,4,1,1,1,6,6,5,6
GP,M,15,U,LE3,A,3,2,services,other,home,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,2,2,1,1,1,0,16,18,19
GP,M,15,U,GT3,T,3,4,other,other,home,mother,1,2,0,no,yes,yes,yes,yes,yes,yes,no,5,5,1,1,1,5,0,14,15,15


In [0]:
# Restore the fitted pipeline from the file written by the earlier dump cell.
# NOTE(review): joblib files are pickle-based, so only load artifacts from a
# trusted source; this one was produced by this same notebook.
loaded_pipeline = joblib.load("student_preprocessing_pipeline.pkl")

# transform (not fit_transform): reuse the parameters learned at fit time
# instead of re-estimating them on X2.
X2_processed = loaded_pipeline.transform(X2)


In [0]:
# Column names come from the reloaded pipeline, so this also confirms that the
# fitted state survived the save/load round trip.
feature_names = loaded_pipeline.named_steps["preprocessing"].get_feature_names_out()

# Keep X2's index so each processed row can be traced back to its source row;
# without it the frame would get a fresh RangeIndex.
X2_processed_df = pd.DataFrame(X2_processed, columns=feature_names, index=X2.index)

display(X2_processed_df.head())


num__age,num__Medu,num__Fedu,num__traveltime,num__studytime,num__failures,num__famrel,num__freetime,num__goout,num__Dalc,num__Walc,num__health,num__absences,num__G1,num__G2,cat__school_MS,cat__sex_M,cat__address_U,cat__famsize_LE3,cat__Pstatus_T,cat__Mjob_at_home,cat__Mjob_health,cat__Mjob_other,cat__Mjob_services,cat__Mjob_teacher,cat__Fjob_at_home,cat__Fjob_health,cat__Fjob_other,cat__Fjob_services,cat__Fjob_teacher,cat__reason_course,cat__reason_home,cat__reason_other,cat__reason_reputation,cat__guardian_father,cat__guardian_mother,cat__guardian_other,cat__schoolsup_yes,cat__famsup_yes,cat__paid_yes,cat__activities_yes,cat__nursery_yes,cat__higher_yes,cat__internet_yes,cat__romantic_yes
1.023046454461126,1.1438556741642336,1.3603706359757963,0.7922507560739607,-0.0422858458810214,-0.4499436443692835,0.062194058833928,-0.2360102044624822,0.8014792996325567,-0.5406986566036541,-1.0037892104979562,-0.3992894854413955,0.0364244642084466,-1.7824668774646413,-1.254791054106298,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
0.2383797563792924,-1.6000086454402538,-1.3999704661326269,-0.6432494670875735,-0.0422858458810214,-0.4499436443692835,1.178860115170367,-0.2360102044624822,-0.0979079826255681,-0.5406986566036541,-1.0037892104979562,-0.3992894854413955,-0.2137957681800131,-1.7824668774646413,-1.520979274499417,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
-1.3309536397843746,-1.6000086454402538,-1.3999704661326269,-0.6432494670875735,-0.0422858458810214,3.5893231630367843,0.062194058833928,-0.2360102044624822,-0.997295264883693,0.5833853926513113,0.55109995870476,-0.3992894854413955,0.5368649289853663,-1.179146897517312,-0.7224146133200599,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
-1.3309536397843746,1.1438556741642336,-0.4798567654298188,-0.6432494670875735,1.1507790914763645,-0.4499436443692835,-1.0544719975025107,-1.238419137394531,-0.997295264883693,-0.5406986566036541,-1.0037892104979562,1.0410698456029082,-0.464016000568473,1.2341330222720055,0.8747147090386544,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
-0.5462869417025411,0.2292342342960711,0.4402569352729889,-0.6432494670875735,-0.0422858458810214,-0.4499436443692835,0.062194058833928,-0.2360102044624822,-0.997295264883693,-0.5406986566036541,-0.2263446258965981,1.0410698456029082,-0.2137957681800131,-1.4808068874909766,-0.1900381725338217,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0


## Task-1: Data Preprocessing Pipeline

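In this task, a fully automated preprocessing pipeline was developed using
scikit-learn. The pipeline handles missing values (median imputation for numeric
columns, most-frequent imputation for categorical columns), categorical encoding
(one-hot), and feature scaling (standardization), combined with
`ColumnTransformer` and `Pipeline`. The fitted pipeline is saved with `joblib`
and reloaded, which ensures consistent data transformations across training and
future unseen data.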
