In [136]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.feature_selection import mutual_info_classif
from imblearn.over_sampling import SMOTE

pd.set_option('display.max_columns', None)

# dataset

In [104]:
data = pd.read_csv("/content/Student_performance_data _.csv")

In [105]:
df = data.copy()

In [106]:
df.head()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0


In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   StudentID          2392 non-null   int64  
 1   Age                2392 non-null   int64  
 2   Gender             2392 non-null   int64  
 3   Ethnicity          2392 non-null   int64  
 4   ParentalEducation  2392 non-null   int64  
 5   StudyTimeWeekly    2392 non-null   float64
 6   Absences           2392 non-null   int64  
 7   Tutoring           2392 non-null   int64  
 8   ParentalSupport    2392 non-null   int64  
 9   Extracurricular    2392 non-null   int64  
 10  Sports             2392 non-null   int64  
 11  Music              2392 non-null   int64  
 12  Volunteering       2392 non-null   int64  
 13  GPA                2392 non-null   float64
 14  GradeClass         2392 non-null   float64
dtypes: float64(3), int64(12)
memory usage: 280.4 KB


In [108]:
df["StudyTimeWeekly"] = df["StudyTimeWeekly"].astype(int)

In [109]:
df["GPA"] = df["GPA"].astype(int)

In [110]:
df["GradeClass"] = df["GradeClass"].astype(int)

In [111]:
df.drop("StudentID",axis=1,inplace=True)

In [112]:
df.head()

Unnamed: 0,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,17,1,0,2,19,7,1,2,0,0,1,0,2,2
1,18,0,0,1,15,0,0,1,0,0,0,0,3,1
2,15,0,2,3,4,26,0,2,0,0,0,0,0,4
3,17,1,0,3,10,14,0,3,1,0,0,0,2,3
4,17,1,0,2,4,17,1,3,0,0,0,0,1,4


In [113]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Age                2392 non-null   int64
 1   Gender             2392 non-null   int64
 2   Ethnicity          2392 non-null   int64
 3   ParentalEducation  2392 non-null   int64
 4   StudyTimeWeekly    2392 non-null   int64
 5   Absences           2392 non-null   int64
 6   Tutoring           2392 non-null   int64
 7   ParentalSupport    2392 non-null   int64
 8   Extracurricular    2392 non-null   int64
 9   Sports             2392 non-null   int64
 10  Music              2392 non-null   int64
 11  Volunteering       2392 non-null   int64
 12  GPA                2392 non-null   int64
 13  GradeClass         2392 non-null   int64
dtypes: int64(14)
memory usage: 261.8 KB


In [114]:
df.isnull().sum()

Unnamed: 0,0
Age,0
Gender,0
Ethnicity,0
ParentalEducation,0
StudyTimeWeekly,0
Absences,0
Tutoring,0
ParentalSupport,0
Extracurricular,0
Sports,0


In [115]:
df["GradeClass"].value_counts()

Unnamed: 0_level_0,count
GradeClass,Unnamed: 1_level_1
4,1211
3,414
2,391
1,269
0,107


# independent and dependent features

In [116]:
# independent features
x = df.drop("GradeClass",axis=1)
# dependent feature
y = df['GradeClass']

# train test split

In [117]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [118]:
y_train.value_counts()

Unnamed: 0_level_0,count
GradeClass,Unnamed: 1_level_1
4,974
3,328
2,306
1,220
0,85


# balancing the dataset

In [119]:
smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(x_train, y_train)

In [120]:
y_resampled.value_counts()

Unnamed: 0_level_0,count
GradeClass,Unnamed: 1_level_1
4,974
1,974
2,974
3,974
0,974


# feature selection

In [121]:
info_gain = mutual_info_classif(x_resampled,y_resampled)

In [122]:
ranked_features = np.argsort(info_gain)[::-1]

In [123]:
ranked_features

array([12,  5,  7,  0,  6,  1,  4,  3, 11,  9,  8, 10,  2])

In [128]:
N = 8
top_features = ranked_features[:N]
x_train_selected = x_resampled.iloc[:, top_features.tolist()]
x_test_selected = x_test.iloc[:, top_features.tolist()]

In [129]:
x_train_selected

Unnamed: 0,GPA,Absences,ParentalSupport,Age,Tutoring,Gender,StudyTimeWeekly,ParentalEducation
0,1,24,0,18,1,1,18,4
1,0,19,1,16,0,1,2,2
2,0,25,0,17,0,0,3,3
3,1,25,2,17,1,1,15,1
4,1,19,2,16,1,0,1,0
...,...,...,...,...,...,...,...,...
4865,2,13,1,15,1,1,10,1
4866,2,14,1,16,0,0,16,2
4867,2,20,3,17,1,0,17,1
4868,2,17,3,16,0,0,9,0


In [130]:
x_test_selected

Unnamed: 0,GPA,Absences,ParentalSupport,Age,Tutoring,Gender,StudyTimeWeekly,ParentalEducation
1004,1,17,2,15,0,1,0,4
196,3,0,1,16,0,1,1,4
2342,2,15,3,15,0,0,10,0
1708,3,1,3,18,0,1,16,2
435,0,27,1,18,0,0,3,0
...,...,...,...,...,...,...,...,...
986,1,16,3,18,0,1,8,2
120,2,12,1,18,1,1,3,3
283,2,5,1,17,1,0,5,1
1740,2,10,3,18,0,1,10,0


# model training

In [131]:
clasif = GradientBoostingClassifier()

In [132]:
clasif.fit(x_train_selected,y_resampled)

In [133]:
pred = clasif.predict(x_test_selected)

# model evaluation

In [134]:
print("accuracy:-",accuracy_score(y_test,pred))


accuracy:- 0.8058455114822547
