# Data PreProcessing

### *Import Libraries

In [1]:
#Define data set
import numpy as np
import pandas as pd

#Define scalling data
from sklearn.preprocessing import StandardScaler,LabelEncoder

#define Data viz
import matplotlib.pyplot as plt
import seaborn as sns

#Define Model Built
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

### *Importing DATA

In [3]:
df = pd.read_csv('./Data/Student_performance_data _.csv')

### *EDA

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.columns

In [None]:
df.describe

In [None]:
df.dtypes

### *Handling Missing Values

In [None]:
df.isna().sum()

### *Feature Engineering

In [17]:
df['GradeClass'] = df['GradeClass'].astype('object')

In [19]:
df.dtypes

StudentID              int64
Age                    int64
Gender                 int64
Ethnicity              int64
ParentalEducation      int64
StudyTimeWeekly      float64
Absences               int64
Tutoring               int64
ParentalSupport        int64
Extracurricular        int64
Sports                 int64
Music                  int64
Volunteering           int64
GPA                  float64
GradeClass            object
dtype: object

In [23]:
le = LabelEncoder()
le.fit(df['GradeClass'])

In [25]:
df['GradeClass']=le.transform(df['GradeClass'])

In [27]:
df.head(1)

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2


### *Handling Outliers

### *Data Vizulisation

# Model building

### *Train Test Split

In [63]:
x = df.drop(columns=['StudentID','GradeClass'])
y = df['GradeClass']

0    2
1    1
2    4
3    3
4    4
Name: GradeClass, dtype: int32

In [91]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [93]:
for i in [x_train,y_train,x_test,y_test]:
    print('-----'*25)
    print(i.head())
    print('-----'*25)

-----------------------------------------------------------------------------------------------------------------------------
      Age  Gender  Ethnicity  ParentalEducation  StudyTimeWeekly  Absences  \
642    18       1          0                  4        18.118879        24   
1752   16       1          3                  2         2.469271        19   
1401   17       0          2                  3         3.456401        25   
2032   17       1          0                  1        15.838131        25   
990    16       0          0                  0         1.045373        19   

      Tutoring  ParentalSupport  Extracurricular  Sports  Music  Volunteering  \
642          1                0                1       1      0             0   
1752         0                1                0       0      0             1   
1401         0                0                0       0      0             0   
2032         1                2                0       0      0             0   


### *Model Initialization

In [96]:
model = KNeighborsClassifier(n_neighbors=5)

### *Model Training

In [99]:
model.fit(x_train,y_train)

### *Hyper Parameter tuning

# evaluation & Result:

In [103]:
y_pred = model.predict(x_test)
print(y_pred)

[4 2 3 1 4 2 4 4 4 4 4 4 1 2 3 4 4 4 4 3 3 4 4 1 4 4 3 2 4 4 4 4 1 1 4 4 1
 2 2 4 2 1 4 4 4 2 2 4 2 4 2 1 4 4 3 0 2 0 4 4 4 4 3 4 4 3 1 2 4 1 1 4 1 2
 4 2 4 4 4 4 4 1 3 4 4 4 1 4 4 4 2 4 4 4 4 4 1 4 4 4 4 3 4 4 4 3 2 4 4 4 4
 4 1 4 4 4 4 1 4 3 4 1 1 4 3 4 3 4 4 2 4 0 2 4 4 4 3 4 4 4 3 4 2 4 2 4 4 2
 2 4 2 4 1 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 2 1 4 0 2 4 4 3 3 4 4 4 1 4 4 2 1
 1 1 4 4 4 4 3 4 4 4 4 4 3 2 2 4 2 4 3 2 4 4 2 3 4 4 4 3 2 2 4 1 2 4 1 4 2
 1 4 4 4 1 2 4 2 4 4 2 4 1 2 4 4 1 4 1 4 4 2 4 4 3 4 4 4 4 1 2 4 2 4 4 3 4
 4 4 4 4 4 4 2 2 4 2 3 4 1 4 4 4 2 4 4 4 2 1 2 3 4 1 3 4 4 4 4 1 4 2 4 2 4
 2 4 4 2 2 3 4 0 3 2 4 3 4 4 4 3 4 4 4 4 2 2 2 4 4 4 4 4 2 4 4 4 3 4 4 2 3
 2 2 2 3 4 4 4 4 4 4 4 4 3 1 1 4 1 1 3 4 2 4 4 2 4 4 3 2 0 2 4 4 1 0 2 2 3
 4 3 3 3 3 4 4 3 1 4 3 3 4 2 2 4 3 4 2 3 3 4 4 3 4 2 2 1 3 2 2 3 2 1 3 2 4
 1 3 2 4 4 3 3 1 3 4 1 2 4 4 4 3 4 4 4 4 2 3 1 2 4 4 1 3 2 4 1 3 4 4 1 2 1
 1 1 4 2 4 4 3 4 2 3 1 4 4 4 4 2 4 4 4 4 3 4 4 2 1 3 4 4 4 4 4 4 2 2 4]


In [104]:
accuracy=accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
classification = classification_report(y_test, y_pred)

print(f'accuracy_score:  {accuracy}')
print(f'confusion_matrix:\n  {confusion}')
print(f'classification_report:\n  {classification}')


accuracy_score:  0.6659707724425887
confusion_matrix:
  [[  2  14   3   1   2]
 [  5  22  16   1   5]
 [  0  18  47  16   4]
 [  0   2  21  31  32]
 [  0   1   3  16 217]]
classification_report:
                precision    recall  f1-score   support

           0       0.29      0.09      0.14        22
           1       0.39      0.45      0.42        49
           2       0.52      0.55      0.54        85
           3       0.48      0.36      0.41        86
           4       0.83      0.92      0.87       237

    accuracy                           0.67       479
   macro avg       0.50      0.47      0.47       479
weighted avg       0.64      0.67      0.65       479



In [105]:
new_data ={
    'StudentID': [1002],
    'Age':[17],
    'Gender':[1],
    'Ethnicity': [0],
    'ParentalEducation': [2],
    'StudyTimeWeekly': [19.833723],
    'Absences': [7],
    'Tutoring': [1],
    'ParentalSupport': [2],
    'Extracurricular': [0],
    'Sports': [0],
    'Music': [1],
    'Volunteering': [2],
    'GPA': [2.929196]							
} 

df = pd.DataFrame(new_data)


# Drop the 'Id' column
new_X = df.drop(columns=['StudentID'])

# # Apply the same scaler used for the training data
# new_X_scaled = scaler.transform(new_X)

# # Convert the scaled data back to a DataFrame
# new_df_scaled = pd.DataFrame(new_X_scaled, columns=new_X.columns)

new_predictions = model.predict(new_X)

# Add predictions to the new data
df['GradeClass'] = new_predictions

print(df)

   StudentID  Age  Gender  Ethnicity  ParentalEducation  StudyTimeWeekly  \
0       1002   17       1          0                  2        19.833723   

   Absences  Tutoring  ParentalSupport  Extracurricular  Sports  Music  \
0         7         1                2                0       0      1   

   Volunteering       GPA  GradeClass  
0             2  2.929196           1  


## Hyper Parameter Tuning

In [110]:
model_tuned=KNeighborsClassifier()

In [112]:
params = {
     'n_neighbors' : [i for i in np.arange(0,5,1) ],
     'algorithm' : ['ball_tree', 'kd_tree'],
    'weights' : ['uniform', 'distance']
}

In [114]:
params_kNeigh = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=params)

In [116]:
start = pd.Timestamp.now()
params_kNeigh.fit(x_train,y_train)
end = pd.Timestamp.now()-start

print(f'Time taken for Grid Cv {end} run time')

Time taken for Grid Cv 0 days 00:00:01.706556 run time


20 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\hp\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\hp\anaconda3\Lib\site-packages\sklearn\neighbors\_classification.py", line 213, in fit
    self._validate_params()
  File "C:\Users\hp\anaconda3\Lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\hp\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameter

In [117]:
params

{'n_neighbors': [0, 1, 2, 3, 4],
 'algorithm': ['ball_tree', 'kd_tree'],
 'weights': ['uniform', 'distance']}

In [118]:
params_kNeigh.best_params_

{'algorithm': 'ball_tree', 'n_neighbors': 4, 'weights': 'distance'}

In [119]:
tuned_model=KNeighborsClassifier(algorithm= 'ball_tree', n_neighbors = 4, weights = 'distance')

In [120]:
tuned_model.fit(x_train, y_train)

In [121]:
y1_pred=tuned_model.predict(x_test)

In [127]:
y1_pred

array([4, 2, 3, 1, 4, 2, 4, 4, 4, 4, 3, 4, 1, 2, 3, 4, 4, 4, 4, 3, 3, 4,
       4, 1, 4, 4, 3, 2, 4, 4, 4, 4, 2, 1, 4, 4, 1, 2, 2, 4, 2, 1, 4, 4,
       4, 2, 2, 4, 2, 4, 2, 1, 4, 4, 3, 0, 3, 0, 4, 3, 4, 4, 3, 4, 4, 3,
       1, 3, 4, 0, 1, 3, 1, 2, 4, 1, 4, 4, 4, 4, 4, 1, 2, 4, 4, 4, 1, 4,
       4, 4, 2, 4, 4, 4, 4, 4, 0, 4, 4, 4, 4, 3, 4, 4, 4, 3, 2, 4, 4, 4,
       4, 4, 0, 4, 4, 4, 4, 2, 4, 4, 4, 1, 1, 4, 4, 4, 3, 4, 4, 2, 4, 0,
       2, 4, 4, 4, 3, 4, 4, 4, 4, 3, 1, 3, 2, 4, 4, 3, 2, 4, 2, 4, 0, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 1, 4, 1, 2, 4, 4, 3,
       3, 4, 4, 4, 1, 4, 4, 2, 1, 2, 1, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 3,
       2, 2, 4, 2, 4, 2, 2, 3, 4, 2, 4, 4, 4, 4, 3, 2, 4, 4, 2, 2, 4, 1,
       4, 2, 1, 4, 4, 4, 1, 2, 4, 3, 4, 4, 2, 4, 1, 3, 4, 4, 2, 4, 0, 4,
       4, 2, 4, 4, 3, 4, 4, 4, 4, 1, 2, 4, 3, 4, 4, 3, 4, 4, 3, 4, 4, 4,
       4, 2, 3, 4, 2, 3, 4, 1, 4, 4, 4, 2, 4, 4, 4, 2, 1, 2, 3, 4, 1, 3,
       4, 4, 4, 4, 0, 4, 2, 4, 2, 4, 2, 4, 4, 2, 2,

In [129]:
accuracy=accuracy_score(y_test, y1_pred)
confusion = confusion_matrix(y_test, y1_pred)
classification = classification_report(y_test, y1_pred)

print(f'accuracy_score:  {accuracy}')
print(f'confusion_matrix:\n  {confusion}')
print(f'classification_report:\n  {classification}')

accuracy_score:  0.6638830897703549
confusion_matrix:
  [[  4  12   2   2   2]
 [  5  22  16   2   4]
 [  2  15  45  16   7]
 [  0   1  20  34  31]
 [  0   1   2  21 213]]
classification_report:
                precision    recall  f1-score   support

           0       0.36      0.18      0.24        22
           1       0.43      0.45      0.44        49
           2       0.53      0.53      0.53        85
           3       0.45      0.40      0.42        86
           4       0.83      0.90      0.86       237

    accuracy                           0.66       479
   macro avg       0.52      0.49      0.50       479
weighted avg       0.65      0.66      0.65       479



In [132]:
new_data ={
    'StudentID': [1002],
    'Age':[17],
    'Gender':[1],
    'Ethnicity': [0],
    'ParentalEducation': [2],
    'StudyTimeWeekly': [19.833723],
    'Absences': [7],
    'Tutoring': [1],
    'ParentalSupport': [2],
    'Extracurricular': [0],
    'Sports': [0],
    'Music': [1],
    'Volunteering': [2],
    'GPA': [2.929196]							
} 

df = pd.DataFrame(new_data)


# Drop the 'Id' column
new_X = df.drop(columns=['StudentID'])

# # Apply the same scaler used for the training data
# new_X_scaled = scaler.transform(new_X)

# # Convert the scaled data back to a DataFrame
# new_df_scaled = pd.DataFrame(new_X_scaled, columns=new_X.columns)

new_predictions = tuned_model.predict(new_X)

# Add predictions to the new data
df['GradeClass'] = new_predictions

print(df)

   StudentID  Age  Gender  Ethnicity  ParentalEducation  StudyTimeWeekly  \
0       1002   17       1          0                  2        19.833723   

   Absences  Tutoring  ParentalSupport  Extracurricular  Sports  Music  \
0         7         1                2                0       0      1   

   Volunteering       GPA  GradeClass  
0             2  2.929196           2  
