# Data PreProcessing

### *Import Libraries

In [4]:
#Define data set
import numpy as np
import pandas as pd

#Define scalling data
from sklearn.preprocessing import StandardScaler,LabelEncoder

#define Data viz
import matplotlib.pyplot as plt
import seaborn as sns

#Define Model Built
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score

### *Importing DATA

In [7]:
df = pd.read_csv('./Data/Student_performance_data _.csv')

### *EDA

In [10]:
df.head()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   StudentID          2392 non-null   int64  
 1   Age                2392 non-null   int64  
 2   Gender             2392 non-null   int64  
 3   Ethnicity          2392 non-null   int64  
 4   ParentalEducation  2392 non-null   int64  
 5   StudyTimeWeekly    2392 non-null   float64
 6   Absences           2392 non-null   int64  
 7   Tutoring           2392 non-null   int64  
 8   ParentalSupport    2392 non-null   int64  
 9   Extracurricular    2392 non-null   int64  
 10  Sports             2392 non-null   int64  
 11  Music              2392 non-null   int64  
 12  Volunteering       2392 non-null   int64  
 13  GPA                2392 non-null   float64
 14  GradeClass         2392 non-null   float64
dtypes: float64(3), int64(12)
memory usage: 280.4 KB


In [14]:
df.columns

Index(['StudentID', 'Age', 'Gender', 'Ethnicity', 'ParentalEducation',
       'StudyTimeWeekly', 'Absences', 'Tutoring', 'ParentalSupport',
       'Extracurricular', 'Sports', 'Music', 'Volunteering', 'GPA',
       'GradeClass'],
      dtype='object')

In [16]:
df.describe

<bound method NDFrame.describe of       StudentID  Age  Gender  Ethnicity  ParentalEducation  StudyTimeWeekly  \
0          1001   17       1          0                  2        19.833723   
1          1002   18       0          0                  1        15.408756   
2          1003   15       0          2                  3         4.210570   
3          1004   17       1          0                  3        10.028829   
4          1005   17       1          0                  2         4.672495   
...         ...  ...     ...        ...                ...              ...   
2387       3388   18       1          0                  3        10.680555   
2388       3389   17       0          0                  1         7.583217   
2389       3390   16       1          0                  2         6.805500   
2390       3391   16       1          1                  0        12.416653   
2391       3392   16       1          0                  2        17.819907   

      Absences  T

In [18]:
df.dtypes

StudentID              int64
Age                    int64
Gender                 int64
Ethnicity              int64
ParentalEducation      int64
StudyTimeWeekly      float64
Absences               int64
Tutoring               int64
ParentalSupport        int64
Extracurricular        int64
Sports                 int64
Music                  int64
Volunteering           int64
GPA                  float64
GradeClass           float64
dtype: object

### *Handling Missing Values

In [21]:
df.isna().sum()

StudentID            0
Age                  0
Gender               0
Ethnicity            0
ParentalEducation    0
StudyTimeWeekly      0
Absences             0
Tutoring             0
ParentalSupport      0
Extracurricular      0
Sports               0
Music                0
Volunteering         0
GPA                  0
GradeClass           0
dtype: int64

### *Feature Engineering

In [24]:
df.GradeClass = df.GradeClass.astype('object')

In [26]:
df.dtypes

StudentID              int64
Age                    int64
Gender                 int64
Ethnicity              int64
ParentalEducation      int64
StudyTimeWeekly      float64
Absences               int64
Tutoring               int64
ParentalSupport        int64
Extracurricular        int64
Sports                 int64
Music                  int64
Volunteering           int64
GPA                  float64
GradeClass            object
dtype: object

In [28]:
df.head(1)

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0


### *Handling Outliers

### *Data Vizulisation

# Model building

### *Train Test Split

In [36]:
x = df.drop(columns=['StudentID','GradeClass'])
y = df['GradeClass']

In [38]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [40]:
for i in x_train,y_train,x_test,y_test :
    print('-----'*25)
    print(i.head())
    print('-----'*25)

-----------------------------------------------------------------------------------------------------------------------------
      Age  Gender  Ethnicity  ParentalEducation  StudyTimeWeekly  Absences  \
642    18       1          0                  4        18.118879        24   
1752   16       1          3                  2         2.469271        19   
1401   17       0          2                  3         3.456401        25   
2032   17       1          0                  1        15.838131        25   
990    16       0          0                  0         1.045373        19   

      Tutoring  ParentalSupport  Extracurricular  Sports  Music  Volunteering  \
642          1                0                1       1      0             0   
1752         0                1                0       0      0             1   
1401         0                0                0       0      0             0   
2032         1                2                0       0      0             0   


### *Model Initialization

In [43]:
model = LinearRegression()

### *Model Training

In [46]:
model.fit(x_train,y_train)

### *Hyper Parameter tuning

# evaluation & Result:

In [50]:
y_pred = model.predict(x_test)
print(y_pred)

[3.67786878 1.78069216 2.86699311 1.24359547 4.66764951 2.25459738
 2.96493866 2.66096251 2.61605697 4.22943703 2.58970013 4.11339926
 1.91317549 2.51753908 2.07198351 4.70018653 3.80339147 3.29656461
 2.86618694 2.49304657 3.08553722 3.41829248 3.13639578 1.61301277
 3.17836611 3.99180737 3.4718908  1.80538999 2.63021908 2.57230173
 2.88622201 4.12993062 1.98457271 1.27146879 3.21735127 3.0100534
 1.54479259 1.99578652 1.49265667 4.5536996  2.16752811 1.15219507
 3.81017517 2.83959503 4.01290383 2.55886696 2.01888771 3.54446535
 2.19966507 3.4058196  1.52151597 1.45046564 3.49845045 3.7091477
 3.00349106 1.3377347  2.54847087 1.48682068 4.25226716 2.73408277
 3.15583396 2.71391727 2.31703939 3.53732699 3.95833281 2.88027698
 1.71556407 2.50646755 3.79836891 1.52759136 1.2358123  3.75939681
 1.8346735  2.31285804 4.490379   2.14969002 2.91372579 3.13889649
 4.69120172 3.55904793 2.89366147 1.58928058 2.55203426 3.2300333
 3.41960779 4.45178794 0.83855745 3.7368312  3.56992056 3.2623822

In [52]:
mae= mean_absolute_error(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
r2_score = r2_score(y_pred, y_test)

print(f'mean_absolute_error:  {mae}')
print(f'mean_squared_error:\n  {mse}')
print(f'r2_score:\n  {r2_score}')


mean_absolute_error:  0.5423598590717797
mean_squared_error:
  0.5437997812834355
r2_score:
  0.41220122532179304


In [54]:
new_data ={
    'StudentID': [1002],
    'Age':[17],
    'Gender':[1],
    'Ethnicity': [0],
    'ParentalEducation': [2],
    'StudyTimeWeekly': [19.833723],
    'Absences': [7],
    'Tutoring': [1],
    'ParentalSupport': [2],
    'Extracurricular': [0],
    'Sports': [0],
    'Music': [1],
    'Volunteering': [2],
    'GPA': [2.929196]							
} 

df = pd.DataFrame(new_data)


# Drop the 'Id' column
new_X = df.drop(columns=['StudentID'])

# # Apply the same scaler used for the training data
# new_X_scaled = scaler.transform(new_X)

# # Convert the scaled data back to a DataFrame
# new_df_scaled = pd.DataFrame(new_X_scaled, columns=new_X.columns)

new_predictions = model.predict(new_X)

# Add predictions to the new data
df['GradeClass'] = new_predictions

print(df)

   StudentID  Age  Gender  Ethnicity  ParentalEducation  StudyTimeWeekly  \
0       1002   17       1          0                  2        19.833723   

   Absences  Tutoring  ParentalSupport  Extracurricular  Sports  Music  \
0         7         1                2                0       0      1   

   Volunteering       GPA  GradeClass  
0             2  2.929196    2.137397  


## Hyper Parameter Tuning