In [54]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import matplotlib
from matplotlib.pyplot import figure
%matplotlib inline
matplotlib.rcParams["figure.figsize"] = (12, 8)

print("Block Executed")

Block Executed


In [55]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

## Step 1: Importing Dataset

In [56]:
df = pd.read_csv('Student.csv', skipinitialspace=True)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [57]:
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


## Step 2: Data Information

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               6607 non-null   int64 
 1   Attendance                  6607 non-null   int64 
 2   Parental_Involvement        6607 non-null   object
 3   Access_to_Resources         6607 non-null   object
 4   Extracurricular_Activities  6607 non-null   object
 5   Sleep_Hours                 6607 non-null   int64 
 6   Previous_Scores             6607 non-null   int64 
 7   Motivation_Level            6607 non-null   object
 8   Internet_Access             6607 non-null   object
 9   Tutoring_Sessions           6607 non-null   int64 
 10  Family_Income               6607 non-null   object
 11  Teacher_Quality             6529 non-null   object
 12  School_Type                 6607 non-null   object
 13  Peer_Influence              6607 non-null   obje

In [59]:
df.isnull().sum()

Hours_Studied                  0
Attendance                     0
Parental_Involvement           0
Access_to_Resources            0
Extracurricular_Activities     0
Sleep_Hours                    0
Previous_Scores                0
Motivation_Level               0
Internet_Access                0
Tutoring_Sessions              0
Family_Income                  0
Teacher_Quality               78
School_Type                    0
Peer_Influence                 0
Physical_Activity              0
Learning_Disabilities          0
Parental_Education_Level      90
Distance_from_Home            67
Gender                         0
Exam_Score                     0
dtype: int64

In [60]:
df.shape

(6607, 20)

## Step 3: Data Preprocessing

#### Step 3(1): Finding Missing Values

In [61]:
miss = df.isnull().sum()
print(miss)

Hours_Studied                  0
Attendance                     0
Parental_Involvement           0
Access_to_Resources            0
Extracurricular_Activities     0
Sleep_Hours                    0
Previous_Scores                0
Motivation_Level               0
Internet_Access                0
Tutoring_Sessions              0
Family_Income                  0
Teacher_Quality               78
School_Type                    0
Peer_Influence                 0
Physical_Activity              0
Learning_Disabilities          0
Parental_Education_Level      90
Distance_from_Home            67
Gender                         0
Exam_Score                     0
dtype: int64


#### Step 3(2): Replace missing values with the most frequent in the column.

In [62]:
df['Teacher_Quality'].fillna(df['Teacher_Quality'].mode()[0], inplace=True)
df['Parental_Education_Level'].fillna(df['Parental_Education_Level'].mode()[0], inplace=True)
df['Distance_from_Home'].fillna(df['Distance_from_Home'].mode()[0], inplace=True)

#### Step 3(3): Encoding

In [63]:
label_encoder = LabelEncoder()

categorical_features = ['Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities',
                        'Motivation_Level', 'Internet_Access', 'Family_Income', 'Teacher_Quality', 
                        'School_Type', 'Peer_Influence', 'Learning_Disabilities', 'Parental_Education_Level',
                        'Distance_from_Home', 'Gender']
for feature in categorical_features:
    df[feature] = label_encoder.fit_transform(df[feature])

df.head

<bound method NDFrame.head of       Hours_Studied  Attendance  Parental_Involvement  Access_to_Resources  \
0                23          84                     1                    0   
1                19          64                     1                    2   
2                24          98                     2                    2   
3                29          89                     1                    2   
4                19          92                     2                    2   
5                19          88                     2                    2   
6                29          84                     2                    1   
7                25          78                     1                    0   
8                17          94                     2                    0   
9                23          98                     2                    2   
10               17          80                     1                    0   
11               17          97   

## Step 4: Splitting to test, validation set

In [64]:
X = df.drop('Exam_Score', axis=1) 
y = df['Exam_Score']

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [66]:
scaler = MinMaxScaler()

# Fit on the training set and transform it
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)

# Only transform the test set using the fitted scaler
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

In [67]:
# Building the model
ann = tf.keras.models.Sequential()
ann.add(tf.keras.layers.Dense(units=128, activation='relu',  kernel_regularizer=tf.keras.regularizers.l2(0.01)))
ann.add(tf.keras.layers.BatchNormalization())
ann.add(tf.keras.layers.Dropout(0.3))

ann.add(tf.keras.layers.Dense(units=1, activation='relu'))

# Compile the model with a loss function appropriate for regression
ann.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
ann.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 881us/step - loss: 4264.2539 - val_loss: 3387.6157
Epoch 2/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 544us/step - loss: 2799.7283 - val_loss: 1097.3680
Epoch 3/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 538us/step - loss: 1473.4241 - val_loss: 696.6340
Epoch 4/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 536us/step - loss: 748.5502 - val_loss: 70.6327
Epoch 5/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 536us/step - loss: 86.3293 - val_loss: 24.5182
Epoch 6/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 523us/step - loss: 29.3073 - val_loss: 8.7362
Epoch 7/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 542us/step - loss: 29.7771 - val_loss: 5.1809
Epoch 8/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 539us/step - loss: 26.7645 - val_loss

<keras.src.callbacks.history.History at 0x168a9b850>

In [68]:
y_pred = ann.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print('Mean Absolute Error:', mae)

[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 740us/step
Mean Absolute Error: 0.8583749087242785
