In [39]:
import pandas as pd
import numpy as np
import tensorflow as tf

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score, mean_absolute_error, mean_squared_error, r2_score

<h2>1. Load the Data</h2>

In [2]:
data = pd.read_csv('student-mat.csv')
data

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


<h2>2. Preprocess the Data (Classification)</h2>

In [9]:
# Change from categorical to numerical
label_encoder = LabelEncoder()
data['school'] = label_encoder.fit_transform(data['school'])
data['sex'] = label_encoder.fit_transform(data['sex'])
data['address'] = label_encoder.fit_transform(data['address'])
data['Pstatus'] = label_encoder.fit_transform(data['Pstatus'])

In [12]:
X = data.drop(['Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famsize'], axis=1)
y = data['sex']

In [13]:
X

Unnamed: 0,school,sex,age,address,Pstatus,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,0,0,18,1,0,4,4,2,2,0,4,3,4,1,1,3,6,5,6,6
1,0,0,17,1,1,1,1,1,2,0,5,3,3,1,1,3,4,5,5,6
2,0,0,15,1,1,1,1,1,2,3,4,3,2,2,3,3,10,7,8,10
3,0,0,15,1,1,4,2,1,3,0,3,2,2,1,1,5,2,15,14,15
4,0,0,16,1,1,3,3,1,2,0,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,1,1,20,1,0,2,2,1,2,2,5,5,4,4,5,4,11,9,9,9
391,1,1,17,1,1,3,1,2,1,0,2,4,5,3,4,2,3,14,16,16
392,1,1,21,0,1,1,1,1,1,3,5,5,3,3,3,3,3,10,8,7
393,1,1,18,0,1,3,2,3,1,0,4,4,1,3,4,5,0,11,12,10


<h2>3. Construct and Train the Model (Classification)</h2>

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Create and train a RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the standardized test set
y_pred = clf.predict(X_test)

<h2>4. Evaluate the Model (Classification)</h2>

In [23]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

In [24]:
print("Accuracy: {:.4f}".format(accuracy))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1 Score: {:.4f}".format(f1))
print("\nConfusion Matrix:")
print(conf_matrix)

Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000

Confusion Matrix:
[[38  0]
 [ 0 41]]


In [28]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        38
           1       1.00      1.00      1.00        41

    accuracy                           1.00        79
   macro avg       1.00      1.00      1.00        79
weighted avg       1.00      1.00      1.00        79



<h2>5. Preprocess the Data (Regression)</h2>

In [29]:
# Change from categorical to numerical
label_encoder = LabelEncoder()
data['school'] = label_encoder.fit_transform(data['school'])
data['sex'] = label_encoder.fit_transform(data['sex'])
data['address'] = label_encoder.fit_transform(data['address'])
data['Pstatus'] = label_encoder.fit_transform(data['Pstatus'])

In [30]:
X = data.drop(['Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famsize'], axis=1)
y = data['sex']

<h2>6. Construct and Train the Model (Regression)</h2>

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
# Create and train a RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the standardized test set
y_pred = clf.predict(X_test)

<h2>7. Evaluate the Model (Regression)</h2>

In [40]:
y_pred = clf.predict(X_test)

In [41]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [42]:
print("Mean Absolute Error (MAE): {:.4f}".format(mae))
print("Mean Squared Error (MSE): {:.4f}".format(mse))
print("R-squared (R²): {:.4f}".format(r2))

Mean Absolute Error (MAE): 0.0000
Mean Squared Error (MSE): 0.0000
R-squared (R²): 1.0000
