In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Load the dataset
data = pd.read_csv("student.csv", sep=",")

In [3]:
# Display the first few rows of the dataset
print(data.head())

  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  ...  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher  ...   
1     GP   F   17       U     GT3       T     1     1  at_home     other  ...   
2     GP   F   15       U     LE3       T     1     1  at_home     other  ...   
3     GP   F   15       U     GT3       T     4     2   health  services  ...   
4     GP   F   16       U     GT3       T     3     3    other     other  ...   

  feespaid  ecactivities  internet  freetime goout health absences  G1  G2  G3  
0       no            no        no         3     4      3        6   5   6   6  
1       no            no       yes         3     3      3        4   5   5   6  
2      yes            no       yes         3     2      3       10   7   8  10  
3      yes           yes       yes         2     2      5        2  15  14  15  
4      yes            no        no         3     2      5        4   6  10  10  

[5 rows x 26 columns]


In [4]:
# Perform exploratory data analysis (EDA)
print("Exploratory Data Analysis:")
# Display information about the dataset (columns, data types, non-null counts)
print(data.info())
# Display summary statistics
print(data.describe())

Exploratory Data Analysis:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   school        395 non-null    object
 1   sex           395 non-null    object
 2   age           395 non-null    int64 
 3   address       395 non-null    object
 4   famsize       395 non-null    object
 5   Pstatus       395 non-null    object
 6   Medu          395 non-null    int64 
 7   Fedu          395 non-null    int64 
 8   Mjob          393 non-null    object
 9   Fjob          395 non-null    object
 10  guardian      395 non-null    object
 11  traveltime    395 non-null    int64 
 12  studytime     395 non-null    int64 
 13  failures      395 non-null    int64 
 14  Ssupport      395 non-null    object
 15  Gsupport      395 non-null    object
 16  feespaid      395 non-null    object
 17  ecactivities  395 non-null    object
 18  internet      395 non-n

In [5]:
# Select specific columns for analysis (G1, G2, G3, studytime, failures, absences)
data = data[["G1", "G2", "G3", "studytime", "failures", "absences"]]

In [6]:
# Define the target variable (G3) and features
predict = "G3"
X = np.array(data.drop(columns=[predict]))   # Features (excluding G3)
y = np.array(data[predict])  # Labels (G3)

In [7]:
# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)

In [8]:
# Initialize the Random Forest Regressor model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(x_train, y_train)

In [9]:

# Make predictions on the test set
y_predictions = rf_model.predict(x_test)

# Evaluate the model
accuracy = rf_model.score(x_test, y_test)
r_squared = r2_score(y_test, y_predictions)
mae = mean_absolute_error(y_test, y_predictions)
mse = mean_squared_error(y_test, y_predictions)
rmse = np.sqrt(mse)

# Display evaluation metrics
print(f"Model Accuracy: {accuracy}")
print(f"Model Accuracy (R-squared): {r_squared}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

Model Accuracy: 0.6360912088089655
Model Accuracy (R-squared): 0.6360912088089655
Mean Absolute Error (MAE): 1.5007151785714288
Mean Squared Error (MSE): 5.043775845907739
Root Mean Squared Error (RMSE): 2.2458352223410647


In [10]:
# Display actual vs. predicted values
print("\nActual vs. Predicted Values:")
for i in range(len(y_predictions)):
    print(f"Predicted: {y_predictions[i]:.2f} | Actual: {y_test[i]} | Features: {x_test[i]}")


Actual vs. Predicted Values:
Predicted: 8.69 | Actual: 10 | Features: [10  9  3  0  2]
Predicted: 6.62 | Actual: 9 | Features: [8 7 2 0 6]
Predicted: 15.30 | Actual: 15 | Features: [13 15  3  0  0]
Predicted: 13.04 | Actual: 14 | Features: [13 13  3  0  0]
Predicted: 5.80 | Actual: 10 | Features: [ 7 10  3  1  0]
Predicted: 1.73 | Actual: 0 | Features: [6 7 2 3 0]
Predicted: 2.46 | Actual: 0 | Features: [9 8 2 1 0]
Predicted: 14.15 | Actual: 14 | Features: [12 14  3  0  7]
Predicted: 12.97 | Actual: 11 | Features: [13 13  2  0  2]
Predicted: 13.57 | Actual: 13 | Features: [13 13  2  0 23]
Predicted: 6.49 | Actual: 10 | Features: [10 10  2  0  0]
Predicted: 11.64 | Actual: 11 | Features: [12 11  1  0 16]
Predicted: 15.84 | Actual: 18 | Features: [15 16  3  0  2]
Predicted: 15.23 | Actual: 15 | Features: [13 15  1  0  0]
Predicted: 6.68 | Actual: 10 | Features: [11 10  2  0  0]
Predicted: 3.70 | Actual: 9 | Features: [10 10  3  0  0]
Predicted: 14.10 | Actual: 14 | Features: [14 14  1  