In [66]:
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [67]:
# Load the extended student dataset from the working directory.
raw_csv_path = 'student-por-extended.csv'
df = pd.read_csv(raw_csv_path)

In [68]:
# Pairwise correlation between all columns of the loaded frame
# (Pearson is pandas' default method; spelled out here for the reader).
correlation_matrix = df.corr(method='pearson')

In [69]:
# Score each column by the row-wise sum of its absolute correlations
# (the column's own self-correlation is part of that sum), then keep the
# names of the ten highest-scoring columns.
# NOTE(review): this ranks columns by overall inter-correlation, not by their
# correlation with the target G3 -- confirm that this is the intended criterion.
absolute_correlations = correlation_matrix.abs()
correlation_sums = absolute_correlations.sum(axis=1)

ranked_sums = correlation_sums.sort_values(ascending=False)
top_features = ranked_sums.index[:10]

print("Топ характеристики за кореляцією:", top_features)

Топ характеристики за кореляцією: Index(['G1', 'G2', 'G3', 'Medu', 'higher_yes', 'Fedu', 'school_MS', 'failures',
       'studytime', 'internet_yes'],
      dtype='object')


In [70]:
# Hand-picked subset of columns to keep for modelling (target G3 included).
# The list is hard-coded rather than taken from `top_features`.
selected_features = [
    'G1', 'G2', 'G3',
    'Medu', 'higher_yes', 'Fedu',
    'failures', 'studytime', 'internet_yes',
]
df_selected = df.loc[:, selected_features]

# Write the reduced dataset next to the notebook, without the row index.
reduced_csv_path = 'selected_features_dataset.csv'
df_selected.to_csv(reduced_csv_path, index=False)

In [71]:
# Read the reduced dataset back from disk. This rebinds `df`: from this cell
# on, `df` refers to the reduced CSV contents, not the original file.
selected_csv_path = 'selected_features_dataset.csv'
df = pd.read_csv(selected_csv_path)

# Column names, dtypes and non-null counts of the reduced frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1352 entries, 0 to 1351
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   G1            1352 non-null   int64
 1   G2            1352 non-null   int64
 2   G3            1352 non-null   int64
 3   Medu          1352 non-null   int64
 4   higher_yes    1352 non-null   bool 
 5   Fedu          1352 non-null   int64
 6   failures      1352 non-null   int64
 7   studytime     1352 non-null   int64
 8   internet_yes  1352 non-null   bool 
dtypes: bool(2), int64(7)
memory usage: 76.7 KB


In [72]:
# Preview the first five rows (five is the default for `head`).
df.head()

Unnamed: 0,G1,G2,G3,Medu,higher_yes,Fedu,failures,studytime,internet_yes
0,0,11,11,4,True,4,0,2,False
1,9,11,11,1,True,1,0,2,True
2,12,13,12,1,True,1,0,2,True
3,14,14,14,4,True,2,0,3,True
4,11,13,13,3,True,3,0,2,False


In [73]:
# Feature matrix: every column except the target G3.
target_column = 'G3'
X = df.drop(columns=[target_column])
X.head()

Unnamed: 0,G1,G2,Medu,higher_yes,Fedu,failures,studytime,internet_yes
0,0,11,4,True,4,0,2,False
1,9,11,1,True,1,0,2,True
2,12,13,1,True,1,0,2,True
3,14,14,4,True,2,0,3,True
4,11,13,3,True,3,0,2,False


In [74]:
# Target vector: the G3 column on its own.
y = df.loc[:, 'G3']
y.head()

Unnamed: 0,G3
0,11
1,11
2,12
3,14
4,13


In [75]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Sanity check: shapes of the train and test feature matrices.
print(X_train.shape, X_test.shape)

(1081, 8) (271, 8)


In [76]:
# Random forest baseline with default hyper-parameters.
# The forest is stochastic, so it is seeded (same seed as the train/test
# split) to make the reported score and the pickled model reproducible under
# Restart & Run All.
rf_pred = RandomForestRegressor(random_state=42)
rf_pred.fit(X_train, y_train)

# `score` on the held-out split (R^2 for scikit-learn regressors).
rf_pred.score(X_test, y_test)

0.9444465877846018

In [77]:
# Gradient-boosted trees with library-default hyper-parameters.
# `fit` returns the estimator itself, so construction and training are chained.
xgb_pred = XGBRegressor().fit(X_train, y_train)

# Score of the fitted model on the held-out split.
xgb_pred.score(X_test, y_test)

0.942564070224762

In [78]:
# Single decision tree baseline with default hyper-parameters.
# Seeded (same seed as the train/test split) because tie-breaking between
# equally good splits is random, which otherwise makes the score and the
# pickled model vary between runs.
dt_pred = DecisionTreeRegressor(random_state=42)
dt_pred.fit(X_train, y_train)

# `score` on the held-out split (R^2 for scikit-learn regressors).
dt_pred.score(X_test, y_test)

0.9056911063630643

In [79]:
# Persist the fitted random forest so it can be reused without retraining.
# `pickle` comes from the notebook's top import cell rather than being
# imported mid-notebook.
with open('random_forest_model.pkl', 'wb') as model_file:
    pickle.dump(rf_pred, model_file)

print("Модель успішно збережено як 'random_forest_model.pkl'!")

Модель успішно збережено як 'random_forest_model.pkl'!


In [80]:
# Round-trip check: read the pickled random forest back and score it on the
# held-out split.
# Only unpickle files you trust -- unpickling can execute arbitrary code.
with open('random_forest_model.pkl', 'rb') as model_file:
    loaded_model = pickle.loads(model_file.read())

loaded_score = loaded_model.score(X_test, y_test)
print("Оцінка завантаженої моделі на тестових даних:", loaded_score)

Оцінка завантаженої моделі на тестових даних: 0.9444465877846018


In [81]:
# Serialise the fitted XGBoost regressor and write the bytes to disk.
with open('xgb_regressor_model.pkl', 'wb') as model_file:
    model_file.write(pickle.dumps(xgb_pred))

print("Модель успішно збережено як 'xgb_regressor_model.pkl'!")

Модель успішно збережено як 'xgb_regressor_model.pkl'!


In [82]:
# Round-trip check: read the pickled XGBoost regressor back and score it on
# the held-out split. Rebinds `loaded_model`.
# Only unpickle files you trust -- unpickling can execute arbitrary code.
with open('xgb_regressor_model.pkl', 'rb') as model_file:
    loaded_model = pickle.loads(model_file.read())

loaded_score = loaded_model.score(X_test, y_test)
print("Оцінка завантаженої моделі на тестових даних:", loaded_score)

Оцінка завантаженої моделі на тестових даних: 0.942564070224762


In [83]:
# Serialise the fitted decision tree and write the bytes to disk.
with open('decision_tree_model.pkl', 'wb') as model_file:
    model_file.write(pickle.dumps(dt_pred))

print("Модель успішно збережено як 'decision_tree_model.pkl'!")

Модель успішно збережено як 'decision_tree_model.pkl'!


In [84]:
# Round-trip check: read the pickled decision tree back and score it on the
# held-out split. Rebinds `loaded_model`.
# Only unpickle files you trust -- unpickling can execute arbitrary code.
with open('decision_tree_model.pkl', 'rb') as model_file:
    loaded_model = pickle.loads(model_file.read())

loaded_score = loaded_model.score(X_test, y_test)
print("Оцінка завантаженої моделі на тестових даних:", loaded_score)

Оцінка завантаженої моделі на тестових даних: 0.9056911063630643
