In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.feature_selection import VarianceThreshold
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by the
# libraries imported above; consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")
from imblearn.over_sampling import ADASYN, SMOTE
from collections import Counter

In [3]:
# Read the raw student dataset and one-hot encode it in a single expression.
# drop_first=True drops the first level of every encoded column, so a
# k-level categorical becomes k-1 indicator columns.
df = pd.get_dummies(
    pd.read_csv('student-por.csv'),
    drop_first=True,
)

# Column names, non-null counts and dtypes of the encoded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 42 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   age                649 non-null    int64
 1   Medu               649 non-null    int64
 2   Fedu               649 non-null    int64
 3   traveltime         649 non-null    int64
 4   studytime          649 non-null    int64
 5   failures           649 non-null    int64
 6   famrel             649 non-null    int64
 7   freetime           649 non-null    int64
 8   goout              649 non-null    int64
 9   Dalc               649 non-null    int64
 10  Walc               649 non-null    int64
 11  health             649 non-null    int64
 12  absences           649 non-null    int64
 13  G1                 649 non-null    int64
 14  G2                 649 non-null    int64
 15  G3                 649 non-null    int64
 16  school_MS          649 non-null    bool 
 17  sex_M           

In [4]:
# Separate the encoded features from the final grade G3, which SMOTE below
# treats as a class label.
X = df.drop(columns='G3')
y = df['G3']

grade_counts = Counter(y)
print('Original dataset shape %s' % grade_counts)

# SMOTE synthesises a point by interpolating between a sample and one of its
# k nearest neighbours of the same class, so every class needs more than k
# members. Derive the too-rare grades from the counts instead of hard-coding
# them, and build the row mask once so X and y cannot drift apart.
SMOTE_K_NEIGHBORS = 5  # imbalanced-learn's default, made explicit
rare_grades = [
    grade for grade, count in grade_counts.items()
    if count <= SMOTE_K_NEIGHBORS
]
keep_mask = ~y.isin(rare_grades)

# Rows with a rare grade are dropped entirely; they do not appear in the
# exported file.
X_filtered = X[keep_mask]
y_filtered = y[keep_mask]

print(Counter(y_filtered))

# 'auto' oversamples every class except the majority up to the majority size.
smote = SMOTE(
    sampling_strategy='auto',
    k_neighbors=SMOTE_K_NEIGHBORS,
    random_state=42,
)

X_res, y_res = smote.fit_resample(X_filtered, y_filtered)

print('Resampled dataset shape %s' % Counter(y_res))

# NOTE(review): the oversampled frame is written out and later split into
# train/test sets. Synthetic rows are interpolated from real rows, so rows on
# both sides of that later split can share source samples; scores computed on
# this file are likely optimistic. Consider splitting first and resampling the
# training part only.
resampled_df = pd.DataFrame(X_res, columns=X.columns)
resampled_df['G3'] = y_res

resampled_df.to_csv('student-por-extended.csv', index=False)

Original dataset shape Counter({11: 104, 10: 97, 13: 82, 12: 72, 14: 63, 15: 49, 16: 36, 9: 35, 8: 35, 17: 29, 18: 15, 0: 15, 7: 10, 6: 3, 19: 2, 1: 1, 5: 1})
Counter({11: 104, 10: 97, 13: 82, 12: 72, 14: 63, 15: 49, 16: 36, 9: 35, 8: 35, 17: 29, 18: 15, 0: 15, 7: 10})
Resampled dataset shape Counter({11: 104, 12: 104, 14: 104, 13: 104, 17: 104, 15: 104, 7: 104, 10: 104, 16: 104, 9: 104, 8: 104, 18: 104, 0: 104})


In [5]:
# Reload the oversampled dataset from disk; this rebinds `df`, replacing the
# frame that was loaded from the raw CSV.
df = pd.read_csv('student-por-extended.csv')

# Column names, non-null counts and dtypes of the reloaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1352 entries, 0 to 1351
Data columns (total 42 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   age                1352 non-null   int64
 1   Medu               1352 non-null   int64
 2   Fedu               1352 non-null   int64
 3   traveltime         1352 non-null   int64
 4   studytime          1352 non-null   int64
 5   failures           1352 non-null   int64
 6   famrel             1352 non-null   int64
 7   freetime           1352 non-null   int64
 8   goout              1352 non-null   int64
 9   Dalc               1352 non-null   int64
 10  Walc               1352 non-null   int64
 11  health             1352 non-null   int64
 12  absences           1352 non-null   int64
 13  G1                 1352 non-null   int64
 14  G2                 1352 non-null   int64
 15  school_MS          1352 non-null   bool 
 16  sex_M              1352 non-null   bool 
 17  address_U     

In [6]:
# Compute the pairwise correlation matrix of all columns in df.
correlation_matrix = df.corr()

In [None]:
# Sum of absolute correlations for each column (one row of the matrix per column).
# NOTE(review): this ranks a column by how strongly it correlates with *all*
# columns, its own diagonal entry and the target G3 included, rather than by
# its correlation with G3 alone. Confirm this is the intended criterion.
correlation_sums = correlation_matrix.abs().sum(axis=1)

# Keep the 10 columns with the largest sums.
top_features = correlation_sums.sort_values(ascending=False).head(10).index

# Print the names of the selected columns.
print("Топ характеристики за кореляцією:", top_features)

Топ характеристики за кореляцією: Index(['G1', 'G2', 'G3', 'Medu', 'higher_yes', 'Fedu', 'school_MS', 'failures',
       'studytime', 'internet_yes'],
      dtype='object')


In [None]:
# Hand-picked columns to carry forward; the target G3 is kept in the list so
# it is exported together with the predictors.
selected_features = [
    'G1',
    'G2',
    'G3',
    'Medu',
    'higher_yes',
    'Fedu',
    'failures',
    'studytime',
    'internet_yes',
]

# Take all rows for those columns and persist them without the index column.
df_selected = df.loc[:, selected_features]
df_selected.to_csv('selected_features_dataset.csv', index=False)

In [7]:
# Reload the reduced dataset from disk; this rebinds `df` once more.
df = pd.read_csv('selected_features_dataset.csv')

# Column names, non-null counts and dtypes of the reduced frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1352 entries, 0 to 1351
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   G1            1352 non-null   int64
 1   G2            1352 non-null   int64
 2   G3            1352 non-null   int64
 3   Medu          1352 non-null   int64
 4   higher_yes    1352 non-null   bool 
 5   Fedu          1352 non-null   int64
 6   failures      1352 non-null   int64
 7   studytime     1352 non-null   int64
 8   internet_yes  1352 non-null   bool 
dtypes: bool(2), int64(7)
memory usage: 76.7 KB


In [None]:
# Preview the first five rows of the reduced frame.
df.head(5)

Unnamed: 0,G1,G2,G3,Medu,higher_yes,Fedu,failures,studytime,internet_yes
0,0,11,11,4,True,4,0,2,False
1,9,11,11,1,True,1,0,2,True
2,12,13,12,1,True,1,0,2,True
3,14,14,14,4,True,2,0,3,True
4,11,13,13,3,True,3,0,2,False


In [8]:
# Feature matrix: every column except the target G3.
X = df.drop(columns=['G3'])
X.head()

Unnamed: 0,G1,G2,Medu,higher_yes,Fedu,failures,studytime,internet_yes
0,0,11,4,True,4,0,2,False
1,9,11,1,True,1,0,2,True
2,12,13,1,True,1,0,2,True
3,14,14,4,True,2,0,3,True
4,11,13,3,True,3,0,2,False


In [9]:
# Target vector: the G3 column.
y = df['G3']
y.head()

Unnamed: 0,G3
0,11
1,11
2,12
3,14
4,13


In [10]:
# Hold out 20% of the rows for evaluation; the seed keeps the partition
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the shapes of the two feature partitions.
print(X_train.shape, X_test.shape)

(1081, 8) (271, 8)


In [16]:
# Random-forest regressor with library defaults, fitted on the training split.
# The forest is randomised (e.g. bootstrap resampling of the training rows),
# so a fixed seed is needed for the score below, and any model later
# serialised from `rf_pred`, to be reproducible after a kernel restart.
rf_pred = RandomForestRegressor(random_state=42)
rf_pred.fit(X_train, y_train)

# R^2 on the held-out split (the regressor's default `score` metric).
rf_pred.score(X_test, y_test)

0.9457398863006963

In [17]:
# Gradient-boosted regressor with default hyper-parameters, fitted on the
# training split; the last expression displays its score on the held-out split.
xgb_pred = XGBRegressor()
xgb_pred.fit(X_train,y_train)
xgb_pred.score(X_test,y_test)

0.942564070224762

In [18]:
# Single decision-tree regressor with library defaults, fitted on the
# training split.
# scikit-learn permutes the candidate features at every split, so equally
# good splits can be resolved differently from run to run; the seed makes the
# fitted tree, and the score below, repeatable.
dt_pred = DecisionTreeRegressor(random_state=42)
dt_pred.fit(X_train, y_train)

# R^2 on the held-out split (the regressor's default `score` metric).
dt_pred.score(X_test, y_test)

0.8926747349717044

In [19]:
from sklearn.model_selection import ShuffleSplit, cross_val_score

# Five independent random 80/20 partitions; the splitter is seeded so every
# run evaluates on the same folds.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

# Seed the forest as well: with an unseeded estimator the scores still change
# between runs even though the folds are fixed.
# NOTE(review): X and y are read from the SMOTE-oversampled file, so synthetic
# rows can sit in both the fit and the evaluation part of a fold; treat these
# scores as optimistic.
cross_val_score(RandomForestRegressor(random_state=42), X, y, cv=cv)

array([0.91750312, 0.91789742, 0.93547688, 0.93101009, 0.92937881])

In [20]:
# Same splitter configuration as used for the random forest: five random
# 80/20 partitions with a fixed seed.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

# One score per split for a default XGBRegressor.
cross_val_score(XGBRegressor(), X, y, cv=cv)

array([0.91099703, 0.92891878, 0.93540698, 0.93081874, 0.94366539])

In [21]:
# Five independent random 80/20 partitions with a fixed seed, so every run
# evaluates on the same folds.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

# Seed the tree too: tie-breaking between equally good splits is randomised,
# so an unseeded tree can score differently on identical folds.
cross_val_score(DecisionTreeRegressor(random_state=42), X, y, cv=cv)

array([0.89605282, 0.88905898, 0.89021259, 0.87006287, 0.92436672])

In [22]:
import pickle

# Serialise the fitted random forest into the current working directory.
with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(rf_pred, file)

# Confirmation message (Ukrainian: "Model successfully saved as ...").
print("Модель успішно збережено як 'random_forest_model.pkl'!")

Модель успішно збережено як 'random_forest_model.pkl'!


In [23]:
# Load the model back from the file.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles that were produced by a trusted source.
with open('random_forest_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Sanity-check the loaded model by scoring it on the held-out split.
print("Оцінка завантаженої моделі на тестових даних:", loaded_model.score(X_test, y_test))

Оцінка завантаженої моделі на тестових даних: 0.9457398863006963


In [24]:
# Serialise the fitted XGBoost regressor into the server's artifacts folder.
# The relative path is resolved against the current working directory, and
# open() does not create missing directories.
with open('../server/artifacts/xgb_regressor_model.pkl', 'wb') as file:
    pickle.dump(xgb_pred, file)

# Confirmation message (Ukrainian: "Model successfully saved as ...").
print("Модель успішно збережено як 'xgb_regressor_model.pkl'!")

Модель успішно збережено як 'xgb_regressor_model.pkl'!


In [25]:
# Read the XGBoost pickle back; this rebinds `loaded_model`.
with open('../server/artifacts/xgb_regressor_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Score the loaded model on the held-out split to confirm the round trip.
print("Оцінка завантаженої моделі на тестових даних:", loaded_model.score(X_test, y_test))

Оцінка завантаженої моделі на тестових даних: 0.942564070224762


In [26]:
# Serialise the fitted decision tree into the server's artifacts folder.
# The relative path is resolved against the current working directory, and
# open() does not create missing directories.
with open('../server/artifacts/decision_tree_model.pkl', 'wb') as file:
    pickle.dump(dt_pred, file)

# Confirmation message (Ukrainian: "Model successfully saved as ...").
print("Модель успішно збережено як 'decision_tree_model.pkl'!")

Модель успішно збережено як 'decision_tree_model.pkl'!


In [27]:
# Read the decision-tree pickle back; this rebinds `loaded_model`.
with open('../server/artifacts/decision_tree_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Score the loaded model on the held-out split to confirm the round trip.
print("Оцінка завантаженої моделі на тестових даних:", loaded_model.score(X_test, y_test))

Оцінка завантаженої моделі на тестових даних: 0.8926747349717044
