In [73]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import plotly.express as px
from sklearn.metrics import mean_absolute_error, accuracy_score

In [74]:
data = pd.read_csv('dane_do_modelu.csv', sep=';')

X = data.drop(
    'G3',
    axis=1
)
y = data.G3

In [75]:
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.33, 
    random_state=308289
)

In [76]:
model = RandomForestRegressor(
    n_estimators=550,
    ccp_alpha=0.01,
    max_features='sqrt',
    min_weight_fraction_leaf=0.01,
    n_jobs=-1,
    bootstrap=True,
    oob_score=True,
    verbose=-1,
    max_samples=200,
    random_state=308289
)

In [77]:
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

y_train_pred = np.rint(y_train_pred).astype('int')
y_test_pred = np.rint(y_test_pred).astype('int')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 550 out of 550 | elapsed:    0.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 550 out of 550 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 550 out of 550 | elapsed:    0.0s finished


In [78]:
def accuracy_with_derogation(y, y_pred):
    return sum(1 if abs(i - j) <= 1 else 0 
               for i, j in zip(y, y_pred)) / len(y_pred)


def get_measure(y, y_pred, set):
    print(f'Zbiór {set}')
    print(f'Trafność: {accuracy_score(y, y_pred)}')
    print(f'MAE: {mean_absolute_error(y, y_pred)}')
    print(f'Accuracy with derogation: {accuracy_with_derogation(y, y_pred)}')

In [79]:
get_measure(y_test, y_test_pred, 'Testowy')
print('\n')
get_measure(y_train, y_train_pred, 'Uczący')

Zbiór Testowy
Trafność: 0.183206106870229
MAE: 2.9923664122137406
Accuracy with derogation: 0.3969465648854962


Zbiór Uczący
Trafność: 0.15151515151515152
MAE: 2.340909090909091
Accuracy with derogation: 0.4393939393939394


In [80]:
features_importance = pd.DataFrame(
    zip(model.feature_importances_, data.drop('G3', axis=1).columns), 
    columns=['importance', 'variable']
)

fig = px.bar(features_importance, x='importance', y='variable')
fig.update_layout(yaxis={'categoryorder':'total ascending'}, title='Model regresji')