In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, mean_absolute_error
import plotly.express as px

In [10]:
# Read the modelling dataset; the file uses ';' as its field delimiter.
data = pd.read_csv('dane_do_modelu.csv', sep=';')

# The G3 column is the prediction target; all remaining columns are features.
y = data['G3']
X = data.drop(columns='G3')

# Bare expression so the notebook renders the loaded frame as the cell output.
data

Unnamed: 0,G3,address,reason,Mjob,higher,Fjob,goout,paid,Fedu,studytime,internet,romantic,failures,age
0,6,0,2,3,1,0,4,0,4,1,0,0,0,18
1,6,0,2,3,1,4,3,0,1,1,1,0,0,17
2,10,0,3,3,1,4,2,1,1,1,1,0,3,15
3,15,0,0,1,1,2,2,1,2,0,1,1,0,15
4,10,0,0,4,1,4,2,1,3,1,0,0,0,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,9,0,2,2,1,2,4,1,2,1,0,0,2,19
391,16,0,2,2,1,2,5,0,1,1,1,0,0,17
392,7,1,2,4,1,4,3,0,1,1,0,0,3,19
393,10,1,2,2,1,4,1,0,2,1,1,0,0,18


In [11]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=308289)

In [12]:
# Random-forest classifier configuration, grouped by what each setting controls.
model = RandomForestClassifier(
    # Ensemble size and reproducibility / parallelism.
    n_estimators=50,
    random_state=308289,
    n_jobs=-1,
    # Row sampling per tree; the out-of-bag score is requested together with bootstrap.
    bootstrap=True,
    max_samples=150,
    oob_score=True,
    # Split search settings.
    criterion='entropy',
    max_features='log2',
    # Regularisation: minimum leaf weight fraction and cost-complexity pruning.
    min_weight_fraction_leaf=0.1,
    ccp_alpha=0.01,
)

In [13]:
# Train on the training split only.
model.fit(X_train, y_train)

# Predict on both splits (training first, then test) so the two can be compared.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

In [14]:
def accuracy_with_derogation(y, y_pred):
    """Return the share of predictions that miss the true value by at most 1.

    A prediction counts as a hit when ``abs(true - predicted) <= 1``.

    Args:
        y: Iterable of true numeric values.
        y_pred: Iterable of predicted numeric values, same length as ``y``.

    Returns:
        The fraction of hits as a float between 0.0 and 1.0.

    Raises:
        ValueError: If the inputs differ in length or are empty.
    """
    # Materialise both inputs so their lengths can be compared: zip() alone
    # silently stops at the shorter one, which would skew the ratio.
    actual = list(y)
    predicted = list(y_pred)

    if len(actual) != len(predicted):
        raise ValueError(
            f'y and y_pred must have the same length, '
            f'got {len(actual)} and {len(predicted)}'
        )
    if not predicted:
        # Avoid an opaque ZeroDivisionError on empty input.
        raise ValueError('y and y_pred must not be empty')

    # Count with plain ints so the result is a built-in float regardless of
    # the scalar types yielded by the inputs.
    hits = sum(1 for true_value, guess in zip(actual, predicted)
               if abs(true_value - guess) <= 1)
    return hits / len(predicted)


def get_measure(y, y_pred, set):
    """Print a short quality report for one data split.

    Prints, in order: a header naming the split, the exact-match accuracy,
    the mean absolute error, and the accuracy with a tolerance of 1.

    Args:
        y: True target values.
        y_pred: Predicted target values.
        set: Label of the split, shown in the header line.
            (The name shadows the built-in ``set`` inside this function.)
    """
    print(f'Zbiór {set}')

    exact_accuracy = accuracy_score(y, y_pred)
    print(f'Trafność: {exact_accuracy}')

    mae = mean_absolute_error(y, y_pred)
    print(f'MAE: {mae}')

    tolerant_accuracy = accuracy_with_derogation(y, y_pred)
    print(f'Accuracy with derogation: {tolerant_accuracy}')

In [15]:
# Report the held-out split first, then the training split, so the two
# sets of metrics can be compared side by side.
get_measure(y=y_test, y_pred=y_test_pred, set='Testowy')
print('\n')  # visual gap between the two reports
get_measure(y=y_train, y_pred=y_train_pred, set='Uczący')

Zbiór Testowy
Trafność: 0.17557251908396945
MAE: 3.6106870229007635
Accuracy with derogation: 0.366412213740458


Zbiór Uczący
Trafność: 0.22727272727272727
MAE: 3.5303030303030303
Accuracy with derogation: 0.35984848484848486


In [16]:
# Pair each feature column (everything except G3) with the importance the
# fitted model assigned to it.
features_importance = pd.DataFrame({
    'importance': model.feature_importances_,
    'variable': data.drop(columns='G3').columns,
})

# Bar chart of importance per variable.
fig = px.bar(features_importance, x='importance', y='variable')
# Order the bars by total value; as the last expression, this also renders the figure.
fig.update_layout(title='Model klasyfikacji', yaxis={'categoryorder':'total ascending'})