In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

**Задание 1.** Найти данные для задачи классификации или для задачи регрессии.

In [2]:
# Load the World Happiness Report table, discard the two text identifier
# columns and every row that still has a missing value.
df = (
    pd.read_csv("data/WHR_2023.csv")
    .drop(columns=["country", "region"])
    .dropna(axis=0)
)
# Last expression of the cell: summary statistics of the remaining columns.
df.describe()

Unnamed: 0,happiness_score,gdp_per_capita,social_support,healthy_life_expectancy,freedom_to_make_life_choices,generosity,perceptions_of_corruption
count,136.0,136.0,136.0,136.0,136.0,136.0,136.0
mean,5.544441,1.408919,1.155088,0.366176,0.540912,0.149088,0.146478
std,1.142841,0.433969,0.327263,0.156691,0.149671,0.075993,0.127009
min,1.859,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.7025,1.09775,0.95975,0.2485,0.45875,0.0985,0.05975
50%,5.6935,1.4515,1.2255,0.3895,0.5575,0.1375,0.112
75%,6.3425,1.798,1.40125,0.4875,0.65675,0.19925,0.18825
max,7.804,2.2,1.62,0.702,0.772,0.422,0.561


In [3]:
# Same division into classes as in PR5.
# pd.cut uses right-closed intervals by default, so the classes are:
#   0: score <= 5,   1: 5 < score <= 6,   2: score > 6
bins = [-np.inf, 5, 6, np.inf]
labels = [0, 1, 2]
# The column is overwritten in place, so bin it only while it still holds raw
# scores. Without this guard, re-running the cell would feed the class labels
# back into pd.cut instead of the original scores.
if not isinstance(df['happiness_score'].dtype, pd.CategoricalDtype):
    df['happiness_score'] = pd.cut(df['happiness_score'], bins=bins, labels=labels)

In [4]:
# Separate the class label from the predictors, then split 80% / 20%
# with a fixed seed so the partition is reproducible.
targetName = 'happiness_score'
y = df[targetName]
x = df.drop(targetName, axis=1)
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=0,
)
print(f"Training: {xTrain.shape}")
print(f"Testing:  {xTest.shape}")

Training: (108, 6)
Testing:  (28, 6)


**Задания 2-3.** Реализовать баггинг и бустинг.

In [5]:
# Show every column: the comparison table below is one column per sample.
pd.set_option('display.max_columns', None)
def compTable(y_pred, y_fact, size: int = 25):
    """Display predicted and factual labels side by side for the first samples.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels; flattened before use, so an (n, 1) array is accepted.
    y_fact : array-like
        True labels in the same order as ``y_pred``.
    size : int, default 25
        Maximum number of leading samples to show.

    The table has two rows ('Predicted', 'Factual') and one column per sample.
    It is rendered with the notebook's ``display`` and nothing is returned.
    """
    # Named `comparison` rather than `df` so it does not shadow the
    # notebook-level dataset frame.
    comparison = pd.DataFrame({
        'Predicted': np.asarray(y_pred).ravel()[:size],
        'Factual': np.asarray(y_fact).ravel()[:size],
    })
    display(comparison.transpose())
    
def predResult(model, yTrain, yTest, xTrainData=None, xTestData=None):
    """Print macro-F1 and a predicted-vs-factual table for both data splits.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    yTrain, yTest : true labels of the training and testing splits.
    xTrainData, xTestData : optional feature matrices for the two splits.
        When omitted, the notebook-level ``xTrain`` / ``xTest`` are used,
        which keeps existing three-argument calls working.

    Nothing is returned; results are printed and displayed via ``compTable``.
    """
    # Fall back to the notebook globals only when the caller gave no features.
    if xTrainData is None:
        xTrainData = xTrain
    if xTestData is None:
        xTestData = xTest

    splits = (
        ("Training data", xTrainData, yTrain),
        ("Testing data", xTestData, yTest),
    )
    for title, features, yTrue in splits:
        yPred = model.predict(features)
        print(title)
        print('F1 score:', f1_score(y_pred=yPred, y_true=yTrue, average='macro'))
        compTable(yPred, yTrue)

def doBagging(xTrain, yTrain) -> RandomForestClassifier:
    """Grid-search a random forest and return the best estimator found.

    Candidates vary ``max_depth``, ``min_samples_leaf`` and
    ``min_samples_split``; each is scored by macro-F1 with 4-fold
    cross-validation on the supplied training data.
    """
    search = GridSearchCV(
        RandomForestClassifier(random_state=0),
        {
            'max_depth':         [12, 18],
            'min_samples_leaf':  [3, 10],
            'min_samples_split': [6, 12],
        },
        scoring='f1_macro',
        cv=4,
    )
    return search.fit(xTrain, yTrain).best_estimator_

def doBoosting(xTrain, yTrain, iterations=5000, task_type='GPU', devices='0') -> CatBoostClassifier:
    """Fit a CatBoost classifier on the training data and return it.

    Parameters
    ----------
    xTrain, yTrain : training features and labels.
    iterations : int, default 5000
        Number of boosting iterations.
    task_type : {'GPU', 'CPU'}, default 'GPU'
        Processing unit used for training. Pass 'CPU' on a machine
        without a usable GPU.
    devices : str, default '0'
        GPU device id(s); forwarded only when ``task_type`` is 'GPU'.

    The defaults reproduce the original hard-coded configuration.
    Progress is logged every 1000 iterations and the seed is fixed to 0.
    """
    params = {
        'iterations': iterations,
        'task_type': task_type,
        'metric_period': 1000,
        'random_state': 0,
    }
    # `devices` selects GPU hardware, so it is left out for CPU training.
    if task_type == 'GPU':
        params['devices'] = devices
    model_catboost = CatBoostClassifier(**params)
    model_catboost.fit(xTrain, yTrain)
    return model_catboost

In [6]:
# Bagging: tune the random forest on the training split.
best_model = doBagging(xTrain=xTrain, yTrain=yTrain)

In [7]:
# Show the selected forest and its scores on both splits.
print('--- Bagging results ---')
print('Best model: ', best_model)
predResult(model=best_model, yTrain=yTrain, yTest=yTest)

--- Bagging results ---
Best model:  RandomForestClassifier(max_depth=12, min_samples_leaf=3, min_samples_split=12,
                       random_state=0)
Training data
F1 score: 0.8803934571175951


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
Predicted,0,2,2,2,0,0,2,0,0,0,2,1,0,2,0,0,1,1,2,1,2,0,1,2,0
Factual,0,2,2,2,0,0,2,0,0,0,2,1,0,2,0,0,1,1,2,1,2,0,1,2,0


Testing data
F1 score: 0.7778228532792427


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
Predicted,2,2,2,0,2,1,2,2,2,2,1,1,1,2,0,0,2,0,0,1,2,2,1,1,0
Factual,2,2,2,0,2,1,2,2,2,2,1,1,1,2,2,0,1,0,0,1,1,2,1,2,1


In [8]:
# Boosting: fit the CatBoost classifier on the training split.
model_catboost = doBoosting(xTrain=xTrain, yTrain=yTrain)

Learning rate set to 0.010786
0:	learn: 1.0913237	total: 19.7ms	remaining: 1m 38s
1000:	learn: 0.1166576	total: 7.72s	remaining: 30.8s
2000:	learn: 0.0560407	total: 15.8s	remaining: 23.6s
3000:	learn: 0.0352529	total: 23.9s	remaining: 15.9s
4000:	learn: 0.0251624	total: 31.8s	remaining: 7.93s
4999:	learn: 0.0194388	total: 40.2s	remaining: 0us


In [9]:
# Show the boosted model's scores on both splits.
print('--- Boosting results ---')
predResult(model=model_catboost, yTrain=yTrain, yTest=yTest)

--- Boosting results ---
Training data
F1 score: 1.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
Predicted,0,2,2,2,0,0,2,0,0,0,2,1,0,2,0,0,1,1,2,1,2,0,1,2,0
Factual,0,2,2,2,0,0,2,0,0,0,2,1,0,2,0,0,1,1,2,1,2,0,1,2,0


Testing data
F1 score: 0.6984126984126985


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
Predicted,2,2,2,1,2,1,2,2,2,2,1,1,1,2,1,1,2,0,0,1,1,2,1,1,0
Factual,2,2,2,0,2,1,2,2,2,2,1,1,1,2,2,0,1,0,0,1,1,2,1,2,1
