In [8]:
import numpy as np
import pandas as pd
import seaborn as sns
import catboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from phik.report import plot_correlation_matrix
from lightgbm import LGBMClassifier, plot_importance

In [55]:
df = sns.load_dataset('penguins')
df.dropna(inplace=True)

In [56]:
num_classes = len(df['species'].unique())

In [57]:
X = df.drop('species', axis=1)
y = df['species']

In [58]:
X = pd.get_dummies(X)
X = X.replace(True, 1).replace(False, 0)

In [82]:
dct_ans = {0: 'Adelie', 1: 'Chinstrap', 2: 'Gentoo'}

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [60]:
boostings = ["gbdt", "dart", "goss"]
metr = []
for name in boostings:
    clf = LGBMClassifier(
        objective="multiclass",
        n_estimators=10,
        learning_rate=0.1,
        num_class=num_classes,
        reg_alpha=0.5,
        reg_lambda=0.3,
        seed=42,
        boosting_type=name,
    )
    clf.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
    )
    test_preds = clf.predict(X_test)
    metr.append([name, f1_score(y_test, test_preds, average='macro')])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 213
[LightGBM] [Info] Number of data points in the train set: 223, number of used features: 9
[LightGBM] [Info] Start training from score -0.874572
[LightGBM] [Info] Start training from score -1.600509
[LightGBM] [Info] Start training from score -0.964521
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000039 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 213
[LightGBM] [Info] Number of data points in the train set: 223, number of used features: 9
[LightGBM] [Info] Start training from score -0.874572
[LightGBM] [Info] Start training from score -1.600509
[LightGBM] [Info] Start training from score -0.964521
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000034 seconds.
Yo

In [61]:
lgbm_class = clf

In [62]:
train_dataset = catboost.Pool(X_train, y_train)
test_dataset = catboost.Pool(X_test, y_test)

In [63]:
params = {
    "n_estimators": 1500,
    "learning_rate": 0.03,
    "depth": 3,
    "use_best_model": True,
    "border_count": 64,
    "l2_leaf_reg": 1,
    "bagging_temperature": 2,
    "rsm": 0.5,
    "loss_function": "MultiClass", 
    "auto_class_weights" : 'Balanced',
    "random_state": 22,
    "custom_metric": ["Precision", "Recall", "F1"],
}

In [64]:
model_class = catboost.CatBoostClassifier(**params)

In [65]:
model_class.fit(train_dataset, eval_set=test_dataset)

0:	learn: 1.0576151	test: 1.0610454	best: 1.0610454 (0)	total: 439us	remaining: 659ms
1:	learn: 1.0193525	test: 1.0223154	best: 1.0223154 (1)	total: 980us	remaining: 734ms
2:	learn: 0.9771292	test: 0.9805931	best: 0.9805931 (2)	total: 1.6ms	remaining: 799ms
3:	learn: 0.9771291	test: 0.9805860	best: 0.9805860 (3)	total: 1.88ms	remaining: 704ms
4:	learn: 0.9428670	test: 0.9495184	best: 0.9495184 (4)	total: 2.27ms	remaining: 679ms
5:	learn: 0.9118928	test: 0.9213033	best: 0.9213033 (5)	total: 2.65ms	remaining: 660ms
6:	learn: 0.8803317	test: 0.8862516	best: 0.8862516 (6)	total: 3.11ms	remaining: 663ms
7:	learn: 0.8450981	test: 0.8534118	best: 0.8534118 (7)	total: 3.51ms	remaining: 655ms
8:	learn: 0.8138607	test: 0.8230632	best: 0.8230632 (8)	total: 3.9ms	remaining: 646ms
9:	learn: 0.7856815	test: 0.7944717	best: 0.7944717 (9)	total: 4.28ms	remaining: 638ms
10:	learn: 0.7598106	test: 0.7666781	best: 0.7666781 (10)	total: 4.95ms	remaining: 669ms
11:	learn: 0.7349824	test: 0.7404071	best: 0.

<catboost.core.CatBoostClassifier at 0x1f66fd92590>

In [66]:
print(f"Catboost: {f1_score(y_test, model_class.predict(X_test), average='macro')}")
print(f"LGBM: {f1_score(y_test, lgbm_class.predict(X_test), average='macro')}")

Catboost: 0.979217938200711
LGBM: 0.9247400416931931


In [77]:
y_blend = 0.8 * model_class.predict_proba(X_test) + 0.2 * lgbm_class.predict_proba(X_test)  



In [78]:
y_blend

array([[0.95712222, 0.01432111, 0.02855667],
       [0.02534145, 0.01430281, 0.96035574],
       [0.03153154, 0.86192288, 0.10654558],
       [0.02532856, 0.01431302, 0.96035842],
       [0.94218509, 0.03162674, 0.02618816],
       [0.02547583, 0.01442266, 0.96010151],
       [0.94727968, 0.02655474, 0.02616557],
       [0.02546926, 0.01442275, 0.96010799],
       [0.94370681, 0.01882099, 0.03747219],
       [0.95943588, 0.01863081, 0.02193331],
       [0.9120664 , 0.06049216, 0.02744144],
       [0.94098466, 0.02218011, 0.03683522],
       [0.04893197, 0.01323904, 0.93782898],
       [0.96323473, 0.01413759, 0.02262767],
       [0.02866525, 0.87023371, 0.10110104],
       [0.96332887, 0.01408423, 0.0225869 ],
       [0.02542232, 0.01431865, 0.96025903],
       [0.74463735, 0.22773505, 0.0276276 ],
       [0.03523422, 0.85561026, 0.10915553],
       [0.9401544 , 0.03296566, 0.02687993],
       [0.02552231, 0.01446077, 0.96001692],
       [0.03839551, 0.93792272, 0.02368178],
       [0.

In [79]:
y_blend = np.argmax(y_blend, axis=1)

In [84]:
ans_blend = [dct_ans[x] for x in y_blend]

In [85]:
print(f1_score(y_test, ans_blend, average='macro'))

0.979217938200711


<h1>Работа</h1>

In [5]:
n = int(input())
s = input().split()
scores = []
for i in range(n):
    scores.append(float(s[i]))

## Your code here ...
weights = [round(x / sum(scores), 7) for x in scores]

In [6]:
weights

[0.25, 0.4166667, 0.3333333]

In [9]:
df_w = pd.read_csv('make_hard_blend.csv')

In [17]:
df_w['target_class'] = df_w.drop('target_class', axis=1).mode(axis=1)[0]

In [19]:
df_w[['car_id', 'target_class']].to_csv('ans.csv', index=False)