In [None]:
# Mount Google Drive into the Colab runtime so the data files under
# /content/drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier

In [None]:
# Load the tab-separated train/test tables from the mounted Drive folder.
# The files use '?' as the missing-value marker; declaring it via na_values
# makes pandas parse those cells as NaN right away instead of loading the
# affected columns as strings.
# NOTE: the space before the final '/' is part of the path literal; keep it.
train = pd.read_csv(
    '/content/drive/MyDrive/NTO_Большие_данные_и_машинное_обучение /train.tsv',
    sep='\t',
    na_values='?',
)
test = pd.read_csv(
    '/content/drive/MyDrive/NTO_Большие_данные_и_машинное_обучение /test.tsv',
    sep='\t',
    na_values='?',
)

In [None]:
# Inspect the loaded training table.
train

Unnamed: 0,id,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x992,x993,x994,x995,x996,x997,x998,x999,x1000,y
0,1,80,?,-10,-20,-10,150,?,-510,?,...,20,60,-10,50,?,-50,?,-3860,-270,P
1,2,70,20,-20,?,-10,?,360,-440,-400,...,20,?,-90,50,?,-60,0,-3250,-750,P
2,3,80,?,20,-10,10,?,190,-500,880,...,10,?,?,60,-1310,-40,0,-3410,-630,N
3,4,60,?,?,-40,?,?,580,?,-200,...,20,60,40,60,?,-50,?,-3020,?,P
4,5,70,?,?,-10,0,230,?,30,390,...,20,?,-60,?,-1410,-50,-40,-3500,-420,P
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,19996,70,80,?,0,0,180,180,-600,590,...,20,60,?,40,-1320,-40,-20,-3510,?,N
19996,19997,70,90,?,0,0,140,540,-380,?,...,10,?,-40,50,?,?,-20,-3420,-510,N
19997,19998,60,?,?,40,10,180,320,?,?,...,10,?,?,?,-1050,?,?,-3250,-520,N
19998,19999,?,?,-30,0,?,200,300,?,?,...,10,70,?,40,?,-60,-40,-3500,-850,N


In [None]:
# Inspect the loaded test table.
test

Unnamed: 0,id,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x991,x992,x993,x994,x995,x996,x997,x998,x999,x1000
0,20001,70,90,-100,?,-10,?,260,-370,-280,...,?,?,10,140,50,?,-60,?,-3200,-950
1,20002,60,40,-40,-20,-10,110,330,?,?,...,?,20,40,?,?,-1220,-50,?,-3560,-920
2,20003,80,60,?,-60,0,?,360,-540,?,...,?,20,?,100,?,-1170,-50,-10,?,?
3,20004,80,90,40,10,?,190,490,-380,?,...,?,20,20,-40,40,?,?,-40,?,?
4,20005,80,70,40,?,?,70,470,-340,630,...,50,10,30,20,40,-1700,-60,-20,?,-420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12340,32341,60,?,?,20,?,240,430,?,340,...,50,10,70,50,50,-1390,?,-10,-2890,-490
12341,32342,?,50,-30,0,20,80,360,?,-180,...,?,?,?,?,50,?,-50,?,-3010,?
12342,32343,70,?,30,?,10,170,190,-570,310,...,50,10,20,-150,?,?,-50,?,-3120,-710
12343,32344,70,20,-60,-40,-30,210,560,?,-540,...,?,?,?,-10,60,-1630,?,?,-3700,-570


In [None]:
# Replace the '?' placeholder with NaN in both tables.
train, test = (frame.replace('?', np.nan) for frame in (train, test))

In [None]:
# Split the training table into the feature matrix and the target vector.
# The target is encoded numerically: 'P' -> 1, 'N' -> 0.
label_to_int = {'P': 1, 'N': 0}
y_train = train['y'].map(label_to_int)
X_train = train.drop(['id', 'y'], axis=1)

# The test table has no target column; only the identifier is removed.
X_test = test.drop(['id'], axis=1)

In [None]:
# Convert every feature column to a numeric dtype; values that cannot be
# parsed become NaN (errors='coerce').
X_train = X_train.apply(lambda column: pd.to_numeric(column, errors='coerce'))
X_test = X_test.apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [None]:
# Fill missing values with SimpleImputer ('mean' or 'median' can be chosen).
# The medians are learned from the training features only and then applied
# to both the training and the test features.
imputer = SimpleImputer(strategy='median')
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [None]:
# Inspect the training feature matrix after imputation.
X_train

array([[   80.,    60.,   -10., ...,   -20., -3860.,  -270.],
       [   70.,    20.,   -20., ...,     0., -3250.,  -750.],
       [   80.,    60.,    20., ...,     0., -3410.,  -630.],
       ...,
       [   60.,    60.,    10., ...,   -20., -3250.,  -520.],
       [   70.,    60.,   -30., ...,   -40., -3500.,  -850.],
       [   80.,    70.,    10., ...,   -20., -3490.,  -740.]])

In [None]:
# Inspect the test feature matrix after imputation.
X_test

array([[   70.,    90.,  -100., ...,   -20., -3200.,  -950.],
       [   60.,    40.,   -40., ...,   -20., -3560.,  -920.],
       [   80.,    60.,    10., ...,   -10., -3250.,  -670.],
       ...,
       [   70.,    60.,    30., ...,   -20., -3120.,  -710.],
       [   70.,    20.,   -60., ...,   -20., -3700.,  -570.],
       [   70.,    40.,   -30., ...,   -20., -3250.,  -850.]])

In [None]:
# Standardize the features: the scaler is fitted on the training matrix and
# the same fitted transformation is then applied to the test matrix.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Inspect the training feature matrix after standardization.
X_train

array([[ 1.46549061, -0.01639453, -0.49788267, ..., -0.08870532,
        -1.50484236,  1.57778389],
       [-0.2017342 , -1.21966303, -0.7892807 , ...,  1.53000486,
        -0.02597473, -0.38124041],
       [ 1.46549061, -0.01639453,  0.37631141, ...,  1.53000486,
        -0.41387444,  0.10851566],
       ...,
       [-1.86895902, -0.01639453,  0.08491339, ..., -0.08870532,
        -0.02597473,  0.55745873],
       [-0.2017342 , -0.01639453, -1.08067872, ..., -1.70741549,
        -0.63206802, -0.78937048],
       [ 1.46549061,  0.28442259,  0.08491339, ..., -0.08870532,
        -0.60782429, -0.34042741]])

In [None]:
# Inspect the test feature matrix after standardization.
X_test

array([[-0.2017342 ,  0.88605684, -3.12046492, ..., -0.08870532,
         0.09524392, -1.19750054],
       [-1.86895902, -0.61802878, -1.37207675, ..., -0.08870532,
        -0.77753041, -1.07506152],
       [ 1.46549061, -0.01639453,  0.08491339, ...,  0.72064977,
        -0.02597473, -0.05473636],
       ...,
       [-0.2017342 , -0.01639453,  0.66770944, ..., -0.08870532,
         0.28919378, -0.21798839],
       [-0.2017342 , -1.21966303, -1.95487281, ..., -0.08870532,
        -1.11694265,  0.3533937 ],
       [-0.2017342 , -0.61802878, -1.08067872, ..., -0.08870532,
        -0.02597473, -0.78937048]])

In [None]:
# Model initialization: all hyperparameters are collected in one dict so
# they can be reviewed in a single place.
catboost_params = {
    'iterations': 1000,
    'learning_rate': 0.03,
    'depth': 6,
    'loss_function': 'Logloss',
    'eval_metric': 'F1',
    'verbose': 100,  # print a progress line every 100 iterations
}
model = CatBoostClassifier(**catboost_params)

In [None]:
# Train the model on the full training set. No eval_set is passed, so the
# metric printed in the progress log is computed on the training data only.
model.fit(X_train, y_train)

0:	learn: 0.7265759	total: 527ms	remaining: 8m 46s
100:	learn: 0.8515729	total: 22.3s	remaining: 3m 18s
200:	learn: 0.8751093	total: 51.3s	remaining: 3m 23s
300:	learn: 0.8866051	total: 1m 9s	remaining: 2m 40s
400:	learn: 0.8940052	total: 1m 25s	remaining: 2m 8s
500:	learn: 0.8999133	total: 1m 43s	remaining: 1m 43s
600:	learn: 0.9058109	total: 2m 1s	remaining: 1m 20s
700:	learn: 0.9123354	total: 2m 18s	remaining: 58.9s
800:	learn: 0.9168831	total: 2m 34s	remaining: 38.3s
900:	learn: 0.9207090	total: 2m 50s	remaining: 18.7s
999:	learn: 0.9246344	total: 3m 6s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7b3749d24310>

In [None]:
# Predict class labels for the test feature matrix.
y_pred = model.predict(X_test)

# Convert the predictions back to the P/N classes:
# a prediction equal to 1 becomes 'P', anything else becomes 'N'.
label_for = {True: 'P', False: 'N'}
y_pred_class = [label_for[bool(value == 1)] for value in y_pred]

In [None]:
# Build the results table: test identifiers next to the predicted classes.
submission_columns = {'id': test['id'], 'y': y_pred_class}
submission = pd.DataFrame(submission_columns)

In [None]:
# Inspect the submission table before writing it to disk.
submission

Unnamed: 0,id,y
0,20001,P
1,20002,N
2,20003,N
3,20004,N
4,20005,N
...,...,...
12340,32341,P
12341,32342,N
12342,32343,N
12343,32344,N


In [None]:
# Save the submission as a tab-separated file, without the index column.
submission.to_csv(
    'submission.tsv',
    sep='\t',
    index=False,
)