In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
from sklearn.metrics import f1_score

In [11]:
import pandas as pd
import numpy as np
from feature_engine.imputation import MeanMedianImputer
from feature_engine.outliers import Winsorizer
from feature_engine.encoding import OrdinalEncoder

from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score

In [8]:
!pip install --ignore-installed feature_engine

Collecting feature_engine
  Using cached feature_engine-1.8.2-py2.py3-none-any.whl.metadata (9.9 kB)
Collecting numpy>=1.18.2 (from feature_engine)
  Downloading numpy-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas>=2.2.0 (from feature_engine)
  Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn>=1.4.0 (from feature_engine)
  Using cached scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting scipy>=1.4.1 (from feature_engine)
  Downloading scipy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [44]:
# Read the tab-separated training table from the Kaggle input directory.
train = pd.read_csv(
    filepath_or_buffer='/kaggle/input/train-nt1/train.tsv',
    sep='\t',
)

In [45]:
# Remove the row identifier so it is not part of the feature columns
# (later cells treat every column except the last one as a feature).
# Rebinding with errors='ignore' instead of inplace=True keeps this cell
# idempotent: re-running it once 'id' is gone no longer raises KeyError.
train = train.drop(columns='id', errors='ignore')

In [46]:
# '?' is used as the missing-value marker; turn it into NaN so that every
# column except the last one (presumably the target 'y' — verify) can be
# cast to float.
train = train.replace(to_replace='?', value=np.nan)
train[train.columns[:-1]] = train[train.columns[:-1]].astype(float)

In [47]:
# Hold out 20% of the rows for evaluation; random_state=0 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns='y'),
    train['y'],
    test_size=0.2,
    random_state=0,
)

In [48]:
preprocessing_pipeline = Pipeline(
    steps=[
        # Step 1: fill missing values in the numeric feature columns with the median.
        (
            'imputer',
            MeanMedianImputer(
                variables=train.columns[:-1].tolist(),
                imputation_method='median',
            ),
        ),
        # Step 2: cap outliers in the numeric feature columns on both tails
        # using Winsorizer's gaussian capping method with fold=3.
        (
            'winsorizer',
            Winsorizer(
                variables=train.columns[:-1].tolist(),
                capping_method='gaussian',
                tail='both',
                fold=3,
            ),
        ),
    ]
)

In [49]:
# Print a summary of the training features (index range, column count, dtypes, memory usage).
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16000 entries, 17815 to 2732
Columns: 1000 entries, x1 to x1000
dtypes: float64(1000)
memory usage: 122.2 MB


In [50]:
# Fit the preprocessing steps on the training split only, then apply the
# already-fitted pipeline to the hold-out split.
X_train_processed = preprocessing_pipeline.fit_transform(X=X_train)
X_test_processed = preprocessing_pipeline.transform(X=X_test)

In [53]:
# CatBoost classifier configured to train on GPU and to report F1 as its
# evaluation metric.
model = CatBoostClassifier(
    task_type='GPU',
    eval_metric='F1',
    iterations=1000,
    learning_rate=0.1,
    depth=8,
    l2_leaf_reg=3,
    random_seed=0,
    verbose=100,  # the training log is printed every 100 iterations
)

In [54]:
# Train on the processed training split; the processed hold-out split is
# supplied as eval_set, so the "test"/"best" values in the log refer to it.
model.fit(
    X=X_train_processed,
    y=y_train,
    eval_set=(X_test_processed, y_test),
)

0:	learn: 0.7561027	test: 0.7448757	best: 0.7448757 (0)	total: 660ms	remaining: 10m 59s
100:	learn: 0.8999892	test: 0.8710247	best: 0.8716365 (72)	total: 20.1s	remaining: 2m 58s
200:	learn: 0.9278639	test: 0.8719081	best: 0.8742832 (156)	total: 39.4s	remaining: 2m 36s
300:	learn: 0.9433679	test: 0.8768212	best: 0.8773169 (299)	total: 58.6s	remaining: 2m 16s
400:	learn: 0.9547255	test: 0.8789080	best: 0.8798944 (341)	total: 1m 17s	remaining: 1m 56s
500:	learn: 0.9629471	test: 0.8788683	best: 0.8804588 (459)	total: 1m 37s	remaining: 1m 36s
600:	learn: 0.9702590	test: 0.8800354	best: 0.8804588 (459)	total: 1m 56s	remaining: 1m 17s
700:	learn: 0.9765218	test: 0.8819107	best: 0.8820150 (688)	total: 2m 15s	remaining: 57.8s
800:	learn: 0.9800213	test: 0.8820150	best: 0.8823009 (701)	total: 2m 34s	remaining: 38.4s
900:	learn: 0.9845109	test: 0.8837622	best: 0.8848539 (884)	total: 2m 53s	remaining: 19.1s
999:	learn: 0.9863246	test: 0.8849558	best: 0.8867257 (931)	total: 3m 12s	remaining: 0us
be

<catboost.core.CatBoostClassifier at 0x7cb483224ac0>

In [10]:
# 3-fold cross-validation of the model on the processed training split.
# NOTE(review): scoring is accuracy while the model's eval_metric is F1 — confirm this is intended.
# NOTE(review): `scores` is not displayed or used anywhere else in the visible notebook.
scores = cross_val_score(
    estimator=model,
    X=X_train_processed,
    y=y_train,
    scoring='accuracy',
    cv=3,
)

In [55]:
# Read the tab-separated table to predict on from the Kaggle input directory.
test = pd.read_csv(
    filepath_or_buffer='/kaggle/input/train-nt1/test.tsv',
    sep='\t',
)

In [56]:
# Preview the first five raw rows ('?' marks missing values at this point).
test.head(n=5)

Unnamed: 0,id,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x991,x992,x993,x994,x995,x996,x997,x998,x999,x1000
0,20001,70,90,-100,?,-10,?,260,-370,-280,...,?,?,10,140,50,?,-60,?,-3200,-950
1,20002,60,40,-40,-20,-10,110,330,?,?,...,?,20,40,?,?,-1220,-50,?,-3560,-920
2,20003,80,60,?,-60,0,?,360,-540,?,...,?,20,?,100,?,-1170,-50,-10,?,?
3,20004,80,90,40,10,?,190,490,-380,?,...,?,20,20,-40,40,?,?,-40,?,?
4,20005,80,70,40,?,?,70,470,-340,630,...,50,10,30,20,40,-1700,-60,-20,?,-420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12340,32341,60,?,?,20,?,240,430,?,340,...,50,10,70,50,50,-1390,?,-10,-2890,-490
12341,32342,?,50,-30,0,20,80,360,?,-180,...,?,?,?,?,50,?,-50,?,-3010,?
12342,32343,70,?,30,?,10,170,190,-570,310,...,50,10,20,-150,?,?,-50,?,-3120,-710
12343,32344,70,20,-60,-40,-30,210,560,?,-540,...,?,?,?,-10,60,-1630,?,?,-3700,-570


In [57]:
# Same cleaning as for the training table: '?' becomes NaN, then the feature
# columns (named after the training frame's columns, last one excluded) are
# cast to float.
test = test.replace(to_replace='?', value=np.nan)
test[train.columns[:-1]] = test[train.columns[:-1]].astype(float)

In [59]:
# Remove the row identifier before the frame is passed to the preprocessing
# pipeline and the model.
# Rebinding with errors='ignore' instead of inplace=True keeps this cell
# idempotent: re-running it once 'id' is gone no longer raises KeyError.
test = test.drop(columns='id', errors='ignore')

In [60]:
# Apply the already-fitted preprocessing pipeline; `test` now refers to the
# transformed frame.
test = preprocessing_pipeline.transform(X=test)

In [61]:
# Predict a class label for every row of the processed test frame.
y_pred = model.predict(data=test)

In [63]:
# Start from an empty frame; the prediction column is added in a later cell.
sample = pd.DataFrame(data=None)

In [64]:
# Store the predicted labels in the submission frame's 'y' column.
sample = sample.assign(y=y_pred)

In [65]:
# Preview the first five predicted labels.
sample.head(n=5)

Unnamed: 0,y
0,P
1,N
2,N
3,N
4,N


In [66]:
# Write the predictions as a tab-separated file without the index column.
sample.to_csv(
    path_or_buf='output.tsv',
    index=False,
    sep='\t',
)