In [1]:
import pandas as pd
import numpy as np

from scipy.stats import mode
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier

In [2]:
# Read the training and test tables from the shared `open` directory
# (paths are relative to the notebook's working directory).
train, test = map(pd.read_csv, ('../open/train.csv', '../open/test.csv'))

In [3]:
# Show the first five rows as a quick sanity check of the loaded training table.
train.head(n=5)

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,TRAIN_003,2,0.537325,2022-06-13 5:39,T050307,A_31,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,TRAIN_004,1,0.53159,2022-06-13 5:47,T050304,A_31,,,,,...,38.7,41.89,46.93,33.09,76.97,,,,,


In [4]:
# Replace every missing value with 0, applying the same rule to both frames.
train, test = train.fillna(value=0), test.fillna(value=0)

In [5]:
# Categorical columns that are one-hot encoded further down.
cat_feats = ['LINE','PRODUCT_CODE']
# Every remaining column of `train`, kept in the frame's own column order.
# (A set difference would contain the same names but in an order that can
# change between kernel restarts, which makes the list non-reproducible.)
other_feats = [col for col in train.columns if col not in cat_feats]

In [6]:
# One-hot encode each categorical column.
#
# The set of dummy columns is defined by `train` and then imposed on `test`:
#   * a category seen only in train yields an all-zero column in test;
#   * a category seen only in test is dropped (no model column exists for it).
# Encoding the two frames independently would let their column sets diverge,
# which breaks the per-column transforms and the final predict step.
dummies_col = []
for c in cat_feats:
    train_dummies = pd.get_dummies(train[c])
    test_dummies = (
        pd.get_dummies(test[c])
        # Align to train's dummy columns; missing categories become 0.
        .reindex(columns=train_dummies.columns, fill_value=0)
        # Keep the same dtype as the train dummies (bool or uint8, depending
        # on the pandas version) so later encoders see consistent values.
        .astype(train_dummies.dtypes.to_dict())
    )

    train[train_dummies.columns] = train_dummies
    train = train.drop(c, axis=1)
    test[test_dummies.columns] = test_dummies
    test = test.drop(c, axis=1)

    # Record the dummy names from the train side: these are the columns the
    # model is actually fitted on.
    dummies_col.extend(train_dummies.columns)
other_feats.extend(dummies_col)

In [7]:
# Target: the class label column.
train_y = train['Y_Class']
# Features: everything except the identifier, both target columns and the timestamp.
train_X = train.drop(columns=['PRODUCT_ID','Y_Class', 'Y_Quality','TIMESTAMP'])

# The test frame has no target columns; only the identifier and timestamp are removed.
test = test.drop(columns=['PRODUCT_ID','TIMESTAMP'])

In [8]:
def get_values(value):
    """Return the underlying values of a pandas object as a single-column 2-D array.

    Parameters
    ----------
    value : object exposing ``.values`` (e.g. a ``pandas.Series``)
        The data to reshape.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, 1)`` where ``n`` is the total number of elements,
        the layout scikit-learn transformers expect for a single feature.
    """
    raw = value.values
    return raw.reshape((raw.size, 1))

# Standardise every non-dummy feature. Each scaler is fitted on train only and
# then applied to test, so no test statistics leak into the transform.
for col in train_X.columns:
    if col not in dummies_col:
        scaler = StandardScaler()
        train_X[col] = scaler.fit_transform(get_values(train_X[col]))
        if col in test.columns:
            test[col] = scaler.transform(get_values(test[col]))
        else:
            # Feature missing from test (e.g. a one-hot column for a category
            # that only occurs in train): treat its raw value as 0, matching
            # the notebook's fill-missing-with-0 convention, and scale it the
            # same way as train so both frames stay comparable.
            test[col] = scaler.transform(np.zeros((1, 1)))[0, 0]

# Encode the one-hot columns as integer codes (refitted per column).
le = LabelEncoder()
for col in dummies_col:
    if col not in train_X.columns:
        # Category present only in test: there is nothing to fit on and no
        # model column for it; it is removed by the alignment below.
        continue
    train_X[col] = le.fit_transform(train_X[col])
    if col in test.columns:
        test[col] = le.transform(test[col])

# Give test exactly the feature columns (and order) the models are trained on.
test = test.reindex(columns=train_X.columns, fill_value=0)

In [9]:
from sklearn.model_selection import StratifiedKFold

# Five-fold stratified splitter (default settings otherwise); the final
# expression echoes the number of folds as the cell output.
skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(X=train_X, y=train_y)

5

In [10]:
def evaluate_macroF1_lgb(truth, predictions):
    """Compute the macro-averaged F1 score from a flat array of class scores.

    ``predictions`` is reshaped to ``(n_classes, -1)``, where ``n_classes`` is
    the number of distinct values in ``truth``, and the predicted label of each
    sample is the row index holding the largest score in its column.
    NOTE(review): this assumes a class-major flat layout and labels numbered
    ``0..n_classes-1`` -- confirm against the caller.

    Parameters
    ----------
    truth : array-like
        True class labels.
    predictions : numpy.ndarray
        Flat array of per-class scores.

    Returns
    -------
    tuple
        ``('macroF1', score, True)`` -- metric name, metric value and a flag
        (presumably "higher is better" for a custom-metric hook; verify).
    """
    n_classes = np.unique(truth).size
    class_major = predictions.reshape(n_classes, -1)
    pred_labels = np.argmax(class_major, axis=0)
    macro_f1 = f1_score(truth, pred_labels, average='macro')
    return 'macroF1', macro_f1, True

In [11]:
# Train one CatBoost model per stratified fold and score it on the held-out
# part with macro F1. The fitted models are kept for ensembling on test.
models = []
scores = []
for train_index, test_index in skf.split(train_X, train_y):
    clf = CatBoostClassifier(objective='MultiClass',task_type='GPU')
    clf.fit(train_X.iloc[train_index], train_y.iloc[train_index], verbose=100)
    pred = clf.predict(train_X.iloc[test_index])
    # `skf.split` yields positional indices, so the labels must be selected
    # with `.iloc` as well; plain `train_y[test_index]` is label-based and is
    # only correct while the index happens to be a default RangeIndex.
    score = f1_score(train_y.iloc[test_index], pred, average='macro')
    models.append(clf)
    scores.append(score)
print(scores)
print(np.array(scores).mean())

Learning rate set to 0.055832
0:	learn: 1.0528110	total: 44.1ms	remaining: 44s
100:	learn: 0.4032543	total: 3.8s	remaining: 33.8s
200:	learn: 0.2778489	total: 7.43s	remaining: 29.5s
300:	learn: 0.2064500	total: 11.1s	remaining: 25.7s
400:	learn: 0.1636086	total: 14.6s	remaining: 21.8s
500:	learn: 0.1314098	total: 18.1s	remaining: 18s
600:	learn: 0.1093075	total: 21.7s	remaining: 14.4s
700:	learn: 0.0907222	total: 25.3s	remaining: 10.8s
800:	learn: 0.0781684	total: 30.2s	remaining: 7.5s
900:	learn: 0.0668750	total: 35s	remaining: 3.84s
999:	learn: 0.0584461	total: 39.2s	remaining: 0us
Learning rate set to 0.055832
0:	learn: 1.0511214	total: 276ms	remaining: 4m 36s
100:	learn: 0.3879940	total: 4.84s	remaining: 43.1s
200:	learn: 0.2644260	total: 9.3s	remaining: 37s
300:	learn: 0.1948812	total: 13s	remaining: 30.3s
400:	learn: 0.1505706	total: 16.8s	remaining: 25.1s
500:	learn: 0.1212036	total: 20.6s	remaining: 20.5s
600:	learn: 0.0999866	total: 24.5s	remaining: 16.3s
700:	learn: 0.0841625

In [12]:
# Majority vote across the fold models.
#
# Each model's prediction is flattened to one label per test row and placed in
# its own column, giving an array of shape (n_samples, n_models). Taking the
# mode along axis=1 then votes across models for the same sample.
#
# Appending all predictions into one flat array and reshaping to (-1, 5) would
# instead put five consecutive samples of a single model into each row, so the
# vote would mix different samples and the result would not line up with the
# test rows. Stacking also removes the hard-coded model count and keeps the
# label dtype returned by `predict` (no upcast to float).
fold_preds = [np.ravel(m.predict(test)) for m in models]  # ravel: accepts (n,) or (n, 1)
preds = np.ravel(mode(np.column_stack(fold_preds), axis=1).mode)

In [13]:
# Fill the sample submission's `Y_Class` column with the ensembled predictions
# and write the result one directory above the notebook, without the index.
submit = pd.read_csv('../open/sample_submission.csv').assign(Y_Class=preds)
submit.to_csv('../catboost.csv', index=False)