In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import lightgbm as lgb

In [2]:
# Load the training and test tables from the local `open/` directory
# (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in ('open/train.csv', 'open/test.csv'))

In [3]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,TRAIN_003,2,0.537325,2022-06-13 5:39,T050307,A_31,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,TRAIN_004,1,0.53159,2022-06-13 5:47,T050304,A_31,,,,,...,38.7,41.89,46.93,33.09,76.97,,,,,


In [4]:
# Replace every missing value with 0 in both frames.
train, test = (frame.fillna(0) for frame in (train, test))

In [5]:
# Categorical columns that are one-hot encoded further down.
cat_feats = ['LINE','PRODUCT_CODE']
# Every remaining column, kept in the frame's own column order. Building this
# through a set difference would give an arbitrary order that can change from
# one kernel start to the next (string hashing is randomised per process).
other_feats = [name for name in train.columns if name not in cat_feats]

In [6]:
# One-hot encode each categorical column, using the categories observed in the
# TRAINING frame as the single reference for the resulting dummy columns.
dummies_col = []
for c in cat_feats:
    # Explicit integer dtype so train and test dummies always share one dtype.
    df = pd.get_dummies(train[c], dtype=int)
    train[df.columns] = df
    train = train.drop(c, axis=1)

    # Encoding test independently can produce a different column set: a
    # category missing from test would have no column at all, and a test-only
    # category would add a column the model never saw. Reindexing to the
    # training dummies fills the former with zeros and drops the latter.
    test_df = pd.get_dummies(test[c], dtype=int).reindex(columns=df.columns, fill_value=0)
    test[test_df.columns] = test_df
    test = test.drop(c, axis=1)

    # Track the training dummies (not the test ones) so that every dummy
    # column present in the training frame is recognised as a dummy later on.
    dummies_col.extend(df.columns)
other_feats.extend(dummies_col)

In [7]:
# Feature matrix: the training frame without the identifier, the Y_* columns
# and the timestamp. The class label is kept separately as the target.
train_X = train.drop(columns=['PRODUCT_ID','Y_Class', 'Y_Quality','TIMESTAMP'])
train_y = train['Y_Class']

# The test frame only needs its identifier and timestamp removed.
test = test.drop(columns=['PRODUCT_ID','TIMESTAMP'])

In [8]:
def get_values(value):
    """Return the data behind `value` as a 2-D array with a single column.

    `value` must expose a `.values` array (for example a pandas Series);
    that array is reshaped to shape (n, 1).
    """
    column = value.values
    return column.reshape((-1, 1))

# Standardise every non-dummy feature. A fresh scaler is fitted on the training
# column and the same fitted scaler is reused on the matching test column.
for col in train_X.columns:
    if col in dummies_col:
        continue
    scaler = StandardScaler()
    train_X[col] = scaler.fit_transform(get_values(train_X[col]))
    if col in test.columns:
        test[col] = scaler.transform(get_values(test[col]))

# Re-encode each dummy column as integer labels. One encoder object is refitted
# on the training column for every dummy column.
le = LabelEncoder()
for col in dummies_col:
    train_X[col] = le.fit_transform(train_X[col])
    if col not in test.columns:
        continue
    test[col] = le.transform(test[col])

In [9]:
from sklearn.model_selection import StratifiedKFold
# Stratified 5-fold splitter (shuffling is left at its default). The last line
# echoes the number of folds as the cell output.
skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(X=train_X, y=train_y)

5

In [None]:
from sklearn.metrics import f1_score

In [37]:
# Cross-validate a LightGBM multiclass classifier. For every fold, fit on the
# training part, predict the held-out part, and record both the fitted model
# and its macro-averaged F1 score.
models = []
scores = []
for train_index, test_index in skf.split(train_X, train_y):
    clf = lgb.LGBMClassifier(objective='multiclass')
    clf.fit(train_X.iloc[train_index], train_y.iloc[train_index])
    pred = clf.predict(train_X.iloc[test_index])
    # skf.split yields POSITIONAL indices, so the held-out labels must be
    # selected with .iloc like everything else. A plain train_y[test_index]
    # is a label-based lookup that is only correct for a default RangeIndex.
    score = f1_score(train_y.iloc[test_index], pred, average='macro')
    models.append(clf)
    scores.append(score)
# Per-fold scores, then their mean.
print(scores)
print(np.array(scores).mean())

[0.4573327164907264, 0.4704630575417092, 0.3848665870171247, 0.3806751487910908, 0.367784352399737]
0.4122243724480776


In [42]:
def softmax(a, axis=None):
    """Numerically stable softmax.

    Parameters
    ----------
    a : array-like
        Input scores (list, tuple or ndarray).
    axis : int or None, optional
        Axis along which to normalise. The default (None) normalises over
        all elements at once, so the whole result sums to 1. Pass e.g.
        ``axis=1`` to normalise each row of a 2-D input independently.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as `a` whose entries along `axis` sum to 1.
    """
    a = np.asarray(a)
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps np.exp from overflowing on large inputs.
    shifted = a - np.max(a, axis=axis, keepdims=True)
    exp_a = np.exp(shifted)
    # keepdims lets the per-axis sums broadcast back against exp_a.
    return exp_a / np.sum(exp_a, axis=axis, keepdims=True)

In [43]:
# Preview the ensemble weights: softmax over the fold scores rounded to two
# decimals.
rounded_scores = [round(x, 2) for x in scores]
softmax(rounded_scores)

array([0.2096342 , 0.21174106, 0.19351676, 0.19351676, 0.19159123])

In [28]:
from sklearn.ensemble import VotingClassifier

In [44]:
# Soft-voting ensemble of the cross-validation fold models, weighted by the
# softmax of their rounded validation macro-F1 scores.
estimators = [(str(i), m) for i, m in enumerate(models)]

eclf2 = VotingClassifier(estimators=estimators,
                         weights=softmax([round(x, 2) for x in scores]),
                         voting='soft')

# Calling eclf2.fit(train_X, train_y) would clone every estimator and retrain
# it on the full training set. The fold models would be thrown away and the
# five members would all be the same configuration fitted on the same data,
# so the score-based weights would have no effect. Instead, install the
# already-fitted fold models as the ensemble members by setting the fitted
# attributes that predict / predict_proba rely on.
eclf2.estimators_ = list(models)
eclf2.le_ = LabelEncoder().fit(train_y)
eclf2.classes_ = eclf2.le_.classes_

# The members' predict_proba columns are averaged position by position, so
# every fold model must expose exactly the ensemble's class order.
if not all(np.array_equal(m.classes_, eclf2.classes_) for m in models):
    raise ValueError('fold models do not share the same classes as train_y')

In [45]:
# Align the test frame with the exact columns, in the exact order, that the
# models were trained on before predicting. Columns missing from test are
# filled with 0 and columns unknown to the models are dropped. This is a no-op
# when the two frames already match.
preds = eclf2.predict(test.reindex(columns=train_X.columns, fill_value=0))

In [46]:
# Put the predictions into the sample submission's Y_Class column and save the
# result without the index.
submit = pd.read_csv('open/sample_submission.csv').assign(Y_Class=preds)
submit.to_csv('./baseline_submission.csv', index=False)