In [1]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import cohen_kappa_score

from tqdm.notebook import tqdm

In [2]:
train = pd.read_csv("/kaggle/input/petfinder-adoption-prediction/train/train.csv", index_col="PetID") 
test = pd.read_csv("/kaggle/input/petfinder-adoption-prediction/test/test.csv", index_col="PetID")
ss = pd.read_csv("/kaggle/input/petfinder-adoption-prediction/test/sample_submission.csv", index_col="PetID")

label = "AdoptionSpeed"
X_train = train.drop(label, axis=1).select_dtypes(exclude="O")
y_train = train[label]

X_test = test[X_train.columns]

In [3]:
valid_preds = []
test_preds = []

for train_index, valid_index in tqdm(KFold(n_splits=5, shuffle=True).split(X_train), total=5):
    Xt = X_train.iloc[train_index]
    yt = y_train.iloc[train_index]
    Xv = X_train.iloc[valid_index]
    
    model = LGBMClassifier()
    model.fit(Xt, yt)

    valid_preds.append(
        pd.Series(model.predict(Xv), index=Xv.index, name="AdoptionSpeed")
    )
    test_preds.append(
        pd.Series(model.predict(X_test), index=X_test.index, name="AdoptionSpeed")
    ) 

valid_preds = pd.concat(valid_preds)

  0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003188 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 439
[LightGBM] [Info] Number of data points in the train set: 11994, number of used features: 19
[LightGBM] [Info] Start training from score -3.599148
[LightGBM] [Info] Start training from score -1.589135
[LightGBM] [Info] Start training from score -1.320631
[LightGBM] [Info] Start training from score -1.523525
[LightGBM] [Info] Start training from score -1.260043
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001135 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 439
[LightGBM] [Info] Number of data points in the train set: 11994, number of used features: 19
[LightGBM] [Info] Start tr

In [4]:
cohen_kappa_score(y_train, valid_preds.loc[y_train.index], weights="quadratic")

0.3540575587335463

In [5]:
pd.concat(test_preds, axis=1).median(axis=1).to_csv("submission.csv")

In [6]:
!head submission.csv

PetID,0
e2dfc2935,4.0
f153b465f,3.0
3c90f3f54,2.0
e02abc8a3,4.0
09f0df7d1,4.0
0487529d4,2.0
bae7c4b1c,4.0
548bcf206,2.0
0f82cea1e,4.0
