In [1]:
import pandas as pd
import numpy as np
import os
import warnings

warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.metrics import classification_report
from catboost import CatBoostClassifier, Pool

In [2]:
data = pd.read_csv(
    "../data/external/public_train_features_5.csv", index_col="row_id"
)

In [3]:
data.drop(
    columns=[
        "timestamp",
        "timestamp_lag_1",
        "timestamp_lag_2",
        "timestamp_lag_3",
        "timestamp_lag_4",
        "timestamp_lag_5",
    ],
    inplace=True,
)
data.fillna(value=0, inplace=True)

In [4]:
lst_to_norm = [
    "diff_time_gate_lag_1",
    "diff_time_gate_lag_2",
    "diff_time_gate_lag_3",
    "diff_time_gate_lag_4",
    "diff_time_gate_lag_5",
    "minute",
    "second",
    "day",
    "time_to_sec",
    "dayweek",
    "minute_lag_1",
    "minute_lag_2",
    "minute_lag_3",
    "minute_lag_4",
    "second_lag_1",
    "second_lag_2",
    "second_lag_3",
    "second_lag_4",
    "day_lag_1",
    "day_lag_2",
    "day_lag_3",
    "day_lag_4",
    "time_to_sec_lag_1",
    "time_to_sec_lag_2",
    "time_to_sec_lag_3",
    "time_to_sec_lag_4",
    "dayweek_lag_1",
    "dayweek_lag_2",
    "dayweek_lag_3",
    "dayweek_lag_4",
    "diff_time_to_sec_2",
    "diff_time_to_sec_3",
    "diff_time_to_sec_4",
]

In [5]:
target = pd.read_csv("../data/interim/target.csv", index_col="row_id")

In [6]:
data_train = data.join(target, on=None, how="inner").iloc[5:]

In [7]:
data_test = data.loc[37518:]

In [8]:
# X_train, X_test, y_train, y_test = train_test_split(
#     data_train.drop(columns=["user_id"]),
#     data_train["user_id"],
#     random_state=15,
#     test_size=0.25,
# )

In [9]:
# pipe = make_pipeline(
#     StandardScaler(), LogisticRegression(n_jobs=-3, solver="saga", max_iter=200, multi_class="ovr", C=0.01)
# )
# pipe.fit(X_train, y_train)

# print(pipe.score(X_test, y_test) * 100)
# print(classification_report(y_test, pipe.predict(X_test)))
# # 17.304616696876

In [10]:
clf = CatBoostClassifier(
    thread_count=4,
    iterations=5000,
    early_stopping_rounds=100,
    eval_metric="TotalF1",
    # use_best_model=True,
    learning_rate=0.01,
)
clf.fit(
    data_train.drop(columns=["user_id"]),
    data_train["user_id"],
    verbose=200
    # , eval_set=(X_test, y_test)
)

0:	learn: 0.0447301	total: 1.04s	remaining: 1h 26m 58s
200:	learn: 0.1428591	total: 3m 16s	remaining: 1h 18m 2s
400:	learn: 0.1797048	total: 6m 30s	remaining: 1h 14m 41s
600:	learn: 0.2078786	total: 9m 41s	remaining: 1h 10m 57s
800:	learn: 0.2320587	total: 12m 52s	remaining: 1h 7m 31s
1000:	learn: 0.2563073	total: 16m 5s	remaining: 1h 4m 16s
1200:	learn: 0.2799818	total: 19m 20s	remaining: 1h 1m 10s
1400:	learn: 0.3028803	total: 22m 32s	remaining: 57m 55s
1600:	learn: 0.3243419	total: 25m 44s	remaining: 54m 38s
1800:	learn: 0.3460378	total: 28m 56s	remaining: 51m 24s
2000:	learn: 0.3658754	total: 32m 10s	remaining: 48m 13s
2200:	learn: 0.3852815	total: 35m 23s	remaining: 45m 1s
2400:	learn: 0.4034741	total: 38m 37s	remaining: 41m 48s
2600:	learn: 0.4204174	total: 41m 52s	remaining: 38m 37s
2800:	learn: 0.4372748	total: 45m 5s	remaining: 35m 23s
3000:	learn: 0.4535166	total: 48m 17s	remaining: 32m 10s
3200:	learn: 0.4689851	total: 51m 32s	remaining: 28m 57s
3400:	learn: 0.4836153	total:

<catboost.core.CatBoostClassifier at 0x7f4bac632040>

In [14]:
clf.best_iteration_

8419

In [15]:
print(classification_report(y_test, clf.predict(X_test)))
#     accuracy                           0.34      9379
#    macro avg       0.41      0.29      0.31      9379
# weighted avg       0.36      0.34      0.33      9379

              precision    recall  f1-score   support

           0       0.33      0.23      0.27       315
           1       0.40      0.39      0.40       301
           2       1.00      0.25      0.40        12
           3       0.40      0.53      0.45       236
           4       0.00      0.00      0.00         1
           5       0.50      0.33      0.40         3
           6       0.34      0.25      0.29       477
           7       0.60      0.50      0.55        12
           8       1.00      0.75      0.86         8
           9       0.27      0.31      0.29       242
          10       1.00      0.40      0.57         5
          11       0.26      0.33      0.29       321
          12       0.33      0.55      0.41       550
          14       0.66      0.70      0.68       187
          15       0.38      0.47      0.42       436
          17       0.43      0.52      0.47       155
          18       0.52      0.39      0.44       391
          19       0.36    

In [13]:
stop

NameError: name 'stop' is not defined

In [11]:
data_test["target"] = clf.predict(data_test)
data_test["target"]

row_id
37518    18
37519    18
37520    15
37521    15
37522    15
         ..
44638    37
44639    37
44640    49
44641    49
44642    49
Name: target, Length: 7125, dtype: int64

In [12]:
data_out = data_test["target"]
data_out.to_csv("../data/processed/sample_submission_2.sv")