In [1]:
import pandas as pd
import numpy as np
import os
import warnings

# Install a blanket "ignore" filter: no category, message or module is given,
# so every warning raised after this line is suppressed (including pandas and
# sklearn warnings that could point at real problems further down).
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.metrics import classification_report
from catboost import CatBoostClassifier, Pool

In [2]:
# Load the feature table keyed by `row_id`, discard the `timestamp` column and
# the `timestamp_lag_1` .. `timestamp_lag_4` columns, then replace every
# missing value with 0. Written as one chain so the cell yields the final
# frame in a single step without in-place mutation.
data = (
    pd.read_csv("../data/external/public_train_features.csv", index_col="row_id")
    .drop(columns=["timestamp", *(f"timestamp_lag_{lag}" for lag in range(1, 5))])
    .fillna(value=0)
)

In [3]:
# Names of the numeric time-derived columns, built from their naming pattern:
#   * gate time differences for lags 1-4,
#   * the five base time features,
#   * each base feature at lags 1-4 (grouped per feature),
#   * time-to-sec differences 2-4.
# The resulting order matters only for display; it matches the column order
# used when summarising the frame.
lst_to_norm = [
    *(f"diff_time_gate_lag_{lag}" for lag in range(1, 5)),
    "minute",
    "second",
    "day",
    "time_to_sec",
    "dayweek",
    *(
        f"{base}_lag_{lag}"
        for base in ("minute", "second", "day", "time_to_sec", "dayweek")
        for lag in range(1, 5)
    ),
    *(f"diff_time_to_sec_{step}" for step in range(2, 5)),
]

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# time-derived columns listed in `lst_to_norm`; displayed as the cell output.
data.loc[:, lst_to_norm].describe()

Unnamed: 0,diff_time_gate_lag_1,diff_time_gate_lag_2,diff_time_gate_lag_3,diff_time_gate_lag_4,minute,second,day,time_to_sec,dayweek,minute_lag_1,...,time_to_sec_lag_2,time_to_sec_lag_3,time_to_sec_lag_4,dayweek_lag_1,dayweek_lag_2,dayweek_lag_3,dayweek_lag_4,diff_time_to_sec_2,diff_time_to_sec_3,diff_time_to_sec_4
count,44643.0,44643.0,44643.0,44643.0,44643.0,44643.0,44643.0,44643.0,44643.0,44643.0,...,44643.0,44643.0,44643.0,44643.0,44643.0,44643.0,44643.0,44643.0,44643.0,44643.0
mean,5649.55,11146.24,16056.17,20888.82,29.18247,29.468136,15.715431,52937.300943,2.157561,29.181484,...,52934.117958,52932.526645,52930.935354,2.157471,2.157382,2.157292,2.157203,1.706203,2.558453,3.410411
std,64929.05,92722.0,107680.8,120798.6,17.269915,17.319715,9.086933,13117.605029,1.500084,17.270325,...,13121.829271,13123.94084,13126.051908,1.500093,1.500103,1.500112,1.500121,3910.069039,4784.678004,5525.638387
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3172.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-81630.0,-81624.0,-81623.0
25%,2.0,501.0,923.0,1531.0,14.0,15.0,7.0,41104.0,1.0,14.0,...,41102.0,41102.0,41102.0,1.0,1.0,1.0,1.0,19.0,37.0,76.0
50%,289.0,1353.0,2248.0,3271.0,29.0,29.0,16.0,52045.0,2.0,29.0,...,52043.0,52041.0,52041.0,2.0,2.0,2.0,2.0,52.0,155.0,286.0
75%,1404.0,3356.0,5198.0,7286.5,44.0,44.0,23.0,64807.0,3.0,44.0,...,64806.0,64806.0,64806.0,3.0,3.0,3.0,3.0,298.0,531.0,741.0
max,7077913.0,7180401.0,8545990.0,12360090.0,59.0,59.0,31.0,85722.0,6.0,59.0,...,85722.0,85722.0,85722.0,6.0,6.0,6.0,6.0,52805.0,52807.0,55152.0


In [5]:
# Load the label table, indexed by `row_id` so it can be joined to `data`
# on the index.
# NOTE(review): presumably this file supplies the `user_id` column used as
# the training label below — confirm against the file's header.
target = pd.read_csv("../data/interim/target.csv", index_col="row_id")

In [6]:
# Index-on-index inner join: keep only the feature rows that also appear in
# `target`, then skip the first four joined rows.
# NOTE(review): presumably the first four rows are dropped because their
# lag_1..lag_4 features have no real history — confirm.
data_train = data.join(target, how="inner").iloc[4:]

In [7]:
# Rows whose index label is 37518 or greater form the prediction set
# (`.loc` slicing is label-based and includes the start label).
# NOTE(review): 37518 is presumably the first row_id without a label in
# `target` — confirm, and consider deriving it from `target` instead of
# hard-coding it.
# `.copy()` detaches the result from `data`, so later changes to `data_test`
# operate on an independent frame instead of a slice of `data` (writing to
# such a slice is what pandas reports as SettingWithCopyWarning).
data_test = data.loc[37518:].copy()

In [8]:
# Hold out 25% of the labelled rows for validation. `user_id` is the label;
# every other column of `data_train` is used as a feature. The fixed
# `random_state` keeps the shuffle identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data_train.drop(columns="user_id"),
    data_train["user_id"],
    test_size=0.25,
    random_state=15,
)

In [9]:
# pipe = make_pipeline(
#     StandardScaler(), LogisticRegression(n_jobs=-3, solver="saga", max_iter=200, multi_class="ovr", C=0.01)
# )
# pipe.fit(X_train, y_train)

# print(pipe.score(X_test, y_test) * 100)
# print(classification_report(y_test, pipe.predict(X_test)))
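# # Recorded result of this baseline: 17.304616696876 (presumably the percentage printed by the first print above; confirm)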

In [14]:
# CatBoost classifier configuration:
#   * up to 2000 boosting iterations at learning rate 0.01, on 4 threads;
#   * the evaluation set is scored with TotalF1;
#   * early_stopping_rounds=50 together with use_best_model=True (see the
#     CatBoost docs: training stops once the metric has not improved for 50
#     iterations and the best-scoring iteration is kept).
clf = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.01,
    eval_metric="TotalF1",
    early_stopping_rounds=50,
    use_best_model=True,
    thread_count=4,
)
# The hold-out split doubles as the evaluation set that drives early stopping,
# so scores measured on it afterwards are optimistic.
clf.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=False)

<catboost.core.CatBoostClassifier at 0x7f1d825d59d0>

In [11]:
# The bare `stop` sentinel that used to sit in this cell raised NameError on
# every run and aborted "Restart & Run All" before the prediction cells.
# It has been removed so the notebook executes from top to bottom.

In [15]:
# Predict on the feature columns only. Dropping a pre-existing "target"
# column first keeps this cell re-runnable: on a second run the previous
# predictions are not passed to the model as an extra feature.
test_features = data_test.drop(columns="target", errors="ignore")
# `np.ravel` flattens the prediction array to 1-D (CatBoost can return an
# (n, 1) array for multiclass models). `assign` builds a new frame rather
# than writing a column into `data_test`, which may be a slice of `data`.
data_test = test_features.assign(target=np.ravel(clf.predict(test_features)))
data_test["target"]

row_id
37518    18
37519    18
37520     3
37521     3
37522    15
         ..
44638    37
44639     6
44640    19
44641    54
44642    54
Name: target, Length: 7125, dtype: int64

In [16]:
# Write the predicted labels, indexed by row_id, as the submission file.
# The extension was previously ".sv", an apparent typo for ".csv".
data_out = data_test["target"]
data_out.to_csv("../data/processed/sample_submission_2.csv")