In [1]:
import pandas as pd
import numpy as np
import os
import warnings

warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.metrics import classification_report
from time import time

In [2]:
# Прочитаем данные, переведем timestamp в формат datetime64, изначально он считывается как object
data = pd.read_csv("../data/raw/train.csv", index_col="row_id")
data["timestamp"] = data["timestamp"].astype("datetime64[ns]")
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37518 entries, 0 to 37517
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   user_id    37518 non-null  int64         
 1   timestamp  37518 non-null  datetime64[ns]
 2   gate_id    37518 non-null  int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 1.1 MB


In [3]:
# очитска данных
data = data[(data["gate_id"] != 0)]
data = data[(data["gate_id"] != 16)]
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37512 entries, 0 to 37517
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   user_id    37512 non-null  int64         
 1   timestamp  37512 non-null  datetime64[ns]
 2   gate_id    37512 non-null  int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 1.1 MB


In [4]:
data["timestamp_lag_1"] = data.groupby("gate_id")["timestamp"].shift(1)
data["timestamp_lag_2"] = data.groupby("gate_id")["timestamp"].shift(2)
data["timestamp_lag_3"] = data.groupby("gate_id")["timestamp"].shift(3)
data["timestamp_lag_4"] = data.groupby("gate_id")["timestamp"].shift(4)

In [5]:
data["diff_time_gate_lag_1"] = (
    data["timestamp"] - data["timestamp_lag_1"]
) / pd.Timedelta(seconds=1)
data["diff_time_gate_lag_2"] = (
    data["timestamp"] - data["timestamp_lag_2"]
) / pd.Timedelta(seconds=1)
data["diff_time_gate_lag_3"] = (
    data["timestamp"] - data["timestamp_lag_3"]
) / pd.Timedelta(seconds=1)
data["diff_time_gate_lag_4"] = (
    data["timestamp"] - data["timestamp_lag_4"]
) / pd.Timedelta(seconds=1)

data.fillna(value=0, inplace=True)
# median_diff_time_gate_lag_1 = data["diff_time_gate_lag_1"].median()
# median_diff_time_gate_lag_2 = data["diff_time_gate_lag_2"].median()
# median_diff_time_gate_lag_3 = data["diff_time_gate_lag_3"].median()
# median_diff_time_gate_lag_4 = data["diff_time_gate_lag_4"].median()

# data["diff_time_gate_lag_1"].fillna(value=median_diff_time_gate_lag_1, inplace=True)
# data["diff_time_gate_lag_2"].fillna(value=median_diff_time_gate_lag_2, inplace=True)
# data["diff_time_gate_lag_3"].fillna(value=median_diff_time_gate_lag_3, inplace=True)
# data["diff_time_gate_lag_4"].fillna(value=median_diff_time_gate_lag_4, inplace=True)

data.replace({pd.NaT: "0"}, inplace=True)
data.head(50)

Unnamed: 0_level_0,user_id,timestamp,gate_id,timestamp_lag_1,timestamp_lag_2,timestamp_lag_3,timestamp_lag_4,diff_time_gate_lag_1,diff_time_gate_lag_2,diff_time_gate_lag_3,diff_time_gate_lag_4
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,18,2022-07-29 09:08:54,7,0,0,0,0,0.0,0.0,0.0,0.0
1,18,2022-07-29 09:09:54,9,0,0,0,0,0.0,0.0,0.0,0.0
2,18,2022-07-29 09:09:54,9,2022-07-29 09:09:54,0,0,0,0.0,0.0,0.0,0.0
3,18,2022-07-29 09:10:06,5,0,0,0,0,0.0,0.0,0.0,0.0
4,18,2022-07-29 09:10:08,5,2022-07-29 09:10:06,0,0,0,2.0,0.0,0.0,0.0
5,18,2022-07-29 09:10:34,10,0,0,0,0,0.0,0.0,0.0,0.0
6,18,2022-07-29 09:32:47,11,0,0,0,0,0.0,0.0,0.0,0.0
7,18,2022-07-29 09:33:12,4,0,0,0,0,0.0,0.0,0.0,0.0
8,18,2022-07-29 09:33:13,4,2022-07-29 09:33:12,0,0,0,1.0,0.0,0.0,0.0
9,1,2022-07-29 09:33:16,7,2022-07-29 09:08:54,0,0,0,1462.0,0.0,0.0,0.0


In [6]:
# создадим фичи для логистической регрессии на основе timestamp
# часы
data["hour"] = data["timestamp"].dt.hour
# минуты
data["minute"] = data["timestamp"].dt.minute
data["second"] = data["timestamp"].dt.second
# число
data["day"] = data["timestamp"].dt.day
# день недели
data["dayofweek"] = data["timestamp"].dt.dayofweek
# если выходной то 1, будни 0
data["is_weekend"] = data["dayofweek"] > 4
data["is_weekend"] = data["is_weekend"].apply(int)

data = data[data["hour"] != 6]

data.tail(5)

Unnamed: 0_level_0,user_id,timestamp,gate_id,timestamp_lag_1,timestamp_lag_2,timestamp_lag_3,timestamp_lag_4,diff_time_gate_lag_1,diff_time_gate_lag_2,diff_time_gate_lag_3,diff_time_gate_lag_4,hour,minute,second,day,dayofweek,is_weekend
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
37513,6,2022-12-31 20:38:56,11,2022-12-30 20:34:29,2022-12-30 19:46:47,2022-12-30 19:32:28,2022-12-30 19:31:45,86667.0,89529.0,90388.0,90431.0,20,38,56,31,5,1
37514,6,2022-12-31 20:39:22,6,2022-12-30 20:35:00,2022-12-30 20:34:58,2022-12-30 19:15:34,2022-12-30 19:15:33,86662.0,86664.0,91428.0,91429.0,20,39,22,31,5,1
37515,6,2022-12-31 20:39:23,6,2022-12-31 20:39:22,2022-12-30 20:35:00,2022-12-30 20:34:58,2022-12-30 19:15:34,1.0,86663.0,86665.0,91429.0,20,39,23,31,5,1
37516,6,2022-12-31 20:39:31,9,2022-12-31 17:20:40,2022-12-31 17:20:40,2022-12-30 20:35:06,2022-12-30 20:35:06,11931.0,11931.0,86665.0,86665.0,20,39,31,31,5,1
37517,6,2022-12-31 20:39:31,9,2022-12-31 20:39:31,2022-12-31 17:20:40,2022-12-31 17:20:40,2022-12-30 20:35:06,0.0,11931.0,11931.0,86665.0,20,39,31,31,5,1


In [7]:
data["time_to_sec"] = (
    data["hour"] * 3600 + data["minute"] * 60 + data["second"]
)
data.head(10)

Unnamed: 0_level_0,user_id,timestamp,gate_id,timestamp_lag_1,timestamp_lag_2,timestamp_lag_3,timestamp_lag_4,diff_time_gate_lag_1,diff_time_gate_lag_2,diff_time_gate_lag_3,diff_time_gate_lag_4,hour,minute,second,day,dayofweek,is_weekend,time_to_sec
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,18,2022-07-29 09:08:54,7,0,0,0,0,0.0,0.0,0.0,0.0,9,8,54,29,4,0,32934
1,18,2022-07-29 09:09:54,9,0,0,0,0,0.0,0.0,0.0,0.0,9,9,54,29,4,0,32994
2,18,2022-07-29 09:09:54,9,2022-07-29 09:09:54,0,0,0,0.0,0.0,0.0,0.0,9,9,54,29,4,0,32994
3,18,2022-07-29 09:10:06,5,0,0,0,0,0.0,0.0,0.0,0.0,9,10,6,29,4,0,33006
4,18,2022-07-29 09:10:08,5,2022-07-29 09:10:06,0,0,0,2.0,0.0,0.0,0.0,9,10,8,29,4,0,33008
5,18,2022-07-29 09:10:34,10,0,0,0,0,0.0,0.0,0.0,0.0,9,10,34,29,4,0,33034
6,18,2022-07-29 09:32:47,11,0,0,0,0,0.0,0.0,0.0,0.0,9,32,47,29,4,0,34367
7,18,2022-07-29 09:33:12,4,0,0,0,0,0.0,0.0,0.0,0.0,9,33,12,29,4,0,34392
8,18,2022-07-29 09:33:13,4,2022-07-29 09:33:12,0,0,0,1.0,0.0,0.0,0.0,9,33,13,29,4,0,34393
9,1,2022-07-29 09:33:16,7,2022-07-29 09:08:54,0,0,0,1462.0,0.0,0.0,0.0,9,33,16,29,4,0,34396


In [8]:
data["dayweek"] = data["dayofweek"]
data = pd.get_dummies(
    data, columns=["dayofweek", "gate_id", "hour"], dtype="int8"
)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37506 entries, 0 to 37517
Data columns (total 55 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   user_id               37506 non-null  int64         
 1   timestamp             37506 non-null  datetime64[ns]
 2   timestamp_lag_1       37506 non-null  object        
 3   timestamp_lag_2       37506 non-null  object        
 4   timestamp_lag_3       37506 non-null  object        
 5   timestamp_lag_4       37506 non-null  object        
 6   diff_time_gate_lag_1  37506 non-null  float64       
 7   diff_time_gate_lag_2  37506 non-null  float64       
 8   diff_time_gate_lag_3  37506 non-null  float64       
 9   diff_time_gate_lag_4  37506 non-null  float64       
 10  minute                37506 non-null  int32         
 11  second                37506 non-null  int32         
 12  day                   37506 non-null  int32         
 13  is_weekend           

In [9]:
lst_feachers = data.drop(
    columns=[
        "user_id",
        "timestamp",
        "timestamp_lag_1",
        "timestamp_lag_2",
        "timestamp_lag_3",
        "timestamp_lag_4",
        "diff_time_gate_lag_1",
        "diff_time_gate_lag_2",
        "diff_time_gate_lag_3",
        "diff_time_gate_lag_4",
        "dayofweek_0",
        "dayofweek_1",
        "dayofweek_2",
        "dayofweek_3",
        "dayofweek_4",
        "dayofweek_5",
        "dayofweek_6",
    ]
).columns
lst_feachers

Index(['minute', 'second', 'day', 'is_weekend', 'time_to_sec', 'dayweek',
       'gate_id_-1', 'gate_id_1', 'gate_id_3', 'gate_id_4', 'gate_id_5',
       'gate_id_6', 'gate_id_7', 'gate_id_8', 'gate_id_9', 'gate_id_10',
       'gate_id_11', 'gate_id_12', 'gate_id_13', 'gate_id_14', 'gate_id_15',
       'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12',
       'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18',
       'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23'],
      dtype='object')

In [10]:
for i in lst_feachers:
    for j in range(1, 5):
        data[f"{i}_lag_{j}"] = data[f"{i}"].shift(j)

In [11]:
data["diff_time_to_sec"] = data["time_to_sec"].diff(1)
data["diff_time_to_sec_2"] = data["time_to_sec"].diff(2)
data["diff_time_to_sec_3"] = data["time_to_sec"].diff(3)
data["diff_time_to_sec_4"] = data["time_to_sec"].diff(4)

data["diff_minute"] = data["minute"].diff(1)
data["diff_minute_2"] = data["minute"].diff(2)
data["diff_minute_3"] = data["minute"].diff(3)
data["diff_minute_4"] = data["minute"].diff(4)

In [12]:
data.dropna(inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37502 entries, 4 to 37517
Columns: 215 entries, user_id to diff_minute_4
dtypes: datetime64[ns](1), float64(164), int32(5), int64(2), int8(39), object(4)
memory usage: 51.3+ MB


In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(
        columns=[
            "user_id",
            "timestamp",
            "timestamp_lag_1",
            "timestamp_lag_2",
            "timestamp_lag_3",
            "timestamp_lag_4",
        ]
    ),
    data["user_id"],
    random_state=15,
)
pipe = make_pipeline(
    StandardScaler(), LogisticRegression(n_jobs=-1, solver="saga")
)
pipe.fit(X_train, y_train)

print(pipe.score(X_test, y_test) * 100)
print(classification_report(y_test, pipe.predict(X_test)))
# 16.627559726962456

16.616894197952217
              precision    recall  f1-score   support

           0       0.08      0.04      0.05       317
           1       0.21      0.31      0.25       300
           2       0.00      0.00      0.00        10
           3       0.17      0.25      0.20       239
           5       0.00      0.00      0.00         2
           6       0.16      0.07      0.10       482
           7       0.00      0.00      0.00        13
           8       0.00      0.00      0.00         4
           9       0.12      0.13      0.13       261
          10       0.00      0.00      0.00         1
          11       0.14      0.18      0.16       333
          12       0.16      0.36      0.22       494
          14       0.31      0.37      0.34       183
          15       0.16      0.18      0.17       472
          17       0.11      0.10      0.10       186
          18       0.31      0.21      0.25       424
          19       0.17      0.20      0.19       447
        

In [14]:
stop

NameError: name 'stop' is not defined

In [None]:
data_predict = pd.read_csv(
    "../data/raw/test_plus_4_raw.csv", index_col="row_id"
)
data_predict["timestamp"] = data_predict["timestamp"].astype("datetime64[ns]")
data_predict.head()

In [None]:
data_predict["hour"] = data_predict["timestamp"].dt.hour
# минуты
data_predict["minute"] = data_predict["timestamp"].dt.minute
data_predict["second"] = data_predict["timestamp"].dt.second
# число
data_predict["day"] = data_predict["timestamp"].dt.day
# день недели
data_predict["dayofweek"] = data_predict["timestamp"].dt.dayofweek
# если выходной то 1, будни 0
data_predict["is_weekend"] = data_predict["dayofweek"] > 4
data_predict["is_weekend"] = data_predict["is_weekend"].apply(int)

data_predict["time_to_sec"] = (
    data_predict["hour"] * 3600
    + data_predict["minute"] * 60
    + data_predict["second"]
)

data_predict["dayweek"] = data_predict["dayofweek"]
data_predict = pd.get_dummies(
    data_predict, columns=["dayofweek", "gate_id", "hour"], dtype="int8"
)

for i in lst_feachers:
    for j in range(1, 5):
        data_predict[f"{i}_lag_{j}"] = data_predict[f"{i}"].shift(j)

data_predict["diff_time_to_sec"] = data_predict["time_to_sec"].diff(1)
data_predict["diff_time_to_sec_2"] = data_predict["time_to_sec"].diff(2)
data_predict["diff_time_to_sec_3"] = data_predict["time_to_sec"].diff(3)
data_predict["diff_time_to_sec_4"] = data_predict["time_to_sec"].diff(4)

data_predict["diff_minute"] = data_predict["minute"].diff(1)
data_predict["diff_minute_2"] = data_predict["minute"].diff(2)
data_predict["diff_minute_3"] = data_predict["minute"].diff(3)
data_predict["diff_minute_4"] = data_predict["minute"].diff(4)


data_predict.info()

In [None]:
lst_column_for_predict = list(data.drop(columns=["user_id"]).columns)

In [None]:
data_predict = data_predict[[*lst_column_for_predict]]
data_predict.info()
data_predict.dropna(inplace=True)

In [None]:
data_predict["target"] = pipe.predict(data_predict.drop(columns=["timestamp"]))

In [None]:
data_predict.info()

In [None]:
data_out = data_predict["target"]
data_out.to_csv("../data/raw/sample_submission_4.sv")