In [183]:
import pandas as pd
import numpy as np
import os
import warnings

# Silence every warning raised from here on (no category or module filter is given).
# NOTE(review): this presumably also hides solver-convergence and pandas
# performance warnings emitted by later cells — confirm that is intended.
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.metrics import classification_report
from time import time

In [184]:
# Load the training events indexed by row_id and cast `timestamp` to
# datetime64[ns]; straight out of read_csv the column comes back as object.
data = pd.read_csv("../data/raw/train.csv", index_col="row_id").astype(
    {"timestamp": "datetime64[ns]"}
)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37518 entries, 0 to 37517
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   user_id    37518 non-null  int64         
 1   timestamp  37518 non-null  datetime64[ns]
 2   gate_id    37518 non-null  int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 1.1 MB


In [185]:
# Derive the calendar/time features for the logistic regression from `timestamp`.
stamp = data["timestamp"].dt
data["hour"] = stamp.hour  # hour of day
data["minute"] = stamp.minute  # minute within the hour
data["second"] = stamp.second  # second within the minute
data["day"] = stamp.day  # day of month
data["dayofweek"] = stamp.dayofweek  # day of week as an integer
# Weekend flag: 1 when dayofweek is above 4, 0 otherwise.
data["is_weekend"] = data["dayofweek"].gt(4).astype("int64")
data.tail(5)

Unnamed: 0_level_0,user_id,timestamp,gate_id,hour,minute,second,day,dayofweek,is_weekend
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
37513,6,2022-12-31 20:38:56,11,20,38,56,31,5,1
37514,6,2022-12-31 20:39:22,6,20,39,22,31,5,1
37515,6,2022-12-31 20:39:23,6,20,39,23,31,5,1
37516,6,2022-12-31 20:39:31,9,20,39,31,31,5,1
37517,6,2022-12-31 20:39:31,9,20,39,31,31,5,1


In [186]:
# Seconds elapsed since midnight, assembled from the hour/minute/second parts.
seconds_since_midnight = (
    data["hour"].mul(3600) + data["minute"].mul(60) + data["second"]
)
data["time_to_sec"] = seconds_since_midnight
data.head(10)

Unnamed: 0_level_0,user_id,timestamp,gate_id,hour,minute,second,day,dayofweek,is_weekend,time_to_sec
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,18,2022-07-29 09:08:54,7,9,8,54,29,4,0,32934
1,18,2022-07-29 09:09:54,9,9,9,54,29,4,0,32994
2,18,2022-07-29 09:09:54,9,9,9,54,29,4,0,32994
3,18,2022-07-29 09:10:06,5,9,10,6,29,4,0,33006
4,18,2022-07-29 09:10:08,5,9,10,8,29,4,0,33008
5,18,2022-07-29 09:10:34,10,9,10,34,29,4,0,33034
6,18,2022-07-29 09:32:47,11,9,32,47,29,4,0,34367
7,18,2022-07-29 09:33:12,4,9,33,12,29,4,0,34392
8,18,2022-07-29 09:33:13,4,9,33,13,29,4,0,34393
9,1,2022-07-29 09:33:16,7,9,33,16,29,4,0,34396


In [187]:
# One-hot encode the categorical columns as int8 indicators; get_dummies
# replaces each listed column with its `<column>_<value>` indicator columns.
categorical_columns = ["dayofweek", "gate_id", "hour"]
data = pd.get_dummies(data, columns=categorical_columns, dtype="int8")
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37518 entries, 0 to 37517
Data columns (total 49 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   user_id      37518 non-null  int64         
 1   timestamp    37518 non-null  datetime64[ns]
 2   minute       37518 non-null  int32         
 3   second       37518 non-null  int32         
 4   day          37518 non-null  int32         
 5   is_weekend   37518 non-null  int64         
 6   time_to_sec  37518 non-null  int32         
 7   dayofweek_0  37518 non-null  int8          
 8   dayofweek_1  37518 non-null  int8          
 9   dayofweek_2  37518 non-null  int8          
 10  dayofweek_3  37518 non-null  int8          
 11  dayofweek_4  37518 non-null  int8          
 12  dayofweek_5  37518 non-null  int8          
 13  dayofweek_6  37518 non-null  int8          
 14  gate_id_-1   37518 non-null  int8          
 15  gate_id_0    37518 non-null  int8          
 16  gate_id_1

In [188]:
# Columns that will receive lagged copies: everything except the target
# (`user_id`), the raw timestamp and the seven day-of-week indicators.
not_lagged = ["user_id", "timestamp"] + [
    f"dayofweek_{weekday}" for weekday in range(7)
]
lst_feachers = data.columns.drop(not_lagged)
lst_feachers

Index(['minute', 'second', 'day', 'is_weekend', 'time_to_sec', 'gate_id_-1',
       'gate_id_0', 'gate_id_1', 'gate_id_3', 'gate_id_4', 'gate_id_5',
       'gate_id_6', 'gate_id_7', 'gate_id_8', 'gate_id_9', 'gate_id_10',
       'gate_id_11', 'gate_id_12', 'gate_id_13', 'gate_id_14', 'gate_id_15',
       'gate_id_16', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10',
       'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16',
       'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22',
       'hour_23'],
      dtype='object')

In [189]:
# For every feature in `lst_feachers`, add its value from the 1st..4th preceding
# row as `<feature>_lag_<n>`.
# All lag Series are built first and attached with a single concat: inserting
# 160 columns one at a time fragments the frame (pandas reports this as a
# PerformanceWarning, which the global warning filter hides).
lag_columns = {
    f"{feature}_lag_{lag}": data[feature].shift(lag)
    for feature in lst_feachers
    for lag in range(1, 5)
}
# Drop any lag columns left from a previous run of this cell so that re-running
# it replaces them instead of producing duplicate labels.
data = pd.concat(
    [
        data.drop(columns=list(lag_columns), errors="ignore"),
        pd.DataFrame(lag_columns),
    ],
    axis=1,
)

In [190]:
# Difference between `time_to_sec` of a row and of the row 1..4 positions
# earlier. The 1-step column has no numeric suffix; the others end in _2.._4.
# NOTE(review): time_to_sec restarts at midnight, so differences that span two
# calendar days are presumably not true elapsed seconds — confirm.
for step in range(1, 5):
    suffix = "" if step == 1 else f"_{step}"
    data[f"diff_time_to_sec{suffix}"] = data["time_to_sec"].diff(step)

In [191]:
# Discard rows that contain any missing value (the leading rows have no
# predecessors for their lag/diff columns), then summarise what is left.
data = data.dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37513 entries, 5 to 37517
Columns: 253 entries, user_id to diff_time_to_sec_4
dtypes: datetime64[ns](1), float64(204), int32(4), int64(2), int8(42)
memory usage: 61.6 MB


In [192]:
# Hold out part of the rows for evaluation; features are every column except
# the target (`user_id`) and the raw timestamp.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["user_id", "timestamp"]),
    data["user_id"],
    random_state=15,
)
# Standardise the features, then fit a logistic regression with the saga solver.
# saga shuffles the data internally, so it gets a fixed random_state; without
# it the score changes from run to run (a previous run recorded
# 17.123360699434908 for this same cell).
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(n_jobs=-2, solver="saga", random_state=15),
)
pipe.fit(X_train, y_train)
# Accuracy on the hold-out rows (in percent) and the per-class report.
print(pipe.score(X_test, y_test) * 100)
print(classification_report(y_test, pipe.predict(X_test)))

16.867469879518072
              precision    recall  f1-score   support

           0       0.11      0.05      0.07       315
           1       0.20      0.28      0.24       301
           2       0.00      0.00      0.00        12
           3       0.15      0.27      0.19       236
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         3
           6       0.11      0.06      0.08       477
           7       0.00      0.00      0.00        12
           8       0.00      0.00      0.00         8
           9       0.13      0.18      0.15       242
          10       0.00      0.00      0.00         5
          11       0.12      0.20      0.15       321
          12       0.19      0.35      0.25       550
          14       0.35      0.37      0.36       187
          15       0.14      0.20      0.16       436
          17       0.15      0.15      0.15       155
          18       0.29      0.23      0.26       391
        

In [47]:
# Load the events to be scored, indexed by row_id, with `timestamp` cast to
# datetime64[ns].
data_predict = pd.read_csv("../data/raw/test.csv", index_col="row_id").astype(
    {"timestamp": "datetime64[ns]"}
)
data_predict.head()

Unnamed: 0_level_0,timestamp,gate_id
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1
37518,2023-01-03 08:21:00,9
37519,2023-01-03 08:21:00,9
37520,2023-01-03 08:21:18,5
37521,2023-01-03 08:21:19,5
37522,2023-01-03 08:21:39,10


In [48]:
# How often each gate occurs in the test events, most frequent first.
gate_counts = data_predict["gate_id"].value_counts()
gate_counts

gate_id
 4     1459
 10     915
 3      909
 5      860
 9      780
 11     762
 7      540
 13     345
 6      294
 12     132
 15     102
 8       18
 2        4
 1        2
-1        2
 14       1
Name: count, dtype: int64

In [52]:
# Calendar/time features for the test events, derived from `timestamp`.
# NOTE(review): unlike the training cells this adds no `second`/`time_to_sec`,
# leaves `hour` numeric instead of one-hot, and lets get_dummies use its
# default dtype — confirm the model input is meant to differ.
predict_stamp = data_predict["timestamp"].dt
data_predict["hour"] = predict_stamp.hour  # hour of day
data_predict["minute"] = predict_stamp.minute  # minute within the hour
data_predict["day"] = predict_stamp.day  # day of month
data_predict["dayofweek"] = predict_stamp.dayofweek  # day of week as an integer
# Weekend flag: 1 when dayofweek is above 4, 0 otherwise.
data_predict["is_weekend"] = data_predict["dayofweek"].gt(4).astype("int64")

# One-hot encode day of week and gate.
data_predict = pd.get_dummies(data_predict, columns=["dayofweek", "gate_id"])
data_predict.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7125 entries, 37518 to 44642
Data columns (total 28 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   timestamp    7125 non-null   datetime64[ns]
 1   hour         7125 non-null   int32         
 2   minute       7125 non-null   int32         
 3   day          7125 non-null   int32         
 4   is_weekend   7125 non-null   int64         
 5   dayofweek_0  7125 non-null   bool          
 6   dayofweek_1  7125 non-null   bool          
 7   dayofweek_2  7125 non-null   bool          
 8   dayofweek_3  7125 non-null   bool          
 9   dayofweek_4  7125 non-null   bool          
 10  dayofweek_5  7125 non-null   bool          
 11  dayofweek_6  7125 non-null   bool          
 12  gate_id_-1   7125 non-null   bool          
 13  gate_id_1    7125 non-null   bool          
 14  gate_id_2    7125 non-null   bool          
 15  gate_id_3    7125 non-null   bool          
 16  gate_i

In [58]:
# Show the training frame's column labels — presumably to compare them with the
# test frame's columns before aligning the two (TODO confirm).
print(data.columns)

Index(['user_id', 'timestamp', 'hour', 'minute', 'day', 'is_weekend',
       'dayofweek_0', 'dayofweek_1', 'dayofweek_2', 'dayofweek_3',
       'dayofweek_4', 'dayofweek_5', 'dayofweek_6', 'gate_id_-1', 'gate_id_0',
       'gate_id_1', 'gate_id_3', 'gate_id_4', 'gate_id_5', 'gate_id_6',
       'gate_id_7', 'gate_id_8', 'gate_id_9', 'gate_id_10', 'gate_id_11',
       'gate_id_12', 'gate_id_13', 'gate_id_14', 'gate_id_15', 'gate_id_16'],
      dtype='object')


In [56]:
# The test data has no events for gate_id 0 (it could be dropped from train
# instead); add the indicator columns as constant zeros.
for absent_gate_column in ("gate_id_0", "gate_id_16"):
    data_predict[absent_gate_column] = 0
data_predict.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7125 entries, 37518 to 44642
Data columns (total 30 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   timestamp    7125 non-null   datetime64[ns]
 1   hour         7125 non-null   int32         
 2   minute       7125 non-null   int32         
 3   day          7125 non-null   int32         
 4   is_weekend   7125 non-null   int64         
 5   dayofweek_0  7125 non-null   bool          
 6   dayofweek_1  7125 non-null   bool          
 7   dayofweek_2  7125 non-null   bool          
 8   dayofweek_3  7125 non-null   bool          
 9   dayofweek_4  7125 non-null   bool          
 10  dayofweek_5  7125 non-null   bool          
 11  dayofweek_6  7125 non-null   bool          
 12  gate_id_-1   7125 non-null   bool          
 13  gate_id_1    7125 non-null   bool          
 14  gate_id_2    7125 non-null   bool          
 15  gate_id_3    7125 non-null   bool          
 16  gate_i

In [61]:
# Keep a fixed, ordered set of columns: timestamp and time parts, the seven
# day-of-week indicators, then the gate indicators (-1, 0, 1 and 3..16; there
# is no gate_id_2 in this list).
time_columns = ["timestamp", "hour", "minute", "day", "is_weekend"]
weekday_columns = [f"dayofweek_{weekday}" for weekday in range(7)]
gate_columns = [f"gate_id_{gate}" for gate in (-1, 0, 1, *range(3, 17))]
data_predict = data_predict[time_columns + weekday_columns + gate_columns]
data_predict.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7125 entries, 37518 to 44642
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   timestamp    7125 non-null   datetime64[ns]
 1   hour         7125 non-null   int32         
 2   minute       7125 non-null   int32         
 3   day          7125 non-null   int32         
 4   is_weekend   7125 non-null   int64         
 5   dayofweek_0  7125 non-null   bool          
 6   dayofweek_1  7125 non-null   bool          
 7   dayofweek_2  7125 non-null   bool          
 8   dayofweek_3  7125 non-null   bool          
 9   dayofweek_4  7125 non-null   bool          
 10  dayofweek_5  7125 non-null   bool          
 11  dayofweek_6  7125 non-null   bool          
 12  gate_id_-1   7125 non-null   bool          
 13  gate_id_0    7125 non-null   int64         
 14  gate_id_1    7125 non-null   bool          
 15  gate_id_3    7125 non-null   bool          
 16  gate_i

In [73]:
# The pipeline was fitted on X_train: minute/second/day/is_weekend/time_to_sec,
# the one-hot dayofweek/gate_id/hour indicators, their `_lag_<n>` copies and the
# `diff_time_to_sec*` columns. `data_predict` holds only the base columns, so
# scoring it directly fails on a feature mismatch. Rebuild the test features
# from the raw file with the training recipe and order them like X_train.
test_raw = pd.read_csv("../data/raw/test.csv", index_col="row_id")
test_stamp = test_raw["timestamp"].astype("datetime64[ns]").dt

test_base = pd.DataFrame(
    {
        "minute": test_stamp.minute,
        "second": test_stamp.second,
        "day": test_stamp.day,
        "is_weekend": (test_stamp.dayofweek > 4).astype("int64"),
        "time_to_sec": test_stamp.hour * 3600
        + test_stamp.minute * 60
        + test_stamp.second,
    }
)
test_dummies = pd.get_dummies(
    pd.DataFrame(
        {
            "dayofweek": test_stamp.dayofweek,
            "gate_id": test_raw["gate_id"],
            "hour": test_stamp.hour,
        }
    ),
    columns=["dayofweek", "gate_id", "hour"],
    dtype="int8",
)

train_columns = list(X_train.columns)
base_columns = [
    name
    for name in train_columns
    if "_lag_" not in name and not name.startswith("diff_time_to_sec")
]
# Indicators for categories that never occur in the test set (e.g. gate_id_0)
# become all-zero columns; categories the model never saw (e.g. gate_id_2) are
# dropped by the reindex.
test_base = pd.concat([test_base, test_dummies], axis=1).reindex(
    columns=base_columns, fill_value=0
)

# Recreate every lag/diff column the model was trained on, reading the source
# column and the step back out of the training column name.
derived_columns = {}
for name in train_columns:
    if "_lag_" in name:
        source, lag = name.rsplit("_lag_", 1)
        derived_columns[name] = test_base[source].shift(int(lag))
    elif name.startswith("diff_time_to_sec"):
        step = name[len("diff_time_to_sec"):].lstrip("_")
        derived_columns[name] = test_base["time_to_sec"].diff(int(step) if step else 1)

test_features = pd.concat(
    [test_base, pd.DataFrame(derived_columns)], axis=1
)[train_columns]
# Training dropped the leading rows that lack predecessors; every test row needs
# a prediction, so back-fill those leading gaps from the first complete row.
test_features = test_features.bfill()

data_predict["target"] = pipe.predict(test_features)

In [74]:
# Summarise the prediction frame (columns, dtypes, non-null counts) now that the
# "target" column has been added.
data_predict.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7125 entries, 37518 to 44642
Data columns (total 30 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   timestamp    7125 non-null   datetime64[ns]
 1   hour         7125 non-null   int32         
 2   minute       7125 non-null   int32         
 3   day          7125 non-null   int32         
 4   is_weekend   7125 non-null   int64         
 5   dayofweek_0  7125 non-null   bool          
 6   dayofweek_1  7125 non-null   bool          
 7   dayofweek_2  7125 non-null   bool          
 8   dayofweek_3  7125 non-null   bool          
 9   dayofweek_4  7125 non-null   bool          
 10  dayofweek_5  7125 non-null   bool          
 11  dayofweek_6  7125 non-null   bool          
 12  gate_id_-1   7125 non-null   bool          
 13  gate_id_0    7125 non-null   int64         
 14  gate_id_1    7125 non-null   bool          
 15  gate_id_3    7125 non-null   bool          
 16  gate_i

In [79]:
# Write the predictions (row_id index plus the "target" column) as the
# submission file. The extension was ".sv", which looks like a typo for ".csv".
data_out = data_predict["target"]
data_out.to_csv("../data/raw/sample_submission_1.csv")