In [44]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.metrics import classification_report
from time import time

In [45]:
# Seed passed as random_state to train_test_split so the split is repeatable.
SEED = 15

In [46]:
# Load the training data indexed by row_id. The timestamp column is read as
# object, so it is cast to datetime64 as part of the same expression.
data = (
    pd.read_csv("../data/raw/train.csv", index_col="row_id")
    .astype({"timestamp": "datetime64[ns]"})
)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37518 entries, 0 to 37517
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   user_id    37518 non-null  int64         
 1   timestamp  37518 non-null  datetime64[ns]
 2   gate_id    37518 non-null  int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 1.1 MB


In [47]:
# Derive calendar features for the logistic regression from timestamp.
# assign() evaluates its keyword arguments in order, so is_weekend can read
# the dayofweek column created just before it.
data = data.assign(
    hour=data["timestamp"].dt.hour,            # hour of day
    minute=data["timestamp"].dt.minute,        # minute of the hour
    day=data["timestamp"].dt.day,              # day of month
    dayofweek=data["timestamp"].dt.dayofweek,  # Monday=0 ... Sunday=6
    # 1 on weekends (dayofweek 5 or 6), 0 on weekdays. A vectorized cast
    # replaces the element-wise apply(int) and keeps the int64 dtype.
    is_weekend=lambda frame: (frame["dayofweek"] > 4).astype("int64"),
)
data.head(5)

Unnamed: 0_level_0,user_id,timestamp,gate_id,hour,minute,day,dayofweek,is_weekend
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,18,2022-07-29 09:08:54,7,9,8,29,4,0
1,18,2022-07-29 09:09:54,9,9,9,29,4,0
2,18,2022-07-29 09:09:54,9,9,9,29,4,0
3,18,2022-07-29 09:10:06,5,9,10,29,4,0
4,18,2022-07-29 09:10:08,5,9,10,29,4,0


In [48]:
# One-hot encode the categorical columns: every distinct dayofweek / gate_id
# value becomes its own indicator column and the source columns are removed.
data = pd.get_dummies(
    data=data,
    columns=["dayofweek", "gate_id"],
)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37518 entries, 0 to 37517
Data columns (total 30 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   user_id      37518 non-null  int64         
 1   timestamp    37518 non-null  datetime64[ns]
 2   hour         37518 non-null  int32         
 3   minute       37518 non-null  int32         
 4   day          37518 non-null  int32         
 5   is_weekend   37518 non-null  int64         
 6   dayofweek_0  37518 non-null  bool          
 7   dayofweek_1  37518 non-null  bool          
 8   dayofweek_2  37518 non-null  bool          
 9   dayofweek_3  37518 non-null  bool          
 10  dayofweek_4  37518 non-null  bool          
 11  dayofweek_5  37518 non-null  bool          
 12  dayofweek_6  37518 non-null  bool          
 13  gate_id_-1   37518 non-null  bool          
 14  gate_id_0    37518 non-null  bool          
 15  gate_id_1    37518 non-null  bool          
 16  gate_id_3

In [49]:
# Hold out part of the training data for evaluation. user_id is the target,
# and the raw timestamp is excluded because its parts are already features.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["user_id", "timestamp"]),
    data["user_id"],
    random_state=SEED,
)

# Scale the features, then fit a logistic regression. The saga solver
# shuffles the data, so random_state is fixed to make the fit reproducible.
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver="saga", n_jobs=-2, random_state=SEED),
)
pipe.fit(X_train, y_train)

# Accuracy on the held-out split, followed by per-class precision/recall.
print(pipe.score(X_test, y_test))
print(classification_report(y_test, pipe.predict(X_test)))

0.1187633262260128
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       324
           1       0.14      0.12      0.13       337
           2       0.00      0.00      0.00         7
           3       0.16      0.06      0.09       256
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         5
           6       0.16      0.04      0.06       516
           7       0.00      0.00      0.00        11
           8       0.00      0.00      0.00         5
           9       0.00      0.00      0.00       242
          10       0.00      0.00      0.00         5
          11       0.04      0.00      0.01       309
          12       0.11      0.21      0.14       533
          14       0.10      0.10      0.10       175
          15       0.09      0.36      0.14       421
          17       0.00      0.00      0.00       163
          18       0.37      0.22      0.27       399
        

In [51]:
# Load the rows to predict, indexed by row_id, casting timestamp to
# datetime64 as part of the same expression.
data_predict = (
    pd.read_csv("../data/raw/test.csv", index_col="row_id")
    .astype({"timestamp": "datetime64[ns]"})
)
data_predict.head()

Unnamed: 0_level_0,timestamp,gate_id
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1
37518,2023-01-03 08:21:00,9
37519,2023-01-03 08:21:00,9
37520,2023-01-03 08:21:18,5
37521,2023-01-03 08:21:19,5
37522,2023-01-03 08:21:39,10


In [52]:
# Derive the same calendar features from timestamp as for the training data.
# assign() evaluates its keyword arguments in order, so is_weekend can read
# the dayofweek column created just before it.
data_predict = data_predict.assign(
    hour=data_predict["timestamp"].dt.hour,            # hour of day
    minute=data_predict["timestamp"].dt.minute,        # minute of the hour
    day=data_predict["timestamp"].dt.day,              # day of month
    dayofweek=data_predict["timestamp"].dt.dayofweek,  # Monday=0 ... Sunday=6
    # 1 on weekends (dayofweek 5 or 6), 0 on weekdays. A vectorized cast
    # replaces the element-wise apply(int) and keeps the int64 dtype.
    is_weekend=lambda frame: (frame["dayofweek"] > 4).astype("int64"),
)

# One-hot encode the categorical columns. The indicator columns produced
# depend on the values present in this file.
data_predict = pd.get_dummies(data_predict, columns=["dayofweek", "gate_id"])
data_predict.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7125 entries, 37518 to 44642
Data columns (total 28 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   timestamp    7125 non-null   datetime64[ns]
 1   hour         7125 non-null   int32         
 2   minute       7125 non-null   int32         
 3   day          7125 non-null   int32         
 4   is_weekend   7125 non-null   int64         
 5   dayofweek_0  7125 non-null   bool          
 6   dayofweek_1  7125 non-null   bool          
 7   dayofweek_2  7125 non-null   bool          
 8   dayofweek_3  7125 non-null   bool          
 9   dayofweek_4  7125 non-null   bool          
 10  dayofweek_5  7125 non-null   bool          
 11  dayofweek_6  7125 non-null   bool          
 12  gate_id_-1   7125 non-null   bool          
 13  gate_id_1    7125 non-null   bool          
 14  gate_id_2    7125 non-null   bool          
 15  gate_id_3    7125 non-null   bool          
 16  gate_i

In [58]:
# List the training frame's columns; the test frame is later arranged into
# this layout (minus user_id) before prediction.
print(data.columns)

Index(['user_id', 'timestamp', 'hour', 'minute', 'day', 'is_weekend',
       'dayofweek_0', 'dayofweek_1', 'dayofweek_2', 'dayofweek_3',
       'dayofweek_4', 'dayofweek_5', 'dayofweek_6', 'gate_id_-1', 'gate_id_0',
       'gate_id_1', 'gate_id_3', 'gate_id_4', 'gate_id_5', 'gate_id_6',
       'gate_id_7', 'gate_id_8', 'gate_id_9', 'gate_id_10', 'gate_id_11',
       'gate_id_12', 'gate_id_13', 'gate_id_14', 'gate_id_15', 'gate_id_16'],
      dtype='object')


In [56]:
# gate_id = 0 does not occur in the test data (it could be dropped from the
# training data as well). Add zero-filled gate_id_0 and gate_id_16 columns.
data_predict = data_predict.assign(gate_id_0=0, gate_id_16=0)
data_predict.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7125 entries, 37518 to 44642
Data columns (total 30 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   timestamp    7125 non-null   datetime64[ns]
 1   hour         7125 non-null   int32         
 2   minute       7125 non-null   int32         
 3   day          7125 non-null   int32         
 4   is_weekend   7125 non-null   int64         
 5   dayofweek_0  7125 non-null   bool          
 6   dayofweek_1  7125 non-null   bool          
 7   dayofweek_2  7125 non-null   bool          
 8   dayofweek_3  7125 non-null   bool          
 9   dayofweek_4  7125 non-null   bool          
 10  dayofweek_5  7125 non-null   bool          
 11  dayofweek_6  7125 non-null   bool          
 12  gate_id_-1   7125 non-null   bool          
 13  gate_id_1    7125 non-null   bool          
 14  gate_id_2    7125 non-null   bool          
 15  gate_id_3    7125 non-null   bool          
 16  gate_i

In [61]:
# Arrange the test frame into the training layout (every training column
# except the user_id target, in the same order). The layout is taken from
# `data` rather than a hand-copied list so it cannot drift from the training
# schema. Columns that exist only in the test frame are dropped, and
# indicator columns absent from it are created as all zeros.
data_predict = data_predict.reindex(
    columns=data.columns.drop("user_id"),
    fill_value=0,
)
data_predict.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7125 entries, 37518 to 44642
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   timestamp    7125 non-null   datetime64[ns]
 1   hour         7125 non-null   int32         
 2   minute       7125 non-null   int32         
 3   day          7125 non-null   int32         
 4   is_weekend   7125 non-null   int64         
 5   dayofweek_0  7125 non-null   bool          
 6   dayofweek_1  7125 non-null   bool          
 7   dayofweek_2  7125 non-null   bool          
 8   dayofweek_3  7125 non-null   bool          
 9   dayofweek_4  7125 non-null   bool          
 10  dayofweek_5  7125 non-null   bool          
 11  dayofweek_6  7125 non-null   bool          
 12  gate_id_-1   7125 non-null   bool          
 13  gate_id_0    7125 non-null   int64         
 14  gate_id_1    7125 non-null   bool          
 15  gate_id_3    7125 non-null   bool          
 16  gate_i

In [73]:
# Predict a user for every test row. timestamp is not a model feature, and a
# target column left over from an earlier run is dropped as well, so the
# cell can be re-run without feeding old predictions back in as a feature.
data_predict["target"] = pipe.predict(
    data_predict.drop(columns=["timestamp", "target"], errors="ignore")
)

In [74]:
# Inspect the frame after the target column has been written.
data_predict.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7125 entries, 37518 to 44642
Data columns (total 30 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   timestamp    7125 non-null   datetime64[ns]
 1   hour         7125 non-null   int32         
 2   minute       7125 non-null   int32         
 3   day          7125 non-null   int32         
 4   is_weekend   7125 non-null   int64         
 5   dayofweek_0  7125 non-null   bool          
 6   dayofweek_1  7125 non-null   bool          
 7   dayofweek_2  7125 non-null   bool          
 8   dayofweek_3  7125 non-null   bool          
 9   dayofweek_4  7125 non-null   bool          
 10  dayofweek_5  7125 non-null   bool          
 11  dayofweek_6  7125 non-null   bool          
 12  gate_id_-1   7125 non-null   bool          
 13  gate_id_0    7125 non-null   int64         
 14  gate_id_1    7125 non-null   bool          
 15  gate_id_3    7125 non-null   bool          
 16  gate_i

In [79]:
# Write the predictions as the submission file: the row_id index plus the
# target column, saved with a .csv extension.
data_out = data_predict["target"]
data_out.to_csv("../data/raw/sample_submission_1.csv")