## Описание данных

In [29]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.metrics import classification_report

gate_id - id пункта контроля(шлагбаум, турникет, дверь на этаже)

timestamp - дата и время прохода через ПК

user_id - id пользователя 

In [30]:
# Прочитаем данные, переведем timestamp в формат datetime64, изначально он считывается как object
data = pd.read_csv("../data/raw/train.csv", index_col='row_id')
data["timestamp"] = data["timestamp"].astype("datetime64[ns]")
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37518 entries, 0 to 37517
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   user_id    37518 non-null  int64         
 1   timestamp  37518 non-null  datetime64[ns]
 2   gate_id    37518 non-null  int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 1.1 MB


Мы имеем 37518 записей, но в первых же строках видно, что имеются дубликаты

In [3]:
data.head(10)

Unnamed: 0_level_0,user_id,timestamp,gate_id
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,18,2022-07-29 09:08:54,7
1,18,2022-07-29 09:09:54,9
2,18,2022-07-29 09:09:54,9
3,18,2022-07-29 09:10:06,5
4,18,2022-07-29 09:10:08,5
5,18,2022-07-29 09:10:34,10
6,18,2022-07-29 09:32:47,11
7,18,2022-07-29 09:33:12,4
8,18,2022-07-29 09:33:13,4
9,1,2022-07-29 09:33:16,7


In [4]:
# Удалим дубликаты используя user_id, timestamp, gate_id
# data = data.drop_duplicates(subset=['user_id', 'timestamp', 'gate_id'])
# data.count()
# Итого у нас осталось 35172, более 2000 дубликатов 

In [5]:
# Посмотрим на один день пользователя
data[(data["user_id"] == 12) & (data["timestamp"] <= "2022-07-30")].tail(10)

Unnamed: 0_level_0,user_id,timestamp,gate_id
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
180,12,2022-07-29 12:46:45,13
187,12,2022-07-29 12:51:22,13
188,12,2022-07-29 12:51:23,13
261,12,2022-07-29 14:09:43,4
263,12,2022-07-29 14:09:45,4
302,12,2022-07-29 15:00:06,5
304,12,2022-07-29 15:00:08,5
378,12,2022-07-29 16:49:35,11
379,12,2022-07-29 16:50:01,4
380,12,2022-07-29 16:50:03,4


Возможно дубликаты связаны с тем, что пользователи по несколько раз используют пропуск на одном ПК

18	2022-07-29 09:33:41	5

18	2022-07-29 09:33:42	5

18	2022-07-29 14:37:34	5

18	2022-07-29 14:37:35	5

## Вопрос №1

In [32]:
# создадим фичи для логистической регрессии на основе timestamp
# часы
data['hour'] = data['timestamp'].dt.hour
# минуты
data['minute'] = data['timestamp'].dt.minute
# число
data['day'] = data['timestamp'].dt.day
# месяц
data['month'] = data['timestamp'].dt.month
# день недели
data['dayofweek'] = data['timestamp'].dt.dayofweek
# если выходной то 1, будни 0
data["is_weekend"] = data['dayofweek'] > 4
data["is_weekend"] = data['is_weekend'].apply(int)
data.head(5)

Unnamed: 0_level_0,user_id,timestamp,gate_id,hour,minute,day,month,dayofweek,is_weekend
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,18,2022-07-29 09:08:54,7,9,8,29,7,4,0
1,18,2022-07-29 09:09:54,9,9,9,29,7,4,0
2,18,2022-07-29 09:09:54,9,9,9,29,7,4,0
3,18,2022-07-29 09:10:06,5,9,10,29,7,4,0
4,18,2022-07-29 09:10:08,5,9,10,29,7,4,0


In [7]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=["user_id", "timestamp"]), data["user_id"], random_state=42)
pipe = make_pipeline(StandardScaler(), LogisticRegression(solver='liblinear'))
pipe.fit(X_train, y_train)  # apply scaling on training data
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])

pipe.score(X_test, y_test)  # apply scaling on testing data, without leaking training data.

0.09616204690831556

## Вопрос №2 и 3

In [33]:
data["Monday"] = data['dayofweek'] == 0
data["Monday"] = data['Monday'].apply(int)
data["Tuesday"] = data['dayofweek'] == 1
data["Tuesday"] = data['Tuesday'].apply(int)
data["Wednesday"] = data['dayofweek'] == 2
data["Wednesday"] = data['Wednesday'].apply(int)
data["Thursday"] = data['dayofweek'] == 3
data["Thursday"] = data['Thursday'].apply(int)
data["Friday"] = data['dayofweek'] == 4
data["Friday"] = data['Friday'].apply(int)
data["Saturday"] = data['dayofweek'] == 5
data["Saturday"] = data['Saturday'].apply(int)
data["Sunday"] = data['dayofweek'] == 6
data["Sunday"] = data['Sunday'].apply(int)
data.head(5)

Unnamed: 0_level_0,user_id,timestamp,gate_id,hour,minute,day,month,dayofweek,is_weekend,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,18,2022-07-29 09:08:54,7,9,8,29,7,4,0,0,0,0,0,1,0,0
1,18,2022-07-29 09:09:54,9,9,9,29,7,4,0,0,0,0,0,1,0,0
2,18,2022-07-29 09:09:54,9,9,9,29,7,4,0,0,0,0,0,1,0,0
3,18,2022-07-29 09:10:06,5,9,10,29,7,4,0,0,0,0,0,1,0,0
4,18,2022-07-29 09:10:08,5,9,10,29,7,4,0,0,0,0,0,1,0,0


In [9]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=["user_id", "timestamp"]), data["user_id"], random_state=1)
pipe = make_pipeline(StandardScaler(), LogisticRegression(
    # penalty="elasticnet", 
    random_state=1, solver='saga', n_jobs=-2
    , multi_class="ovr"
    # ,  l1_ratio=0.5
))
pipe.fit(X_train, y_train)  # apply scaling on training data
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])
pipe.score(X_test, y_test)

0.09573560767590618

In [10]:
# 0.088272921108742

## Вопрос №4

In [11]:
pd.get_dummies(data, columns=["gate_id", "month"]).info()

<class 'pandas.core.frame.DataFrame'>
Index: 37518 entries, 0 to 37517
Data columns (total 37 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   user_id     37518 non-null  int64         
 1   timestamp   37518 non-null  datetime64[ns]
 2   hour        37518 non-null  int32         
 3   minute      37518 non-null  int32         
 4   day         37518 non-null  int32         
 5   dayofweek   37518 non-null  int32         
 6   is_weekend  37518 non-null  int64         
 7   Monday      37518 non-null  int64         
 8   Tuesday     37518 non-null  int64         
 9   Wednesday   37518 non-null  int64         
 10  Thursday    37518 non-null  int64         
 11  Friday      37518 non-null  int64         
 12  Saturday    37518 non-null  int64         
 13  Sunday      37518 non-null  int64         
 14  gate_id_-1  37518 non-null  bool          
 15  gate_id_0   37518 non-null  bool          
 16  gate_id_1   37518 non-null 

In [24]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=["user_id", "timestamp"]), data["user_id"], random_state=1)
pipe = make_pipeline(StandardScaler(), LogisticRegression(n_jobs=-2
                                                         ,solver='lbfgs'))
pipe.fit(X_train, y_train)  # apply scaling on training data
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])
pipe.score(X_test, y_test)
# 0.09541577825159915

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.09584221748400854

In [26]:
print(classification_report(y_test, pipe.predict(X_test)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       322
           1       0.17      0.12      0.14       334
           2       0.00      0.00      0.00        10
           3       0.00      0.00      0.00       253
           5       0.00      0.00      0.00         5
           6       0.19      0.04      0.07       504
           7       0.00      0.00      0.00        13
           8       0.00      0.00      0.00         6
           9       0.00      0.00      0.00       270
          10       0.00      0.00      0.00         3
          11       0.00      0.00      0.00       316
          12       0.10      0.04      0.06       509
          14       0.02      0.04      0.03       163
          15       0.06      0.12      0.08       416
          17       0.09      0.18      0.12       151
          18       0.12      0.39      0.19       407
          19       0.07      0.04      0.05       444
          20       0.00    

## Вопрос №5

In [27]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=["user_id", "timestamp"]), data["user_id"], random_state=1)
pipe = make_pipeline(StandardScaler(), LogisticRegression(n_jobs=-2
                                                         , solver='saga'))
pipe.fit(X_train, y_train)  # apply scaling on training data
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])
pipe.score(X_test, y_test)
print(classification_report(y_test, pipe.predict(X_test)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       322
           1       0.17      0.11      0.13       334
           2       0.00      0.00      0.00        10
           3       0.00      0.00      0.00       253
           5       0.00      0.00      0.00         5
           6       0.19      0.04      0.07       504
           7       0.00      0.00      0.00        13
           8       0.00      0.00      0.00         6
           9       0.00      0.00      0.00       270
          10       0.00      0.00      0.00         3
          11       0.00      0.00      0.00       316
          12       0.10      0.04      0.06       509
          14       0.02      0.04      0.03       163
          15       0.06      0.12      0.08       416
          17       0.09      0.18      0.12       151
          18       0.12      0.39      0.19       407
          19       0.07      0.04      0.05       444
          20       0.00    

In [34]:
pd.get_dummies(data, columns=["gate_id", "month", "hour"]).info()

<class 'pandas.core.frame.DataFrame'>
Index: 37518 entries, 0 to 37517
Data columns (total 54 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   user_id     37518 non-null  int64         
 1   timestamp   37518 non-null  datetime64[ns]
 2   minute      37518 non-null  int32         
 3   day         37518 non-null  int32         
 4   dayofweek   37518 non-null  int32         
 5   is_weekend  37518 non-null  int64         
 6   Monday      37518 non-null  int64         
 7   Tuesday     37518 non-null  int64         
 8   Wednesday   37518 non-null  int64         
 9   Thursday    37518 non-null  int64         
 10  Friday      37518 non-null  int64         
 11  Saturday    37518 non-null  int64         
 12  Sunday      37518 non-null  int64         
 13  gate_id_-1  37518 non-null  bool          
 14  gate_id_0   37518 non-null  bool          
 15  gate_id_1   37518 non-null  bool          
 16  gate_id_3   37518 non-null 

In [44]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=["user_id", "timestamp"]), data["user_id"], random_state=1)
lst = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']

pipe = make_pipeline(StandardScaler(), LogisticRegression(n_jobs=-2
                                                         , solver="saga", max_iter=80, random_state=1, C=0.6))
pipe.fit(X_train, y_train)  # apply scaling on training data
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])
print(pipe.score(X_test, y_test))
# 0.09584221748400854

0.09584221748400854
