## Описание данных

In [1]:
import pandas as pd
import numpy as np
import os
import warnings

warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.metrics import classification_report

gate_id - id пункта контроля(шлагбаум, турникет, дверь на этаже)

timestamp - дата и время прохода через ПК

user_id - id пользователя 

In [2]:
# Прочитаем данные, переведем timestamp в формат datetime64, изначально он считывается как object
data = pd.read_csv("../data/raw/train.csv", index_col="row_id")
data["timestamp"] = data["timestamp"].astype("datetime64[ns]")
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37518 entries, 0 to 37517
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   user_id    37518 non-null  int64         
 1   timestamp  37518 non-null  datetime64[ns]
 2   gate_id    37518 non-null  int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 1.1 MB


In [3]:
data.head(5)

Unnamed: 0_level_0,user_id,timestamp,gate_id
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,18,2022-07-29 09:08:54,7
1,18,2022-07-29 09:09:54,9
2,18,2022-07-29 09:09:54,9
3,18,2022-07-29 09:10:06,5
4,18,2022-07-29 09:10:08,5


## Вопрос №1

Постройте логистическую регрессию, оптимизатор liblinear, остальные параметры дефолтные, на 6 признаках: ['gate_id', 'hour', 'min', 'day', 'month','dayofweek']. Признаки делаем из "ts", в частности "min" - это минуты из времени. Используем масштабирование StandardScaler.

Каково будет качество на обучающей выборке и на валидации (округлите до целых и выберите ближайший ответ)

In [4]:
# создадим фичи для логистической регрессии на основе timestamp
# часы
data["hour"] = data["timestamp"].dt.hour
# минуты
data["minute"] = data["timestamp"].dt.minute
# число
data["day"] = data["timestamp"].dt.day
# месяц
data["month"] = data["timestamp"].dt.month
# день недели
data["dayofweek"] = data["timestamp"].dt.dayofweek
# если выходной то 1, будни 0
# data["is_weekend"] = data['dayofweek'] > 4
# data["is_weekend"] = data['is_weekend'].apply(int)
data.head(5)

Unnamed: 0_level_0,user_id,timestamp,gate_id,hour,minute,day,month,dayofweek
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,18,2022-07-29 09:08:54,7,9,8,29,7,4
1,18,2022-07-29 09:09:54,9,9,9,29,7,4
2,18,2022-07-29 09:09:54,9,9,9,29,7,4
3,18,2022-07-29 09:10:06,5,9,10,29,7,4
4,18,2022-07-29 09:10:08,5,9,10,29,7,4


In [5]:
# создадим функию чтобы разбивать датафрейм на X_train, X_test, y_train, y_test
def data_stlit_to_train_test(df: pd.DataFrame):
    data_train = df[df["timestamp"] < "2022-11-01 00:00:00"]
    data_test = df[df["timestamp"] >= "2022-11-01 00:00:00"]
    X_train, y_train = (
        data_train.drop(columns=["user_id", "timestamp"]),
        data_train["user_id"],
    )
    X_test, y_test = (
        data_test.drop(columns=["user_id", "timestamp"]),
        data_test["user_id"],
    )
    return X_train, X_test, y_train, y_test

In [8]:
# разбиваем данные с помощью объявленной функции
X_train, X_test, y_train, y_test = data_stlit_to_train_test(df=data)

In [9]:
pipe = make_pipeline(StandardScaler(), LogisticRegression(solver="liblinear"))
pipe.fit(X_train, y_train)

print("На трайне", pipe.score(X_train, y_train) * 100)
print("На вадидации", pipe.score(X_test, y_test) * 100)

На трайне 9.66684057538943
На вадидации 4.782852864095844


## Вопрос №2 и 3

Постройте логистическую регрессию на дефолтных параметрах, но random_state=1, solver=saga, и 13 признаках:
['gate_id', 'hour', 'min', 'day', 'month', 'dayofweek','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'].  Используем масштабирование StandardScaler.

In [10]:
data_2 = data
data_2["Monday"] = data["dayofweek"] == 0
data_2["Monday"] = data["Monday"].apply(int)
data_2["Tuesday"] = data["dayofweek"] == 1
data_2["Tuesday"] = data["Tuesday"].apply(int)
data_2["Wednesday"] = data["dayofweek"] == 2
data_2["Wednesday"] = data["Wednesday"].apply(int)
data_2["Thursday"] = data["dayofweek"] == 3
data_2["Thursday"] = data["Thursday"].apply(int)
data_2["Friday"] = data["dayofweek"] == 4
data_2["Friday"] = data["Friday"].apply(int)
data_2["Saturday"] = data["dayofweek"] == 5
data_2["Saturday"] = data["Saturday"].apply(int)
data_2["Sunday"] = data["dayofweek"] == 6
data_2["Sunday"] = data["Sunday"].apply(int)
data_2.head(3)

Unnamed: 0_level_0,user_id,timestamp,gate_id,hour,minute,day,month,dayofweek,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,18,2022-07-29 09:08:54,7,9,8,29,7,4,0,0,0,0,1,0,0
1,18,2022-07-29 09:09:54,9,9,9,29,7,4,0,0,0,0,1,0,0
2,18,2022-07-29 09:09:54,9,9,9,29,7,4,0,0,0,0,1,0,0


In [11]:
# data_2 = data_2.drop(columns=["dayofweek"])
X_train, X_test, y_train, y_test = data_stlit_to_train_test(df=data_2)
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        # penalty="l1",
        random_state=1,
        solver="saga",
        n_jobs=-2
        # , multi_class="multinomial"
        # ,  l1_ratio=0.5
    ),
)
pipe.fit(X_train, y_train)
print("На трайне", pipe.score(X_train, y_train) * 100)
print("На вадидации", pipe.score(X_test, y_test) * 100)
# На трайне 10.523962137586643
# На вадидации 4.6892549606888805

На трайне 10.523962137586643
На вадидации 4.6892549606888805


In [12]:
X_train.head()

Unnamed: 0_level_0,gate_id,hour,minute,day,month,dayofweek,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,7,9,8,29,7,4,0,0,0,0,1,0,0
1,9,9,9,29,7,4,0,0,0,0,1,0,0
2,9,9,9,29,7,4,0,0,0,0,1,0,0
3,5,9,10,29,7,4,0,0,0,0,1,0,0
4,5,9,10,29,7,4,0,0,0,0,1,0,0


## Вопрос №4

Постройте логистическую регрессию на дефолтных параметрах и 33 признаках:
['gate_-1', 'gate_0', 'gate_1',
       'gate_3', 'gate_4', 'gate_5', 'gate_6', 'gate_7', 'gate_8', 'gate_9',
       'gate_10', 'gate_11', 'gate_12', 'gate_13', 'gate_14', 'gate_15',
       'gate_16', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
       'Saturday', 'Sunday', 'month_7', 'month_8',
       'month_9', 'month_10', 'month_11', 'month_12', 'hour', 'min', 'day'].  Используем масштабирование StandardScaler.

In [13]:
data_4 = pd.get_dummies(data_2, columns=["gate_id", "month"])
data_4.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37518 entries, 0 to 37517
Data columns (total 36 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   user_id     37518 non-null  int64         
 1   timestamp   37518 non-null  datetime64[ns]
 2   hour        37518 non-null  int32         
 3   minute      37518 non-null  int32         
 4   day         37518 non-null  int32         
 5   dayofweek   37518 non-null  int32         
 6   Monday      37518 non-null  int64         
 7   Tuesday     37518 non-null  int64         
 8   Wednesday   37518 non-null  int64         
 9   Thursday    37518 non-null  int64         
 10  Friday      37518 non-null  int64         
 11  Saturday    37518 non-null  int64         
 12  Sunday      37518 non-null  int64         
 13  gate_id_-1  37518 non-null  bool          
 14  gate_id_0   37518 non-null  bool          
 15  gate_id_1   37518 non-null  bool          
 16  gate_id_3   37518 non-null 

In [22]:
X_train, X_test, y_train, y_test = data_stlit_to_train_test(df=data_4)
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(n_jobs=-2, solver="saga", random_state=1),
)
pipe.fit(X_train, y_train)
Pipeline(
    steps=[
        ("standardscaler", StandardScaler()),
        ("logisticregression", LogisticRegression()),
    ]
)
print("На трайне", pipe.score(X_train, y_train) * 100)
print("На вадидации", pipe.score(X_test, y_test) * 100)
# На трайне 14.45554147723038
# На вадидации 7.375514788468738

На трайне 14.45554147723038
На вадидации 7.375514788468738


In [23]:
# В валидационной выборке больше 700 истинных предсказаний?
print(classification_report(y_test, pipe.predict(X_test)))

              precision    recall  f1-score   support

           0       0.06      0.00      0.01       456
           1       0.27      0.06      0.10       557
           2       0.00      0.00      0.00        39
           3       0.31      0.06      0.09       286
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         6
           6       0.00      0.00      0.00       571
           8       0.00      0.00      0.00        14
           9       0.00      0.00      0.00       238
          10       0.00      0.00      0.00        17
          11       0.00      0.00      0.00       363
          12       0.06      0.23      0.10       519
          14       0.00      0.00      0.00       316
          15       0.06      0.25      0.10       459
          17       0.00      0.00      0.00       482
          18       0.00      0.05      0.01        37
          19       0.08      0.06      0.07       375
          21       0.00    

In [20]:
# user1 чаще всего приходит в четверг?
print(pipe.named_steps["logisticregression"].coef_[0][5:12])

[ 0.00436534 -0.02867387  0.15878652 -0.07313598 -0.25553347 -0.14738372
 -0.06626029]


## Вопрос №6

 Постройте логистическую регрессию на дефолтных параметрах и 50 признаках:
    ['gate_-1', 'gate_0', 'gate_1', 'gate_3', 'gate_4', 'gate_5', 'gate_6', 'gate_7', 'gate_8', 'gate_9', 'gate_10', 'gate_11', 'gate_12', 'gate_13', 'gate_14', 'gate_15', 'gate_16', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'min', 'day'].  Используем масштабирование StandardScaler.

In [51]:
data_6 = pd.get_dummies(data_4, columns=["hour"])
data_6.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37518 entries, 0 to 37517
Data columns (total 53 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   user_id     37518 non-null  int64         
 1   timestamp   37518 non-null  datetime64[ns]
 2   minute      37518 non-null  int32         
 3   day         37518 non-null  int32         
 4   dayofweek   37518 non-null  int32         
 5   Monday      37518 non-null  int64         
 6   Tuesday     37518 non-null  int64         
 7   Wednesday   37518 non-null  int64         
 8   Thursday    37518 non-null  int64         
 9   Friday      37518 non-null  int64         
 10  Saturday    37518 non-null  int64         
 11  Sunday      37518 non-null  int64         
 12  gate_id_-1  37518 non-null  bool          
 13  gate_id_0   37518 non-null  bool          
 14  gate_id_1   37518 non-null  bool          
 15  gate_id_3   37518 non-null  bool          
 16  gate_id_4   37518 non-null 

In [27]:
X_train, X_test, y_train, y_test = data_stlit_to_train_test(df=data_6)
lst = ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"]

for i in lst:
    pipe = make_pipeline(
        StandardScaler(),
        LogisticRegression(n_jobs=-2, solver=i, max_iter=100, random_state=1),
    )
    pipe.fit(X_train, y_train)  # apply scaling on training data
    Pipeline(
        steps=[
            ("standardscaler", StandardScaler()),
            ("logisticregression", LogisticRegression()),
        ]
    )
    print(i, pipe.score(X_test, y_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


lbfgs 0.09256832646948708
liblinear 0.09481467615125422
newton-cg 0.09247472856608012
newton-cholesky 0.0943466866342194
sag 0.09518906776488206
saga 0.09556345937850992


In [34]:
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        n_jobs=-2,
        solver="saga",
        max_iter=80,
        random_state=1
        # , C=0.6
        # , class_weight='balanced'
    ),
)
pipe.fit(X_train, y_train)  # apply scaling on training data
Pipeline(
    steps=[
        ("standardscaler", StandardScaler()),
        ("logisticregression", LogisticRegression()),
    ]
)
print("На обучающей", pipe.score(X_train, y_train) * 100)
print("На вадидации", pipe.score(X_test, y_test) * 100)
# На обучающей 17.40702094357904
# На вадидации 9.556345937850992

На обучающей 17.421927405530298
На вадидации 9.603144889554475


## Вопрос №9

Улучшится ли качество модели на валидационной выборке, если в качестве признаков использовать предыдущее наблюдение?
Оцените для регрессии из Q6 и оптимизатора saga (random_state = 1)?

In [44]:
data_9 = data_6
data_9.head(3)

Unnamed: 0_level_0,user_id,timestamp,minute,day,dayofweek,Monday,Tuesday,Wednesday,Thursday,Friday,...,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23,user_id_lag_1
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,18,2022-07-29 09:08:54,8,29,4,0,0,0,0,1,...,False,False,False,False,False,False,False,False,False,
1,18,2022-07-29 09:09:54,9,29,4,0,0,0,0,1,...,False,False,False,False,False,False,False,False,False,18.0
2,18,2022-07-29 09:09:54,9,29,4,0,0,0,0,1,...,False,False,False,False,False,False,False,False,False,18.0


In [45]:
data_9["user_id_lag_1"] = data_9["user_id"].shift(1)
data_9 = data_9.dropna(subset=["user_id_lag_1"])
data_9.head(2)

Unnamed: 0_level_0,user_id,timestamp,minute,day,dayofweek,Monday,Tuesday,Wednesday,Thursday,Friday,...,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23,user_id_lag_1
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,18,2022-07-29 09:09:54,9,29,4,0,0,0,0,1,...,False,False,False,False,False,False,False,False,False,18.0
2,18,2022-07-29 09:09:54,9,29,4,0,0,0,0,1,...,False,False,False,False,False,False,False,False,False,18.0


In [46]:
X_train, X_test, y_train, y_test = data_stlit_to_train_test(df=data_9)
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(n_jobs=-2, solver="saga", random_state=1),
)
pipe.fit(X_train, y_train)  # apply scaling on training data
Pipeline(
    steps=[
        ("standardscaler", StandardScaler()),
        ("logisticregression", LogisticRegression()),
    ]
)
print("На обучающей", pipe.score(X_train, y_train) * 100)
print("На вадидации", pipe.score(X_test, y_test) * 100)

На обучающей 22.472328848805574
На вадидации 14.610632721827029


## Вопрос №10

Проверьте, помогает ли качеству регрессии из Q6 и оптимизатора saga больше данных,
оцените разницу между 60%, 80% и 100% от train

In [52]:
y_train.count()
# 60% = 16129

26834

In [53]:
X_train, X_test, y_train, y_test = data_stlit_to_train_test(df=data_6)
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(n_jobs=-2, solver="saga", random_state=1),
)
pipe.fit(X_train, y_train)  # apply scaling on training data
Pipeline(
    steps=[
        ("standardscaler", StandardScaler()),
        ("logisticregression", LogisticRegression()),
    ]
)
print("На обучающей", pipe.score(X_train, y_train) * 100)
print("На вадидации", pipe.score(X_test, y_test) * 100)

На обучающей 17.40702094357904
На вадидации 9.556345937850992


In [54]:
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(n_jobs=-2, solver="saga", random_state=1),
)
pipe.fit(
    X_train.iloc[:16129], y_train.iloc[:16129]
)  # apply scaling on training data
Pipeline(
    steps=[
        ("standardscaler", StandardScaler()),
        ("logisticregression", LogisticRegression()),
    ]
)
print("На обучающей", pipe.score(X_train, y_train) * 100)
print("На вадидации", pipe.score(X_test, y_test) * 100)

На обучающей 15.834389207721546
На вадидации 8.545488581055784
