# Проект: Идентификация пользователя по последовательности посещенных сайтов 

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn; seaborn.set()

In [3]:
PATH_TO_DATA = 'capstone_user_identification'
data_train= pd.read_csv(os.path.join(PATH_TO_DATA, 'train_sessions.csv'), index_col='session_id')
data_test = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_sessions.csv'),  index_col='session_id')

In [4]:
print (data_train.shape)
print (data_test.shape)

(253561, 21)
(82797, 20)


In [5]:
data_train.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,2014-02-20 10:02:45,,,,,,,,,...,,,,,,,,,,0
2,890,2014-02-22 11:19:50,941.0,2014-02-22 11:19:50,3847.0,2014-02-22 11:19:51,941.0,2014-02-22 11:19:51,942.0,2014-02-22 11:19:51,...,2014-02-22 11:19:51,3847.0,2014-02-22 11:19:52,3846.0,2014-02-22 11:19:52,1516.0,2014-02-22 11:20:15,1518.0,2014-02-22 11:20:16,0
3,14769,2013-12-16 16:40:17,39.0,2013-12-16 16:40:18,14768.0,2013-12-16 16:40:19,14769.0,2013-12-16 16:40:19,37.0,2013-12-16 16:40:19,...,2013-12-16 16:40:19,14768.0,2013-12-16 16:40:20,14768.0,2013-12-16 16:40:21,14768.0,2013-12-16 16:40:22,14768.0,2013-12-16 16:40:24,0
4,782,2014-03-28 10:52:12,782.0,2014-03-28 10:52:42,782.0,2014-03-28 10:53:12,782.0,2014-03-28 10:53:42,782.0,2014-03-28 10:54:12,...,2014-03-28 10:54:42,782.0,2014-03-28 10:55:12,782.0,2014-03-28 10:55:42,782.0,2014-03-28 10:56:12,782.0,2014-03-28 10:56:42,0
5,22,2014-02-28 10:53:05,177.0,2014-02-28 10:55:22,175.0,2014-02-28 10:55:22,178.0,2014-02-28 10:55:23,177.0,2014-02-28 10:55:23,...,2014-02-28 10:55:59,175.0,2014-02-28 10:55:59,177.0,2014-02-28 10:55:59,177.0,2014-02-28 10:57:06,178.0,2014-02-28 10:57:11,0


In [6]:
site_id = ['site1', 'site2', 'site3', 'site4', 'site5', 'site6', 'site7', 'site8', 'site9', 'site10']
time_site = list(set(data_train.columns.values.tolist()) - set(site_id) - set(['target']))
print(time_site)

['time8', 'time9', 'time4', 'time5', 'time6', 'time7', 'time1', 'time2', 'time3', 'time10']


In [7]:
for time in time_site:
    data_test[time] = pd.to_datetime(data_test[time])
    data_train[time] = pd.to_datetime(data_train[time])

#### Создадим новый признак, показывающий колличество открытых вкладок в каждой сессии.

In [8]:
data_train['open_v']=10-np.sum(data_train[site_id].isnull(),axis=1)
data_test['open_v']=10-np.sum(data_test[site_id].isnull(),axis=1)

#### Отсортируем данные по времени начала сессии (т.е время посещения первого сайта time1). 

In [9]:
data_train = data_train.sort_values(by='time1')

In [10]:
data_train[site_id].fillna(0).to_csv('train_sessions_text.txt', sep=' ', index=None, header=None)
data_test[site_id].fillna(0).to_csv('test_sessions_text.txt', sep=' ', index=None, header=None)

#### Преобразуем данные к виду: [session_id] [site_id1....site_idn...site_id1_2....site_idn_k]  т.е. подсчитаем для каждой сессии сколько раз был посещен тот или иной сайт и сколько раз группы сайтов посещались вместе (n-gramma)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=50000)
with open('train_sessions_text.txt') as inp_train_file:
    X_train_transform = vectorizer.fit_transform(inp_train_file)
with open('test_sessions_text.txt') as inp_train_file:
    X_test_transform = vectorizer.transform(inp_train_file)    

In [12]:
print (X_train_transform.shape)
print (X_test_transform.shape)

(253561, 50000)
(82797, 50000)


#### Cоздадим следующие признаки:  Когда началась сессия: а) выходной или будний день, б) утром,днем,вечером или ночью

In [15]:
from scipy.sparse import hstack
def add_time_features(df, X_sparse):
    hour = df['time1'].apply(lambda ts: ts.hour)
    day_of_week = df['time1'].apply(lambda date: date.dayofweek)
    weekend = (day_of_week>4).astype('int')
    weekday = (day_of_week<5).astype('int')
    morning = ((hour >= 7) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    X = hstack([X_sparse, morning.values.reshape(-1, 1), 
                day.values.reshape(-1, 1), evening.values.reshape(-1, 1), 
                night.values.reshape(-1, 1), weekend.values.reshape(-1, 1), weekday.values.reshape(-1, 1)])
    return X

In [16]:
X_train = add_time_features(data_train.fillna(0), X_train_transform)
X_test = add_time_features(data_test.fillna(0), X_test_transform)

In [17]:
y = data_train['target']

#### В качетсве алгоритма (т.к. очень много признаков и матрицы разряженные) выберем линейную регрессию. Проверим алгоритм на TimeSeries кросс-валидации, т.к. сессии между собой также связаны по времени.

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit #для данных с временной привязкой
from sklearn.model_selection import cross_val_score
model = LogisticRegression(C=1, random_state=17, solver='liblinear')

cv = TimeSeriesSplit(n_splits=10)
cv_scores = cross_val_score(model, X_train, y.astype('int'), cv=cv, scoring='roc_auc', n_jobs=1)
cv_scores.mean()

0.9107144155016706

#### Получили хорошу оценку на кросс-валидации, поэтому обучим модель и сделаем первые предсказания на тестовом наборе данных

In [19]:
model.fit(X_train,y)
y_prob = model.predict_proba(X_test)

In [20]:
y_prob[:,1]

array([1.19759983e-05, 3.83841225e-08, 7.41228044e-08, ...,
       1.24752215e-04, 5.86730149e-06, 1.30442299e-07])

In [21]:
def write_to_submission_file(predicted_labels, out_file, target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels, index = np.arange(1, predicted_labels.shape[0] + 1), columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [22]:
write_to_submission_file(y_prob[:,1], 'out_file_1.csv', target='target', index_label="session_id")

In [24]:
out = pd.read_csv('out_file_1.csv')
out.head()

Unnamed: 0,session_id,target
0,1,1.1976e-05
1,2,3.838412e-08
2,3,7.41228e-08
3,4,7.823858e-09
4,5,1.281554e-05


#### Попробуем улучшить качество модели путем подбора параметров регуляризации с помощью grid_search

In [25]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.01, 0.1, 1, 5], 'penalty': ['l1','l2']}
model = LogisticRegression(random_state=17, solver='liblinear')
grid_cv = GridSearchCV(model, param_grid, scoring='roc_auc', n_jobs=1, cv=cv, verbose=True)
grid_cv.fit(X_train,y)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed: 13.0min finished


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=True)

In [26]:
print (grid_cv.best_estimator_) # лучший алгоритм
print (grid_cv.best_params_) # лучший набор параметров

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)
{'penalty': 'l2', 'C': 0.1}


In [27]:
y_prob_gs = grid_cv.predict_proba(X_test)

In [28]:
write_to_submission_file(y_prob_gs[:,1], 'out_file_2.csv', target='target', index_label="session_id")

### Результирующая оценка  метрики aucroc согласно сайту KAGGLE составила: 0.94657 (что является хорошим показателем)