### Задача идентификации взломщика по его поведению в сети Интернет

Ссылка: [Catch Me If You Can](https://www.kaggle.com/c/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2)

У нас есть данные по посещениям пользователями каких-то сайтов и времени посещения.

Необходимо определить сессии в тесте, которые осуществил определенный юзер. Его класс в трейне 1, все остальные юзеры 0.

В ноутбуке приведен алгоритм создания спарс матрицы из сайтов, которые посещали люди из выборки. В каждой строке будет от 1 до 10 непустых элементов.

По времени никаких фич не построено, это для самостоятельной работы.

In [147]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score as auc
from sklearn.linear_model import LogisticRegression
from scipy.sparse import lil_matrix
from sklearn.model_selection import KFold
import time


%matplotlib inline
pd.set_option('display.max_columns', None)

In [148]:
data = pd.read_csv('./data/Alice/train_sessions.csv')
test = pd.read_csv('./data/Alice/test_sessions.csv')

In [149]:
#запоминаем индекс для последующего разделения трейна и теста
idx_split = data.shape[0]
#объединяем трейн и тест. Создание спарс матрицы в таком, как у нас, виде особо не ликует.
data = data.append(test, sort=False).reset_index(drop=True)

In [150]:
data.head()

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,site6,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
0,1,718,2014-02-20 10:02:45,,,,,,,,,,,,,,,,,,,0.0
1,2,890,2014-02-22 11:19:50,941.0,2014-02-22 11:19:50,3847.0,2014-02-22 11:19:51,941.0,2014-02-22 11:19:51,942.0,2014-02-22 11:19:51,3846.0,2014-02-22 11:19:51,3847.0,2014-02-22 11:19:52,3846.0,2014-02-22 11:19:52,1516.0,2014-02-22 11:20:15,1518.0,2014-02-22 11:20:16,0.0
2,3,14769,2013-12-16 16:40:17,39.0,2013-12-16 16:40:18,14768.0,2013-12-16 16:40:19,14769.0,2013-12-16 16:40:19,37.0,2013-12-16 16:40:19,39.0,2013-12-16 16:40:19,14768.0,2013-12-16 16:40:20,14768.0,2013-12-16 16:40:21,14768.0,2013-12-16 16:40:22,14768.0,2013-12-16 16:40:24,0.0
3,4,782,2014-03-28 10:52:12,782.0,2014-03-28 10:52:42,782.0,2014-03-28 10:53:12,782.0,2014-03-28 10:53:42,782.0,2014-03-28 10:54:12,782.0,2014-03-28 10:54:42,782.0,2014-03-28 10:55:12,782.0,2014-03-28 10:55:42,782.0,2014-03-28 10:56:12,782.0,2014-03-28 10:56:42,0.0
4,5,22,2014-02-28 10:53:05,177.0,2014-02-28 10:55:22,175.0,2014-02-28 10:55:22,178.0,2014-02-28 10:55:23,177.0,2014-02-28 10:55:23,178.0,2014-02-28 10:55:59,175.0,2014-02-28 10:55:59,177.0,2014-02-28 10:55:59,177.0,2014-02-28 10:57:06,178.0,2014-02-28 10:57:11,0.0


In [151]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336358 entries, 0 to 336357
Data columns (total 22 columns):
session_id    336358 non-null int64
site1         336358 non-null int64
time1         336358 non-null object
site2         331406 non-null float64
time2         331406 non-null object
site3         326994 non-null float64
time3         326994 non-null object
site4         323503 non-null float64
time4         323503 non-null object
site5         320170 non-null float64
time5         320170 non-null object
site6         317061 non-null float64
time6         317061 non-null object
site7         314137 non-null float64
time7         314137 non-null object
site8         311375 non-null float64
time8         311375 non-null object
site9         308568 non-null float64
time9         308568 non-null object
site10        305858 non-null float64
time10        305858 non-null object
target        253561 non-null float64
dtypes: float64(10), int64(2), object(10)
memory usage: 56.5+ MB


In [152]:
num_cols = [col for col in data.columns if data[col].dtype=='int64' or
                                           data[col].dtype=='float64']
num_cols.remove('target')
time_cols = [col for col in data.columns if data[col].dtype=='object']

In [153]:
for col in time_cols:
    data[col] = pd.to_datetime(data[col], yearfirst=True)

In [154]:
for col in time_cols:
    #data[col + 'year'] = data[col].dt.year;
    #data[col + 'month'] = data[col].dt.month;
    #data[col + 'day'] = data[col].dt.day;
    data[col + 'hours'] = data[col].dt.hour;
    #data[col + 'minuts'] = data[col].dt.minute;
    data.drop(col, axis=1, inplace=True)
    

In [155]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336358 entries, 0 to 336357
Data columns (total 22 columns):
session_id     336358 non-null int64
site1          336358 non-null int64
site2          331406 non-null float64
site3          326994 non-null float64
site4          323503 non-null float64
site5          320170 non-null float64
site6          317061 non-null float64
site7          314137 non-null float64
site8          311375 non-null float64
site9          308568 non-null float64
site10         305858 non-null float64
target         253561 non-null float64
time1hours     336358 non-null int64
time2hours     331406 non-null float64
time3hours     326994 non-null float64
time4hours     323503 non-null float64
time5hours     320170 non-null float64
time6hours     317061 non-null float64
time7hours     314137 non-null float64
time8hours     311375 non-null float64
time9hours     308568 non-null float64
time10hours    305858 non-null float64
dtypes: float64(19), int64(3)
memory 

In [156]:
num_cols = [col for col in data.columns if data[col].dtype=='int64' or
                                           data[col].dtype=='float64']

#Заполним отсутствующие сайты уникальным значением.
data[num_cols] = data[num_cols].fillna(-1)
#Это необходимо для того, чтобы данные по сайтам привести к целочисленному типу.
data[num_cols] = data[num_cols].astype(int)

In [157]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336358 entries, 0 to 336357
Data columns (total 22 columns):
session_id     336358 non-null int64
site1          336358 non-null int64
site2          336358 non-null int64
site3          336358 non-null int64
site4          336358 non-null int64
site5          336358 non-null int64
site6          336358 non-null int64
site7          336358 non-null int64
site8          336358 non-null int64
site9          336358 non-null int64
site10         336358 non-null int64
target         336358 non-null int64
time1hours     336358 non-null int64
time2hours     336358 non-null int64
time3hours     336358 non-null int64
time4hours     336358 non-null int64
time5hours     336358 non-null int64
time6hours     336358 non-null int64
time7hours     336358 non-null int64
time8hours     336358 non-null int64
time9hours     336358 non-null int64
time10hours    336358 non-null int64
dtypes: int64(22)
memory usage: 56.5 MB


In [158]:
data.head()

Unnamed: 0,session_id,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,target,time1hours,time2hours,time3hours,time4hours,time5hours,time6hours,time7hours,time8hours,time9hours,time10hours
0,1,718,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,10,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,2,890,941,3847,941,942,3846,3847,3846,1516,1518,0,11,11,11,11,11,11,11,11,11,11
2,3,14769,39,14768,14769,37,39,14768,14768,14768,14768,0,16,16,16,16,16,16,16,16,16,16
3,4,782,782,782,782,782,782,782,782,782,782,0,10,10,10,10,10,10,10,10,10,10
4,5,22,177,175,178,177,178,175,177,177,178,0,10,10,10,10,10,10,10,10,10,10


In [159]:
sites = [col for col in data.columns if col.startswith('site') or col.startswith('time')]

In [160]:
# Этой функцией создаем словарь посещенных юзерами сайтов
def find_sites(li):
    lli = {}
    for l in li:
        if l > 0:
            if l in lli:
                lli[str(l)] += 1
            else:
                lli[str(l)] = 1
    return lli

In [161]:
data['all_sites'] = data[sites].apply(find_sites, axis=1)

In [162]:
data.head()

Unnamed: 0,session_id,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,target,time1hours,time2hours,time3hours,time4hours,time5hours,time6hours,time7hours,time8hours,time9hours,time10hours,all_sites
0,1,718,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,10,-1,-1,-1,-1,-1,-1,-1,-1,-1,"{'718': 1, '10': 1}"
1,2,890,941,3847,941,942,3846,3847,3846,1516,1518,0,11,11,11,11,11,11,11,11,11,11,"{'890': 1, '941': 1, '3847': 1, '942': 1, '384..."
2,3,14769,39,14768,14769,37,39,14768,14768,14768,14768,0,16,16,16,16,16,16,16,16,16,16,"{'14769': 1, '39': 1, '14768': 1, '37': 1, '16..."
3,4,782,782,782,782,782,782,782,782,782,782,0,10,10,10,10,10,10,10,10,10,10,"{'782': 1, '10': 1}"
4,5,22,177,175,178,177,178,175,177,177,178,0,10,10,10,10,10,10,10,10,10,10,"{'22': 1, '177': 1, '175': 1, '178': 1, '10': 1}"


In [163]:
#Считаем, сколько всего сайтов было пройдено за сессию
data['len_sites'] = data['all_sites'].apply(lambda x: sum(x.values()))

In [164]:
#для скорости создаем лист из наших словарей
sp_list = list(data['all_sites'])

In [165]:
data.shape[0]

336358

In [166]:
#инициализируем пустую спарс матрицу и задаем ее размер с запасом
site1 = lil_matrix((data.shape[0], 100000))#, dtype=np.int)
row = 0

#в цикле идем по каждой строке и ставим единичку в ту колонку, сайт которой есть в строке
for s in sp_list:
    for key, value in s.items():
        site1[row, int(key)] =  1
    row+=1

#убираем лишние нулевые колонки
site1 = site1.tocsc()[:, np.where(site1.getnnz(axis=0) > 0)[0]].tocsr()


In [167]:
#размер нашего спарса
site1

<336358x48371 sparse matrix of type '<class 'numpy.float64'>'
	with 2212187 stored elements in Compressed Sparse Row format>

In [168]:
#в данной ячейке мы удаляем все колонки с нулевыми значениями в колонках теста 
#и здесь мы ликуем, надеясь, что это даст выше скор
ttest = site1[idx_split:]
site1 = site1.tocsc()[:, np.where((ttest.getnnz(axis=0) > 0))[0]].tocsr()

In [169]:
#делим снова на тест и трейн
ttest = site1[idx_split:]
site1 = site1[:idx_split]

In [170]:
site1

<253561x15857 sparse matrix of type '<class 'numpy.float64'>'
	with 1581093 stored elements in Compressed Sparse Row format>

In [171]:
ttest

<82797x15857 sparse matrix of type '<class 'numpy.float64'>'
	with 522328 stored elements in Compressed Sparse Row format>

In [172]:
#обучаем с кросс-валидацией линейную регрессию
#предсказываем 10 раз трейн, дальше усредним
answ = []
v_metric = []

n=1

kf = KFold(n_splits=10, shuffle=True, random_state=777)   
for tr_ind, val_ind in kf.split(site1):
    print('Start {} fold'.format(n))

    val = site1[val_ind]
    ttt = site1[tr_ind] 

    start_time = time.time()
    clf = LogisticRegression(C=2, solver='lbfgs', max_iter=1000,
                            random_state=777)

    clf.fit(ttt, data['target'][tr_ind].reset_index(drop=True)) 

    model_pred_valid = clf.predict_proba(val)[:, 1]

    y_valid = data['target'][val_ind].reset_index(drop=True)
    valid_metric = auc(y_valid, model_pred_valid)
    v_metric.append(valid_metric)

    print('fold score:', valid_metric, round((time.time() - start_time)/60, 2))
    model_pred = clf.predict_proba(ttest)[:, 1]
    answ.append(model_pred)

    n+=1

    print('crossval score:', np.mean(v_metric), 'std', np.std(v_metric))
    print('---------------------------------------')

Start 1 fold
fold score: 0.97477861538875 0.1
crossval score: 0.97477861538875 std 0.0
---------------------------------------
Start 2 fold
fold score: 0.9757828070148816 0.09
crossval score: 0.9752807112018158 std 0.0005020958130658171
---------------------------------------
Start 3 fold
fold score: 0.9748486817962764 0.1
crossval score: 0.9751367013999693 std 0.0004577602535402202
---------------------------------------
Start 4 fold
fold score: 0.9801454240806045 0.08
crossval score: 0.9763888820701281 std 0.002204773827277707
---------------------------------------
Start 5 fold
fold score: 0.9745940162437716 0.09
crossval score: 0.9760299089048567 std 0.0020986350414254045
---------------------------------------
Start 6 fold
fold score: 0.980982519218647 0.09
crossval score: 0.9768553439571551 std 0.0026602517409817015
---------------------------------------
Start 7 fold
fold score: 0.9798982509134666 0.09
crossval score: 0.9772900449509139 std 0.002683232699011758
-----------------

In [173]:
#собираем предсказания теста
answ_df = pd.DataFrame()
for i in range(len(answ)):
    answ_df['an'+str(i)] = answ[i]
answ_df['answer'] = answ_df.mean(axis=1)

In [174]:
answ_df.head()

Unnamed: 0,an0,an1,an2,an3,an4,an5,an6,an7,an8,an9,answer
0,0.0001757254,0.0002409355,0.0001969017,0.0001728496,0.0001380436,0.0001676958,0.0001245776,0.0001563077,0.0001944676,0.0002212621,0.0001788767
1,5.08786e-05,4.553811e-05,4.409239e-05,4.990177e-05,4.370581e-05,3.88613e-05,4.845266e-05,4.634258e-05,4.706723e-05,5.074653e-05,4.65587e-05
2,4.838427e-05,4.125738e-05,4.276697e-05,4.939097e-05,4.651539e-05,4.001193e-05,4.481238e-05,5.306353e-05,5.013573e-05,5.368914e-05,4.700277e-05
3,1.365636e-08,1.263008e-08,1.066219e-08,1.464965e-08,1.352933e-08,1.498467e-08,1.010768e-08,9.379403e-09,1.314464e-08,1.049951e-08,1.232435e-08
4,2.78561e-06,2.659884e-06,2.040587e-07,2.420955e-06,2.437611e-06,1.989232e-06,2.427787e-06,2.945337e-06,2.786196e-06,2.947039e-06,2.360371e-06


In [175]:
#пишем функцию для сабмита и делаем сабмит

def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [176]:
y_test = np.array(answ_df['answer'])

In [177]:
write_to_submission_file(y_test, 'submission.csv')