### Задача идентификации взломщика по его поведению в сети Интернет

Ссылка: [Catch Me If You Can](https://www.kaggle.com/c/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2)

У нас есть данные по посещениям пользователями каких-то сайтов и времени посещения.

Необходимо определить сессии в тесте, которые осуществил определенный юзер. Его класс в трейне 1, все остальные юзеры 0.

В ноутбуке приведен алгоритм создания спарс матрицы из сайтов, которые посещали люди из выборки. В каждой строке будет от 1 до 10 непустых элементов.

По времени никаких фич не построено, это для самостоятельной работы.

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score as auc
from sklearn.linear_model import LogisticRegression
from scipy.sparse import lil_matrix
from sklearn.model_selection import KFold
import time


%matplotlib inline
pd.set_option('display.max_columns', None)

In [11]:
data = pd.read_csv('./data/Alice/train_sessions.csv')
test = pd.read_csv('./data/Alice/test_sessions.csv')

In [12]:
#запоминаем индекс для последующего разделения трейна и теста
idx_split = data.shape[0]
#объединяем трейн и тест. Создание спарс матрицы в таком, как у нас, виде особо не ликует.
#data = data.append(test, sort=False).reset_index(drop=True)

In [13]:
data.head()

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,site6,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
0,1,718,2014-02-20 10:02:45,,,,,,,,,,,,,,,,,,,0
1,2,890,2014-02-22 11:19:50,941.0,2014-02-22 11:19:50,3847.0,2014-02-22 11:19:51,941.0,2014-02-22 11:19:51,942.0,2014-02-22 11:19:51,3846.0,2014-02-22 11:19:51,3847.0,2014-02-22 11:19:52,3846.0,2014-02-22 11:19:52,1516.0,2014-02-22 11:20:15,1518.0,2014-02-22 11:20:16,0
2,3,14769,2013-12-16 16:40:17,39.0,2013-12-16 16:40:18,14768.0,2013-12-16 16:40:19,14769.0,2013-12-16 16:40:19,37.0,2013-12-16 16:40:19,39.0,2013-12-16 16:40:19,14768.0,2013-12-16 16:40:20,14768.0,2013-12-16 16:40:21,14768.0,2013-12-16 16:40:22,14768.0,2013-12-16 16:40:24,0
3,4,782,2014-03-28 10:52:12,782.0,2014-03-28 10:52:42,782.0,2014-03-28 10:53:12,782.0,2014-03-28 10:53:42,782.0,2014-03-28 10:54:12,782.0,2014-03-28 10:54:42,782.0,2014-03-28 10:55:12,782.0,2014-03-28 10:55:42,782.0,2014-03-28 10:56:12,782.0,2014-03-28 10:56:42,0
4,5,22,2014-02-28 10:53:05,177.0,2014-02-28 10:55:22,175.0,2014-02-28 10:55:22,178.0,2014-02-28 10:55:23,177.0,2014-02-28 10:55:23,178.0,2014-02-28 10:55:59,175.0,2014-02-28 10:55:59,177.0,2014-02-28 10:55:59,177.0,2014-02-28 10:57:06,178.0,2014-02-28 10:57:11,0


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253561 entries, 0 to 253560
Data columns (total 22 columns):
session_id    253561 non-null int64
site1         253561 non-null int64
time1         253561 non-null object
site2         250098 non-null float64
time2         250098 non-null object
site3         246919 non-null float64
time3         246919 non-null object
site4         244321 non-null float64
time4         244321 non-null object
site5         241829 non-null float64
time5         241829 non-null object
site6         239495 non-null float64
time6         239495 non-null object
site7         237297 non-null float64
time7         237297 non-null object
site8         235224 non-null float64
time8         235224 non-null object
site9         233084 non-null float64
time9         233084 non-null object
site10        231052 non-null float64
time10        231052 non-null object
target        253561 non-null int64
dtypes: float64(9), int64(3), object(10)
memory usage: 42.6+ MB


In [15]:
num_cols = [col for col in data.columns if data[col].dtype=='int64' or
                                           data[col].dtype=='float64']
num_cols.remove('target')
time_cols = [col for col in data.columns if data[col].dtype=='object']

In [16]:
for col in time_cols:
    data[col] = pd.to_datetime(data[col], yearfirst=True)

In [17]:
for col in time_cols:
    data[col + 'year'] = data[col].dt.year;
    data[col + 'month'] = data[col].dt.month;
    data[col + 'day'] = data[col].dt.day;
    data[col + 'hours'] = data[col].dt.hour;
    data[col + 'minuts'] = data[col].dt.minute;
    data.drop(col, axis=1, inplace=True)
    

In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253561 entries, 0 to 253560
Data columns (total 62 columns):
session_id      253561 non-null int64
site1           253561 non-null int64
site2           250098 non-null float64
site3           246919 non-null float64
site4           244321 non-null float64
site5           241829 non-null float64
site6           239495 non-null float64
site7           237297 non-null float64
site8           235224 non-null float64
site9           233084 non-null float64
site10          231052 non-null float64
target          253561 non-null int64
time1year       253561 non-null int64
time1month      253561 non-null int64
time1day        253561 non-null int64
time1hours      253561 non-null int64
time1minuts     253561 non-null int64
time2year       250098 non-null float64
time2month      250098 non-null float64
time2day        250098 non-null float64
time2hours      250098 non-null float64
time2minuts     250098 non-null float64
time3year       246919 no

In [19]:
num_cols = [col for col in data.columns if data[col].dtype=='int64' or
                                           data[col].dtype=='float64']

#Заполним отсутствующие сайты уникальным значением.
data[num_cols] = data[num_cols].fillna(-1)
#Это необходимо для того, чтобы данные по сайтам привести к целочисленному типу.
data[num_cols] = data[num_cols].astype(int)

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253561 entries, 0 to 253560
Data columns (total 62 columns):
session_id      253561 non-null int64
site1           253561 non-null int64
site2           253561 non-null int64
site3           253561 non-null int64
site4           253561 non-null int64
site5           253561 non-null int64
site6           253561 non-null int64
site7           253561 non-null int64
site8           253561 non-null int64
site9           253561 non-null int64
site10          253561 non-null int64
target          253561 non-null int64
time1year       253561 non-null int64
time1month      253561 non-null int64
time1day        253561 non-null int64
time1hours      253561 non-null int64
time1minuts     253561 non-null int64
time2year       253561 non-null int64
time2month      253561 non-null int64
time2day        253561 non-null int64
time2hours      253561 non-null int64
time2minuts     253561 non-null int64
time3year       253561 non-null int64
time3month     

In [21]:
data.head()

Unnamed: 0,session_id,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,target,time1year,time1month,time1day,time1hours,time1minuts,time2year,time2month,time2day,time2hours,time2minuts,time3year,time3month,time3day,time3hours,time3minuts,time4year,time4month,time4day,time4hours,time4minuts,time5year,time5month,time5day,time5hours,time5minuts,time6year,time6month,time6day,time6hours,time6minuts,time7year,time7month,time7day,time7hours,time7minuts,time8year,time8month,time8day,time8hours,time8minuts,time9year,time9month,time9day,time9hours,time9minuts,time10year,time10month,time10day,time10hours,time10minuts
0,1,718,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,2014,2,20,10,2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,2,890,941,3847,941,942,3846,3847,3846,1516,1518,0,2014,2,22,11,19,2014,2,22,11,19,2014,2,22,11,19,2014,2,22,11,19,2014,2,22,11,19,2014,2,22,11,19,2014,2,22,11,19,2014,2,22,11,19,2014,2,22,11,20,2014,2,22,11,20
2,3,14769,39,14768,14769,37,39,14768,14768,14768,14768,0,2013,12,16,16,40,2013,12,16,16,40,2013,12,16,16,40,2013,12,16,16,40,2013,12,16,16,40,2013,12,16,16,40,2013,12,16,16,40,2013,12,16,16,40,2013,12,16,16,40,2013,12,16,16,40
3,4,782,782,782,782,782,782,782,782,782,782,0,2014,3,28,10,52,2014,3,28,10,52,2014,3,28,10,53,2014,3,28,10,53,2014,3,28,10,54,2014,3,28,10,54,2014,3,28,10,55,2014,3,28,10,55,2014,3,28,10,56,2014,3,28,10,56
4,5,22,177,175,178,177,178,175,177,177,178,0,2014,2,28,10,53,2014,2,28,10,55,2014,2,28,10,55,2014,2,28,10,55,2014,2,28,10,55,2014,2,28,10,55,2014,2,28,10,55,2014,2,28,10,55,2014,2,28,10,57,2014,2,28,10,57


In [22]:
sites = [col for col in data.columns if col.startswith('site')]

In [23]:
# Этой функцией создаем словарь посещенных юзерами сайтов
def find_sites(li):
    lli = {}
    for l in li:
        if l > 0:
            if l in lli:
                lli[str(l)] += 1
            else:
                lli[str(l)] = 1
    return lli

In [27]:
data['all_sites'] = data[sites].apply(find_sites, axis=1)

In [28]:
data.head()

Unnamed: 0,session_id,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,target,time1year,time1month,time1day,time1hours,time1minuts,time2year,time2month,time2day,time2hours,time2minuts,time3year,time3month,time3day,time3hours,time3minuts,time4year,time4month,time4day,time4hours,time4minuts,time5year,time5month,time5day,time5hours,time5minuts,time6year,time6month,time6day,time6hours,time6minuts,time7year,time7month,time7day,time7hours,time7minuts,time8year,time8month,time8day,time8hours,time8minuts,time9year,time9month,time9day,time9hours,time9minuts,time10year,time10month,time10day,time10hours,time10minuts,all_sites
0,1,718,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,2014,2,20,10,2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,{'718': 1}
1,2,890,941,3847,941,942,3846,3847,3846,1516,1518,0,2014,2,22,11,19,2014,2,22,11,19,2014,2,22,11,19,2014,2,22,11,19,2014,2,22,11,19,2014,2,22,11,19,2014,2,22,11,19,2014,2,22,11,19,2014,2,22,11,20,2014,2,22,11,20,"{'890': 1, '941': 1, '3847': 1, '942': 1, '384..."
2,3,14769,39,14768,14769,37,39,14768,14768,14768,14768,0,2013,12,16,16,40,2013,12,16,16,40,2013,12,16,16,40,2013,12,16,16,40,2013,12,16,16,40,2013,12,16,16,40,2013,12,16,16,40,2013,12,16,16,40,2013,12,16,16,40,2013,12,16,16,40,"{'14769': 1, '39': 1, '14768': 1, '37': 1}"
3,4,782,782,782,782,782,782,782,782,782,782,0,2014,3,28,10,52,2014,3,28,10,52,2014,3,28,10,53,2014,3,28,10,53,2014,3,28,10,54,2014,3,28,10,54,2014,3,28,10,55,2014,3,28,10,55,2014,3,28,10,56,2014,3,28,10,56,{'782': 1}
4,5,22,177,175,178,177,178,175,177,177,178,0,2014,2,28,10,53,2014,2,28,10,55,2014,2,28,10,55,2014,2,28,10,55,2014,2,28,10,55,2014,2,28,10,55,2014,2,28,10,55,2014,2,28,10,55,2014,2,28,10,57,2014,2,28,10,57,"{'22': 1, '177': 1, '175': 1, '178': 1}"


In [174]:
#Считаем, сколько всего сайтов было пройдено за сессию
data['len_sites'] = data['all_sites'].apply(lambda x: sum(x.values()))

In [175]:
#для скорости создаем лист из наших словарей
sp_list = list(data['all_sites'])

In [176]:
data.shape[0]

336358

In [177]:
#инициализируем пустую спарс матрицу и задаем ее размер с запасом
site1 = lil_matrix((data.shape[0], 100000))#, dtype=np.int)
row = 0

#в цикле идем по каждой строке и ставим единичку в ту колонку, сайт которой есть в строке
for s in sp_list:
    for key, value in s.items():
        site1[row, int(key)] =  1
    row+=1

#убираем лишние нулевые колонки
site1 = site1.tocsc()[:, np.where(site1.getnnz(axis=0) > 0)[0]].tocsr()


In [178]:
#размер нашего спарса
site1

<336358x48371 sparse matrix of type '<class 'numpy.float64'>'
	with 1866898 stored elements in Compressed Sparse Row format>

In [179]:
#в данной ячейке мы удаляем все колонки с нулевыми значениями в колонках теста 
#и здесь мы ликуем, надеясь, что это даст выше скор
ttest = site1[idx_split:]
site1 = site1.tocsc()[:, np.where((ttest.getnnz(axis=0) > 0))[0]].tocsr()

In [180]:
#делим снова на тест и трейн
ttest = site1[idx_split:]
site1 = site1[:idx_split]

In [181]:
site1

<253561x15857 sparse matrix of type '<class 'numpy.float64'>'
	with 1320910 stored elements in Compressed Sparse Row format>

In [182]:
ttest

<82797x15857 sparse matrix of type '<class 'numpy.float64'>'
	with 437222 stored elements in Compressed Sparse Row format>

In [202]:
#обучаем с кросс-валидацией линейную регрессию
#предсказываем 10 раз трейн, дальше усредним
answ = []
v_metric = []

n=1

kf = KFold(n_splits=10, shuffle=True, random_state=777)   
for tr_ind, val_ind in kf.split(site1):
    print('Start {} fold'.format(n))

    val = site1[val_ind]
    ttt = site1[tr_ind] 

    start_time = time.time()
    clf = LogisticRegression(C=2, solver='lbfgs', max_iter=1000,
                            random_state=777)

    clf.fit(ttt, data['target'][tr_ind].reset_index(drop=True)) 

    model_pred_valid = clf.predict_proba(val)[:, 1]

    y_valid = data['target'][val_ind].reset_index(drop=True)
    valid_metric = auc(y_valid, model_pred_valid)
    v_metric.append(valid_metric)

    print('fold score:', valid_metric, round((time.time() - start_time)/60, 2))
    model_pred = clf.predict_proba(ttest)[:, 1]
    answ.append(model_pred)

    n+=1

    print('crossval score:', np.mean(v_metric), 'std', np.std(v_metric))
    print('---------------------------------------')

Start 1 fold
fold score: 0.9480954708538257 0.1
crossval score: 0.9480954708538257 std 0.0
---------------------------------------
Start 2 fold
fold score: 0.9530842963114713 0.11
crossval score: 0.9505898835826485 std 0.0024944127288227813
---------------------------------------
Start 3 fold
fold score: 0.9524025661091435 0.1
crossval score: 0.9511941110914801 std 0.0022086749434898608
---------------------------------------
Start 4 fold
fold score: 0.9469098381962735 0.12
crossval score: 0.9501230428676785 std 0.0026646285246940053
---------------------------------------
Start 5 fold
fold score: 0.9517492782265373 0.11
crossval score: 0.9504482899394503 std 0.0024704936289079524
---------------------------------------
Start 6 fold
fold score: 0.9616495990759554 0.1
crossval score: 0.9523151747955345 std 0.004744724541155643
---------------------------------------
Start 7 fold
fold score: 0.955600303325275 0.09
crossval score: 0.9527844788712116 std 0.004540686088458634
--------------

In [203]:
#собираем предсказания теста
answ_df = pd.DataFrame()
for i in range(len(answ)):
    answ_df['an'+str(i)] = answ[i]
answ_df['answer'] = answ_df.mean(axis=1)

In [186]:
answ_df.head()

Unnamed: 0,an0,an1,an2,an3,an4,an5,an6,an7,an8,an9,answer
0,0.001575812,0.002195627,0.001991421,0.001522242,0.001813737,0.001561588,0.001195627,0.001474188,0.001599182,0.001990622,0.001692005
1,0.0002422503,0.0002404381,0.0002432135,0.0002438401,0.0002451519,0.0002412053,0.0002458759,0.0002428971,0.0002491148,0.0002514968,0.0002445484
2,0.000256653,0.0002676373,0.0002577495,0.0002892829,0.0002649286,0.000247694,0.0002539574,0.0002603931,0.0002612001,0.000258314,0.000261781
3,2.107645e-07,2.447426e-07,2.24457e-07,2.773792e-07,2.486528e-07,2.76782e-07,2.305281e-07,1.957439e-07,1.870675e-07,1.895934e-07,2.285711e-07
4,2.338125e-05,2.911579e-05,2.250726e-06,2.020594e-05,2.86864e-05,2.605439e-05,2.44197e-05,2.531338e-05,2.565969e-05,2.569958e-05,2.307868e-05


In [187]:
#пишем функцию для сабмита и делаем сабмит

def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [188]:
y_test = np.array(answ_df['answer'])

In [189]:
write_to_submission_file(y_test, 'submission.csv')