### Задача идентификации взломщика по его поведению в сети Интернет

Ссылка: [Catch Me If You Can](https://www.kaggle.com/c/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score as auc
from sklearn.linear_model import LogisticRegression
from scipy.sparse import lil_matrix
from sklearn.model_selection import KFold
import time


%matplotlib inline
pd.set_option('display.max_columns', None)

In [2]:
data = pd.read_csv('data/train_sessions.csv')
test = pd.read_csv('data/test_sessions.csv')

In [3]:
idx_split = data.shape[0]
data = data.append(test, sort=False).reset_index(drop=True)

In [4]:
data.head()

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,site6,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
0,1,718,2014-02-20 10:02:45,,,,,,,,,,,,,,,,,,,0.0
1,2,890,2014-02-22 11:19:50,941.0,2014-02-22 11:19:50,3847.0,2014-02-22 11:19:51,941.0,2014-02-22 11:19:51,942.0,2014-02-22 11:19:51,3846.0,2014-02-22 11:19:51,3847.0,2014-02-22 11:19:52,3846.0,2014-02-22 11:19:52,1516.0,2014-02-22 11:20:15,1518.0,2014-02-22 11:20:16,0.0
2,3,14769,2013-12-16 16:40:17,39.0,2013-12-16 16:40:18,14768.0,2013-12-16 16:40:19,14769.0,2013-12-16 16:40:19,37.0,2013-12-16 16:40:19,39.0,2013-12-16 16:40:19,14768.0,2013-12-16 16:40:20,14768.0,2013-12-16 16:40:21,14768.0,2013-12-16 16:40:22,14768.0,2013-12-16 16:40:24,0.0
3,4,782,2014-03-28 10:52:12,782.0,2014-03-28 10:52:42,782.0,2014-03-28 10:53:12,782.0,2014-03-28 10:53:42,782.0,2014-03-28 10:54:12,782.0,2014-03-28 10:54:42,782.0,2014-03-28 10:55:12,782.0,2014-03-28 10:55:42,782.0,2014-03-28 10:56:12,782.0,2014-03-28 10:56:42,0.0
4,5,22,2014-02-28 10:53:05,177.0,2014-02-28 10:55:22,175.0,2014-02-28 10:55:22,178.0,2014-02-28 10:55:23,177.0,2014-02-28 10:55:23,178.0,2014-02-28 10:55:59,175.0,2014-02-28 10:55:59,177.0,2014-02-28 10:55:59,177.0,2014-02-28 10:57:06,178.0,2014-02-28 10:57:11,0.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336358 entries, 0 to 336357
Data columns (total 22 columns):
session_id    336358 non-null int64
site1         336358 non-null int64
time1         336358 non-null object
site2         331406 non-null float64
time2         331406 non-null object
site3         326994 non-null float64
time3         326994 non-null object
site4         323503 non-null float64
time4         323503 non-null object
site5         320170 non-null float64
time5         320170 non-null object
site6         317061 non-null float64
time6         317061 non-null object
site7         314137 non-null float64
time7         314137 non-null object
site8         311375 non-null float64
time8         311375 non-null object
site9         308568 non-null float64
time9         308568 non-null object
site10        305858 non-null float64
time10        305858 non-null object
target        253561 non-null float64
dtypes: float64(10), int64(2), object(10)
memory usage: 56.5+ MB


In [6]:
num_cols = [col for col in data.columns if data[col].dtype=='int64' or
                                           data[col].dtype=='float64']
num_cols.remove('target')
time_cols = [col for col in data.columns if data[col].dtype=='object']

In [7]:
for col in time_cols:
    data[col] = pd.to_datetime(data[col], yearfirst=True)

In [9]:
data[num_cols] = data[num_cols].fillna(-1)
data[num_cols] = data[num_cols].astype(int)

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336358 entries, 0 to 336357
Data columns (total 22 columns):
session_id    336358 non-null int64
site1         336358 non-null int64
time1         336358 non-null datetime64[ns]
site2         336358 non-null int64
time2         331406 non-null datetime64[ns]
site3         336358 non-null int64
time3         326994 non-null datetime64[ns]
site4         336358 non-null int64
time4         323503 non-null datetime64[ns]
site5         336358 non-null int64
time5         320170 non-null datetime64[ns]
site6         336358 non-null int64
time6         317061 non-null datetime64[ns]
site7         336358 non-null int64
time7         314137 non-null datetime64[ns]
site8         336358 non-null int64
time8         311375 non-null datetime64[ns]
site9         336358 non-null int64
time9         308568 non-null datetime64[ns]
site10        336358 non-null int64
time10        305858 non-null datetime64[ns]
target        253561 non-null float64
dtype

In [11]:
data.head()

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,site6,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
0,1,718,2014-02-20 10:02:45,-1,NaT,-1,NaT,-1,NaT,-1,NaT,-1,NaT,-1,NaT,-1,NaT,-1,NaT,-1,NaT,0.0
1,2,890,2014-02-22 11:19:50,941,2014-02-22 11:19:50,3847,2014-02-22 11:19:51,941,2014-02-22 11:19:51,942,2014-02-22 11:19:51,3846,2014-02-22 11:19:51,3847,2014-02-22 11:19:52,3846,2014-02-22 11:19:52,1516,2014-02-22 11:20:15,1518,2014-02-22 11:20:16,0.0
2,3,14769,2013-12-16 16:40:17,39,2013-12-16 16:40:18,14768,2013-12-16 16:40:19,14769,2013-12-16 16:40:19,37,2013-12-16 16:40:19,39,2013-12-16 16:40:19,14768,2013-12-16 16:40:20,14768,2013-12-16 16:40:21,14768,2013-12-16 16:40:22,14768,2013-12-16 16:40:24,0.0
3,4,782,2014-03-28 10:52:12,782,2014-03-28 10:52:42,782,2014-03-28 10:53:12,782,2014-03-28 10:53:42,782,2014-03-28 10:54:12,782,2014-03-28 10:54:42,782,2014-03-28 10:55:12,782,2014-03-28 10:55:42,782,2014-03-28 10:56:12,782,2014-03-28 10:56:42,0.0
4,5,22,2014-02-28 10:53:05,177,2014-02-28 10:55:22,175,2014-02-28 10:55:22,178,2014-02-28 10:55:23,177,2014-02-28 10:55:23,178,2014-02-28 10:55:59,175,2014-02-28 10:55:59,177,2014-02-28 10:55:59,177,2014-02-28 10:57:06,178,2014-02-28 10:57:11,0.0


In [12]:
sites = [col for col in data.columns if col.startswith('site')]

In [13]:
def find_sites(li):
    lli = {}
    for l in li:
        if l > 0:
            if l in lli:
                lli[str(l)] += 1
            else:
                lli[str(l)] = 1
    return lli

In [14]:
data['all_sites'] = data[sites].apply(find_sites, axis=1)

In [15]:
data.head()

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,site6,time6,site7,time7,site8,time8,site9,time9,site10,time10,target,all_sites
0,1,718,2014-02-20 10:02:45,-1,NaT,-1,NaT,-1,NaT,-1,NaT,-1,NaT,-1,NaT,-1,NaT,-1,NaT,-1,NaT,0.0,{'718': 1}
1,2,890,2014-02-22 11:19:50,941,2014-02-22 11:19:50,3847,2014-02-22 11:19:51,941,2014-02-22 11:19:51,942,2014-02-22 11:19:51,3846,2014-02-22 11:19:51,3847,2014-02-22 11:19:52,3846,2014-02-22 11:19:52,1516,2014-02-22 11:20:15,1518,2014-02-22 11:20:16,0.0,"{'890': 1, '941': 1, '3847': 1, '942': 1, '384..."
2,3,14769,2013-12-16 16:40:17,39,2013-12-16 16:40:18,14768,2013-12-16 16:40:19,14769,2013-12-16 16:40:19,37,2013-12-16 16:40:19,39,2013-12-16 16:40:19,14768,2013-12-16 16:40:20,14768,2013-12-16 16:40:21,14768,2013-12-16 16:40:22,14768,2013-12-16 16:40:24,0.0,"{'14769': 1, '39': 1, '14768': 1, '37': 1}"
3,4,782,2014-03-28 10:52:12,782,2014-03-28 10:52:42,782,2014-03-28 10:53:12,782,2014-03-28 10:53:42,782,2014-03-28 10:54:12,782,2014-03-28 10:54:42,782,2014-03-28 10:55:12,782,2014-03-28 10:55:42,782,2014-03-28 10:56:12,782,2014-03-28 10:56:42,0.0,{'782': 1}
4,5,22,2014-02-28 10:53:05,177,2014-02-28 10:55:22,175,2014-02-28 10:55:22,178,2014-02-28 10:55:23,177,2014-02-28 10:55:23,178,2014-02-28 10:55:59,175,2014-02-28 10:55:59,177,2014-02-28 10:55:59,177,2014-02-28 10:57:06,178,2014-02-28 10:57:11,0.0,"{'22': 1, '177': 1, '175': 1, '178': 1}"


In [16]:
data['len_sites'] = data['all_sites'].apply(lambda x: sum(x.values()))

In [17]:
sp_list = list(data['all_sites'])

In [18]:
site1 = lil_matrix((data.shape[0], 100000))#, dtype=np.int8)
row = 0

for s in sp_list:
    for key, value in s.items():
        site1[row, key] = 1
    row+=1

site1 = site1.tocsc()[:, np.where(site1.getnnz(axis=0) > 0)[0]].tocsr()


In [19]:
site1

<336358x48371 sparse matrix of type '<class 'numpy.float64'>'
	with 1866898 stored elements in Compressed Sparse Row format>

In [20]:
ttest = site1[idx_split:]
site1 = site1.tocsc()[:, np.where((ttest.getnnz(axis=0) > 0))[0]].tocsr()

In [21]:
ttest = site1[idx_split:]
site1 = site1[:idx_split]

In [22]:
site1

<253561x15857 sparse matrix of type '<class 'numpy.float64'>'
	with 1320910 stored elements in Compressed Sparse Row format>

In [23]:
ttest

<82797x15857 sparse matrix of type '<class 'numpy.float64'>'
	with 437222 stored elements in Compressed Sparse Row format>

In [34]:
answ = []
v_metric = []

n=1

kf = KFold(n_splits=10, shuffle=True, random_state=777)   
for tr_ind, val_ind in kf.split(site1):
    print('Start {} fold'.format(n))

    val = site1[val_ind]
    ttt = site1[tr_ind] 

    start_time = time.time()
    clf = LogisticRegression(C=2, solver='lbfgs', max_iter=1000,
                            random_state=777)

    clf.fit(ttt, data['target'][tr_ind].reset_index(drop=True)) 

    model_pred_valid = clf.predict_proba(val)[:, 1]

    y_valid = data['target'][val_ind].reset_index(drop=True)
    valid_metric = auc(y_valid, model_pred_valid)
    v_metric.append(valid_metric)

    print('fold score:', valid_metric, round((time.time() - start_time)/60, 2))
    model_pred = clf.predict_proba(ttest)[:, 1]
    answ.append(model_pred)

    n+=1

    print('crossval score:', np.mean(v_metric), 'std', np.std(v_metric))
    print('---------------------------------------')

Start 1 fold
fold score: 0.9480986622860061 0.07
crossval score: 0.9480986622860061 std 0.0
---------------------------------------
Start 2 fold
fold score: 0.9530842963114712 0.07
crossval score: 0.9505914792987387 std 0.0024928170127325178
---------------------------------------
Start 3 fold
fold score: 0.9524013281515967 0.07
crossval score: 0.9511947622496914 std 0.0022069568211603393
---------------------------------------
Start 4 fold
fold score: 0.9469096565433298 0.07
crossval score: 0.9501234858231009 std 0.0026638118091100956
---------------------------------------
Start 5 fold
fold score: 0.9517503393330061 0.07
crossval score: 0.9504488565250819 std 0.0024698540560757908
---------------------------------------
Start 6 fold
fold score: 0.96164783832038 0.07
crossval score: 0.952315353490965 std 0.004743683923581689
---------------------------------------
Start 7 fold
fold score: 0.9556023994493321 0.07
crossval score: 0.9527849314850174 std 0.004539924003234503
-------------

In [25]:
answ_df = pd.DataFrame()
for i in range(len(answ)):
    answ_df['an'+str(i)] = answ[i]
answ_df['answer'] = answ_df.mean(axis=1)

In [26]:
answ_df.head()

Unnamed: 0,an0,an1,an2,an3,an4,an5,an6,an7,an8,an9,answer
0,0.001575534,0.002195588,0.001988172,0.001522793,0.001813855,0.001560352,0.00119645,0.001474735,0.001598547,0.001990004,0.001691603
1,0.0002423092,0.0002404369,0.0002431124,0.0002440007,0.0002451028,0.0002410179,0.0002459445,0.000242704,0.0002489614,0.0002515231,0.0002445113
2,0.0002568157,0.0002676014,0.0002577514,0.0002893732,0.0002649001,0.0002475553,0.0002539547,0.0002602539,0.0002610493,0.0002583748,0.000261763
3,2.107547e-07,2.445931e-07,2.247376e-07,2.775212e-07,2.485401e-07,2.766965e-07,2.302133e-07,1.962104e-07,1.874679e-07,1.897256e-07,2.28646e-07
4,2.339782e-05,2.91167e-05,2.25008e-06,2.02017e-05,2.868828e-05,2.605296e-05,2.440702e-05,2.533754e-05,2.569724e-05,2.569849e-05,2.308478e-05


In [28]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [29]:
y_test = np.array(answ_df['answer'])

In [30]:
write_to_submission_file(y_test, 'submission.csv')