# Catch Me If You Can
## Intruder Detection through Webpage Session Tracking
Будем решать задачу идентификации взломщика по его поведению в сети Интернет. Например, компания Яндекс решает задачу идентификации взломщика почтового ящика по его поведению. В двух словах, взломщик будет вести себя не так, как владелец ящика: он может не удалять сообщения сразу по прочтении, как это делал хозяин, он будет по-другому ставить флажки сообщениям и даже по-своему двигать мышкой. Тогда такого злоумышленника можно идентифицировать и "выкинуть" из почтового ящика, предложив хозяину войти по SMS-коду.

В этом соревновании будем решать похожую задачу: алгоритм будет анализировать последовательность из нескольких веб-сайтов, посещенных подряд одним и тем же человеком, и определять, Элис это или взломщик (кто-то другой).

Данные собраны с прокси-серверов Университета Блеза Паскаля. Статья: "A Tool for Classification of Sequential Data", авторы Giacomo Kahn, Yannick Loiseau и Olivier Raynaud.

In [20]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix, hstack
import datetime
from sklearn.preprocessing import StandardScaler

In [26]:
# Regularisation strength C for the final logistic regression. The value equals
# 10 ** (2 / 9), i.e. one point of the np.logspace(-2, 2, 10) grid that is
# commented out further down; presumably the winner of an earlier search.
BEST_LOGIT_C = 1.6681005372000592
# Author tag that is embedded in the submission file name.
AUTHOR = 'Veronika_Shilova'

In [22]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Write predictions to a CSV file in submission format.

    Rows are numbered 1..N in the order of ``predicted_labels``; that number
    becomes the index column named ``index_label`` and the predictions go
    into a single column named ``target``.

    Parameters
    ----------
    predicted_labels : array-like exposing ``.shape``
        Predicted values, one per row of the submission.
    out_file : str or file-like
        Destination handed straight to ``DataFrame.to_csv``.
    target : str
        Header of the prediction column.
    index_label : str
        Header of the index column.
    """
    row_count = predicted_labels.shape[0]
    row_ids = np.arange(1, row_count + 1)
    submission = pd.DataFrame(predicted_labels, index=row_ids, columns=[target])
    submission.to_csv(out_file, index_label=index_label)

In [23]:
def get_auc_lr_valid(X, y, C=1.0, seed=17, ratio = 0.9):
    """Return the hold-out ROC AUC of a logistic regression.

    The rows are split by position, without shuffling: the first
    ``round(n_rows * ratio)`` rows are used for fitting and the remaining
    rows for validation.

    Parameters
    ----------
    X : matrix supporting two-dimensional slicing
        Feature matrix, one row per sample.
    y : sequence supporting slicing
        Binary target aligned with the rows of ``X``.
    C : float
        Inverse regularisation strength passed to ``LogisticRegression``.
    seed : int
        ``random_state`` of the classifier.
    ratio : float
        Share of leading rows used for training.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the
        validation rows.
    """
    n_fit = int(round(X.shape[0] * ratio))
    X_fit, X_holdout = X[:n_fit, :], X[n_fit:, :]
    y_fit, y_holdout = y[:n_fit], y[n_fit:]

    model = LogisticRegression(C=C, random_state=seed, solver='liblinear')
    model.fit(X_fit, y_fit)

    # Column 1 of predict_proba holds the probability of the positive class.
    holdout_proba = model.predict_proba(X_holdout)[:, 1]
    return roc_auc_score(y_holdout, holdout_proba)

In [24]:
%time
# Read the training and test data sets, change paths if needed
train_df = pd.read_csv('C:/Data/train_sessions.csv',
                       index_col='session_id')
test_df = pd.read_csv('C:/Data/test_sessions.csv',
                      index_col='session_id')

# Convert time1, ..., time10 columns to datetime type
times = ['time%s' % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# Sort the data by time
train_df = train_df.sort_values(by='time1')

# Look at the first rows of the training set
train_df.head()

Wall time: 0 ns


Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [25]:
# Sanity check: (rows, columns) of the training and test frames.
train_df.shape, test_df.shape

((253561, 21), (82797, 20))

In [27]:
# Names of the ten site-id columns of a session.
sites = ['site%s' % i for i in range(1, 11)]

# Dump every session as one space-separated line of integer site ids so the
# sessions can later be read back as text documents. Missing visits are
# written as 0.
for sessions_frame, text_path in ((train_df, 'train_sessions_text.txt'),
                                  (test_df, 'test_sessions_text.txt')):
    site_ids = sessions_frame[sites].fillna(0).astype('int')
    site_ids.to_csv(text_path, sep=' ', index=None, header=None)

In [28]:
# Our target variable
y_train = train_df['target']

# United dataframe of the initial data 
full_df = pd.concat([train_df.drop('target', axis=1), test_df])

# Index to split the training and test data sets
idx_split = train_df.shape[0]

In [29]:
# Dataframe with indices of visited websites in session
full_sites = full_df[sites]
full_sites.head()

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,56,55.0,,,,,,,,
54843,56,55.0,56.0,55.0,,,,,,
77292,946,946.0,951.0,946.0,946.0,945.0,948.0,784.0,949.0,946.0
114021,945,948.0,949.0,948.0,945.0,946.0,947.0,945.0,946.0,946.0
146670,947,950.0,948.0,947.0,950.0,952.0,946.0,951.0,946.0,947.0


In [30]:
# Cross-validation splitter with 10 splits; it is passed as `cv` to GridSearchCV.
time_split = TimeSeriesSplit(n_splits=10)

In [31]:
%%time
tfidf = TfidfVectorizer(ngram_range=(1,6), max_features=90000)
with open('train_sessions_text.txt') as inp_train_file:
    X_train = tfidf.fit_transform(inp_train_file)
with open('test_sessions_text.txt') as inp_test_file:
    X_test = tfidf.transform(inp_test_file)

Wall time: 42.1 s


In [32]:
# Sanity check: both TF-IDF matrices must have the same number of columns.
X_train.shape, X_test.shape

((253561, 90000), (82797, 90000))

In [33]:
# Hand-crafted per-session features, indexed like full_df
# (training rows first, then test rows).
full_new_feat = pd.DataFrame(index=full_df.index)

# Every time feature is derived from the first timestamp of the session.
start_time = full_df['time1']
hour = start_time.dt.hour

# Year and month of the session start folded into one number: 100 * year + month.
full_new_feat['start_month'] = (100 * start_time.dt.year
                                + start_time.dt.month).astype('float64')
full_new_feat['start_hour'] = hour.astype('float64')

# Part-of-day indicators. 'morning' used to be assigned twice (first as
# hour <= 11, then overwritten); only the final 7..11 definition is kept,
# in the column position of the first assignment.
full_new_feat['morning'] = ((hour >= 7) & (hour <= 11)).astype('int')
# `.dt.weekday` replaces `apply(pd.datetime.weekday)`: the `pd.datetime`
# alias is deprecated and absent from pandas 2.x. Monday is 0.
full_new_feat['weekday'] = start_time.dt.weekday
full_new_feat['day'] = ((hour >= 12) & (hour <= 18)).astype('int')
full_new_feat['evening'] = ((hour >= 19) & (hour <= 23)).astype('int')
full_new_feat['night'] = ((hour >= 0) & (hour <= 6)).astype('int')

# ISO week number. `Series.dt.week` is deprecated and removed in pandas 2.x;
# Timestamp.isocalendar() is available in old and new pandas alike.
full_new_feat['week_num'] = start_time.apply(lambda ts: ts.isocalendar()[1])

# The latest non-missing timestamp of the session marks its end.
full_new_feat['session_end_time'] = full_df[times].max(axis=1)
# Duration in seconds as float. `astype('timedelta64[s]')` yields a timedelta
# column instead of numbers on pandas 2.x; total_seconds() is version-independent.
full_new_feat['session_duration'] = (full_new_feat['session_end_time']
                                     - start_time).dt.total_seconds()
full_new_feat['year'] = start_time.dt.year

# 1 when the session started on a Wednesday (weekday == 2), else 0.
full_new_feat['is_wednesday'] = (start_time.dt.weekday == 2).astype('int')

In [34]:
# Dense features that are standardised and appended to the TF-IDF matrix.
scaled_cols = ['morning', 'day', 'evening', 'night',
               'weekday', 'start_month', 'session_duration']

scaler = StandardScaler()
# NOTE(review): full_new_feat holds train and test rows, so the scaler
# statistics are computed over both sets - confirm this is intended.
tmp = scaler.fit_transform(full_new_feat[scaled_cols])

# The first idx_split rows are the training sessions.
train_extra = tmp[:idx_split, :]
X_train_temp = csr_matrix(hstack([X_train, train_extra]))

# Quick hold-out check of the combined feature set.
print(get_auc_lr_valid(X_train_temp, y_train))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


0.9712083774135424


In [35]:
# Pass the same dense columns through the already fitted scaler.
test_scaled_cols = ['morning', 'day', 'evening', 'night',
                    'weekday', 'start_month', 'session_duration']
tmp = scaler.transform(full_new_feat[test_scaled_cols])

# Rows from idx_split onwards are the test sessions.
test_extra = tmp[idx_split:, :]
X_test_temp = csr_matrix(hstack([X_test, test_extra]))

  


In [36]:
# Base classifier that is handed to the grid search as its estimator.
# NOTE(review): GridSearchCV presumably works on clones of its estimator, so
# this fit looks redundant - confirm before removing it.
lr = LogisticRegression(C=1.0, random_state=17, solver='liblinear')
lr.fit(X_train_temp[:idx_split, :], y_train[:idx_split])

In [37]:
# The full grid np.logspace(-2, 2, 10) is disabled here; only the stored
# best value of C is evaluated.
c_values = [BEST_LOGIT_C]

# ROC AUC scoring, folds from time_split, all available cores.
logit_grid_searcher = GridSearchCV(
    estimator=lr,
    param_grid={'C': c_values},
    scoring='roc_auc',
    n_jobs=-1,
    cv=time_split,
)

In [38]:
%%time
# Run the cross-validated search over the configured C values on the full
# training matrix (TF-IDF plus scaled dense features).
logit_grid_searcher.fit(X_train_temp, y_train)

Wall time: 2min 14s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': [1.6681005372000592]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='roc_auc', verbose=0)

In [39]:
# Positive-class probabilities for the test sessions, taken from the grid
# searcher (column 1 of predict_proba).
test_pred = logit_grid_searcher.predict_proba(X_test_temp)[:, 1]

# Submission layout: ids 1..N in test order plus a single 'target' column.
# NOTE(review): this appears to duplicate write_to_submission_file - consider
# calling the helper instead.
submission_ids = np.arange(1, test_pred.shape[0] + 1)
pred_df = pd.DataFrame(test_pred, index=submission_ids, columns=['target'])
pred_df.to_csv(f'submission_alice_{AUTHOR}.csv', index_label='session_id')

In [19]:
# Mean cross-validated ROC AUC of the best candidate and its parameters.
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.9277175483249673, {'C': 1.6681005372000592})

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix, hstack
import datetime
from sklearn.preprocessing import StandardScaler

# Regularisation strength C that the grid search below evaluates.
BEST_LOGIT_C = 1.6681005372000592
# Author tag that is embedded in the submission file name.
AUTHOR = 'Veronika_Shilova'
# Cross-validation splitter with 10 splits, used as `cv` of the grid search.
time_split = TimeSeriesSplit(n_splits=10)

# --- Load the raw sessions (change paths if needed) ---
train_df = pd.read_csv('../input/train_sessions.csv', index_col='session_id')
test_df = pd.read_csv('../input/test_sessions.csv', index_col='session_id')

# Parse time1..time10 into datetimes.
times = ['time%s' % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# Keep the training sessions in chronological order of their first timestamp.
train_df = train_df.sort_values(by='time1')

# --- Dump the site ids as text, one session per line, missing visits as 0 ---
sites = ['site%s' % i for i in range(1, 11)]
for sessions_frame, text_path in ((train_df, 'train_sessions_text.txt'),
                                  (test_df, 'test_sessions_text.txt')):
    site_ids = sessions_frame[sites].fillna(0).astype('int')
    site_ids.to_csv(text_path, sep=' ', index=None, header=None)

# --- Target, combined frame and the train/test boundary ---
y_train = train_df['target']
full_df = pd.concat([train_df.drop('target', axis=1), test_df])
# Rows of full_df from this position on belong to the test set.
idx_split = train_df.shape[0]
full_sites = full_df[sites]

# --- TF-IDF over site-id n-grams (1 to 6 tokens, at most 90000 features) ---
# The vocabulary is learned from the training file only.
tfidf = TfidfVectorizer(max_features=90000, ngram_range=(1, 6))
with open('train_sessions_text.txt') as train_text:
    X_train = tfidf.fit_transform(train_text)
with open('test_sessions_text.txt') as test_text:
    X_test = tfidf.transform(test_text)

# Hand-crafted per-session features, indexed like full_df
# (training rows first, then test rows).
full_new_feat = pd.DataFrame(index=full_df.index)

# Every time feature is derived from the first timestamp of the session.
start_time = full_df['time1']

# Year and month of the session start folded into one number: 100 * year + month.
full_new_feat['start_month'] = (100 * start_time.dt.year
                                + start_time.dt.month).astype('float64')

# Part-of-day indicators based on the starting hour.
hour = start_time.dt.hour
full_new_feat['morning'] = ((hour >= 7) & (hour <= 11)).astype('int')
full_new_feat['day'] = ((hour >= 12) & (hour <= 18)).astype('int')
full_new_feat['evening'] = ((hour >= 19) & (hour <= 23)).astype('int')
full_new_feat['night'] = ((hour >= 0) & (hour <= 6)).astype('int')

# `.dt.weekday` replaces `apply(pd.datetime.weekday)`: the `pd.datetime`
# alias is deprecated and absent from pandas 2.x. Monday is 0.
full_new_feat['weekday'] = start_time.dt.weekday

# The latest non-missing timestamp of the session marks its end.
full_new_feat['session_end_time'] = full_df[times].max(axis=1)
# Duration in seconds as float. `astype('timedelta64[s]')` yields a timedelta
# column instead of numbers on pandas 2.x; total_seconds() is version-independent.
full_new_feat['session_duration'] = (full_new_feat['session_end_time']
                                     - start_time).dt.total_seconds()

# --- Standardise the dense features and append them to the TF-IDF matrices ---
scaled_cols = ['morning', 'day', 'evening', 'night',
               'weekday', 'start_month', 'session_duration']

scaler = StandardScaler()
# NOTE(review): full_new_feat holds train and test rows, so the scaler
# statistics are computed over both sets - confirm this is intended.
tmp = scaler.fit_transform(full_new_feat[scaled_cols])
# The first idx_split rows are the training sessions.
X_train_temp = csr_matrix(hstack([X_train, tmp[:idx_split, :]]))

# Same columns through the fitted scaler; rows from idx_split on are test rows.
tmp = scaler.transform(full_new_feat[scaled_cols])
X_test_temp = csr_matrix(hstack([X_test, tmp[idx_split:, :]]))

# --- Model: logistic regression evaluated through a time-ordered grid search ---
# NOTE(review): GridSearchCV presumably works on clones of its estimator, so
# this fit looks redundant - confirm before removing it.
lr = LogisticRegression(C=1.0, random_state=17, solver='liblinear')
lr.fit(X_train_temp[:idx_split, :], y_train[:idx_split])

c_values = [BEST_LOGIT_C]
logit_grid_searcher = GridSearchCV(
    estimator=lr,
    param_grid={'C': c_values},
    scoring='roc_auc',
    n_jobs=-1,
    cv=time_split,
)
logit_grid_searcher.fit(X_train_temp, y_train)

# --- Submission: positive-class probabilities, ids 1..N in test order ---
test_pred = logit_grid_searcher.predict_proba(X_test_temp)[:, 1]
submission_ids = np.arange(1, test_pred.shape[0] + 1)
pred_df = pd.DataFrame(test_pred, index=submission_ids, columns=['target'])
pred_df.to_csv(f'submission_alice_{AUTHOR}.csv', index_label='session_id')