In [11]:
# from __future__ import division, print_function
# отключим всякие предупреждения Anaconda
import warnings
warnings.filterwarnings('ignore')
import os
import pickle
import numpy as np
import pandas as pd
# from scipy.sparse import csr_matrix
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler

from scipy.sparse import hstack
import eli5
import calendar

In [None]:
# !unzip capstone_user_identification.zip

### Обработка входных данных

In [12]:
# Directory that holds the session CSV files and the site dictionary.
PATH_TO_DATA = '../capstone_user_identification'
# Alternative data location, kept for reference:
# PATH_TO_DATA = '../input/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2'

# Names of the ten visit-timestamp columns and the ten visited-site columns.
session_slots = range(1, 11)
times = ['time%s' % i for i in session_slots]
sites = ['site%s' % i for i in session_slots]

# Both files share the same layout: session_id as index, timestamps parsed as datetimes.
read_options = dict(index_col='session_id', parse_dates=times)
train_csv = os.path.join(PATH_TO_DATA, 'train_sessions.csv')
test_csv = os.path.join(PATH_TO_DATA, 'test_sessions.csv')
train_df = pd.read_csv(train_csv, **read_options)
test_df = pd.read_csv(test_csv, **read_options)

train_df.shape, test_df.shape

((253561, 21), (82797, 20))

In [14]:
# NOTE: pickle.load can execute arbitrary code; only load site_dic.pkl from a trusted source.
site_dic_path = os.path.join(PATH_TO_DATA, 'site_dic.pkl')
with open(site_dic_path, 'rb') as site_dic_file:
    site2id = pickle.load(site_dic_file)

# Reverse lookup (numeric id -> site name). Id 0 is added by hand because
# missing site slots are filled with 0 before the lookup.
id2site = {site_id: site_name for site_name, site_id in site2id.items()}
id2site[0] = 'unknown'

In [15]:
# Order the training sessions by their first timestamp so that rows (and the
# time-based cross-validation folds built from them) are chronological.
train_df = train_df.sort_values(by='time1')

In [16]:
def _session_to_text(site_ids):
    """Return the site names of one session joined by single spaces."""
    return ' '.join(id2site[site_id] for site_id in site_ids)


# Missing site slots become id 0 before the ids are mapped back to site names.
train_df_sites = train_df[sites].fillna(0).astype('int').apply(_session_to_text, axis=1).tolist()
test_df_sites = test_df[sites].fillna(0).astype('int').apply(_session_to_text, axis=1).tolist()

In [125]:
# Keyword arguments passed to the text vectorizer built from the session strings.
vectorizer_params = dict(
    ngram_range=(1, 5),    # single sites up to sequences of five sites
    max_features=50000,
    tokenizer=str.split,   # a session string is whitespace-separated site names
)

In [126]:
%%time
vectorizer = TfidfVectorizer(**vectorizer_params)
# vectorizer = CountVectorizer(**vectorizer_params)
X_train_sparse = vectorizer.fit_transform(train_df_sites)
X_test_sparse = vectorizer.transform(test_df_sites)
y = train_df['target']
X_train_sparse.shape, X_test_sparse.shape

CPU times: user 28.3 s, sys: 900 ms, total: 29.2 s
Wall time: 29.2 s


((253561, 50000), (82797, 50000))

In [127]:
# Session length in milliseconds: latest visit timestamp minus earliest one, per row.
session_span = train_df[times].max(axis=1) - train_df[times].min(axis=1)
duration_train = session_span.astype('timedelta64[ms]').astype(int).values.reshape(-1, 1)

# Disabled experiment, kept for reference (scaling the average time per visited site as well):
# number_of_sites = train_df[times].isnull().sum(axis=1).apply(lambda x: 10 - x).values.reshape(-1, 1)
# time_per_site = (duration_train / number_of_sites)
# scaler.fit(np.concatenate((duration_train, time_per_site), axis=1))

# The scaler is fitted on training durations only; features() reuses it for both data sets.
scaler = StandardScaler()
scaler.fit(duration_train)

StandardScaler()

In [128]:
def features(df, duration_scaler=None, time_columns=None):
    """Build the hand-crafted time features for every session in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Session table with a datetime ``time1`` column (session start) and the
        visit-timestamp columns listed in ``time_columns``.
    duration_scaler : object with a ``transform`` method, optional
        Scaler applied to the session duration (milliseconds). When omitted,
        the notebook-level ``scaler`` is used, as before.
    time_columns : list of str, optional
        Names of the visit-timestamp columns. When omitted, the notebook-level
        ``times`` list is used, as before.

    Returns
    -------
    result : numpy.ndarray of shape (n_features, n_sessions, 1)
        One ``(n_sessions, 1)`` column per feature, in ``feature_names`` order.
    feature_names : list of str
        Names of the features stacked in ``result``.
    """
    # Fall back to the notebook globals only when the caller passed nothing.
    if duration_scaler is None:
        duration_scaler = scaler
    if time_columns is None:
        time_columns = times

    def as_column(values):
        # Every feature is emitted as an (n_sessions, 1) column.
        return np.asarray(values).reshape(-1, 1)

    # Vectorised datetime accessors replace the per-row Python lambdas.
    session_start = df['time1'].dt

    # intraday features: one indicator per part of the day
    hour = session_start.hour.values
    morning = as_column(((hour >= 7) & (hour <= 11)).astype('int'))
    day = as_column(((hour >= 12) & (hour <= 18)).astype('int'))
    evening = as_column(((hour >= 19) & (hour <= 23)).astype('int'))

    # week and month features (raw ordinal values)
    weekday = as_column(session_start.weekday)
    month = as_column(session_start.month)

    # year features: YYYY00 and YYYYMM codes divided by 1e5 (e.g. 2.013 and 2.01301)
    year_code = 100 * session_start.year.values
    year = as_column(year_code) / 1e5
    year_month = as_column(year_code + session_start.month.values) / 1e5

    # session features: duration in milliseconds, then scaled
    visit_times = df[time_columns]
    session_span = visit_times.max(axis=1) - visit_times.min(axis=1)
    duration_ms = session_span.astype('timedelta64[ms]').astype(int).values.reshape(-1, 1)
    duration = duration_scaler.transform(duration_ms)

    # Earlier experiments (night flag, individual weekday flags, unique-site count,
    # time per site) were tried and are intentionally left out.
    feature_names = ['morning', 'day', 'evening', 'year', 'year_month', 'weekday', 'month',
                     'duration']
    result = np.array([morning, day, evening, year, year_month, weekday, month, duration])

    return result, feature_names

In [129]:
%%time
X_train_features, feature_names = features(train_df)
X_test_features, _ = features(test_df)
X_train = hstack([X_train_sparse, *X_train_features])
X_test = hstack([X_test_sparse, *X_test_features])

CPU times: user 12.5 s, sys: 20.1 ms, total: 12.5 s
Wall time: 12.5 s


In [131]:
# Preview the hand-crafted training features as a table: one row per session, one column per feature.
pd.DataFrame(X_train_features.squeeze(2).T, columns=feature_names)

Unnamed: 0,morning,day,evening,year,year_month,weekday,month,duration
0,1.0,0.0,0.0,2.013,2.01301,5.0,1.0,-0.468233
1,1.0,0.0,0.0,2.013,2.01301,5.0,1.0,5.570015
2,1.0,0.0,0.0,2.013,2.01301,5.0,1.0,-0.454709
3,1.0,0.0,0.0,2.013,2.01301,5.0,1.0,-0.458090
4,1.0,0.0,0.0,2.013,2.01301,5.0,1.0,-0.461471
...,...,...,...,...,...,...,...,...
253556,0.0,0.0,1.0,2.014,2.01404,2.0,4.0,-0.427662
253557,0.0,0.0,1.0,2.014,2.01404,2.0,4.0,0.133564
253558,0.0,0.0,1.0,2.014,2.01404,2.0,4.0,-0.373568
253559,0.0,0.0,1.0,2.014,2.01404,2.0,4.0,-0.207905


### Обучение модели

In [132]:
# Logistic-regression loss optimised with SGD; the seed is fixed so runs are repeatable.
# NOTE(review): scikit-learn 1.1 renamed this loss to 'log_loss' and later releases
# reject 'log' - switch the string when upgrading the environment.
model = SGDClassifier(
    loss='log',
    random_state=17,
    n_jobs=-1,
)

In [67]:
# Hyper-parameter grid: L2 penalty with 11 evenly spaced alphas between 3e-05 and 5e-05.
grid = dict(
    penalty=['l2'],
    alpha=np.linspace(3e-05, 5e-05, 11),
)

In [68]:
%%time
from sklearn.model_selection import GridSearchCV

search = GridSearchCV(model, param_grid=grid, cv=time_split, scoring='roc_auc')
search.fit(X_train, y)

CPU times: user 3min 50s, sys: 3.67 s, total: 3min 53s
Wall time: 1min


GridSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=10, test_size=None),
             estimator=SGDClassifier(loss='log', n_jobs=-1, random_state=17),
             param_grid={'alpha': array([3.0e-05, 3.2e-05, 3.4e-05, 3.6e-05, 3.8e-05, 4.0e-05, 4.2e-05,
       4.4e-05, 4.6e-05, 4.8e-05, 5.0e-05]),
                         'penalty': ['l2']},
             scoring='roc_auc')

In [141]:
# Show the hyper-parameters selected by the grid search and their mean cross-validated ROC AUC.
search.best_params_, search.best_score_

({'alpha': 3.6e-05, 'penalty': 'l2'}, 0.9225028331780019)

In [133]:
%%time
time_split = TimeSeriesSplit(n_splits=10)

cv_scores = cross_val_score(search.best_estimator_, X_train, y, cv=time_split, scoring='roc_auc', n_jobs=-1)

CPU times: user 357 ms, sys: 841 ms, total: 1.2 s
Wall time: 4 s


In [134]:
# Per-fold ROC AUC followed by its mean and standard deviation across the folds.
cv_scores, cv_scores.mean(), cv_scores.std()

(array([0.72891738, 0.81720851, 0.9214645 , 0.95459164, 0.90041319,
        0.96569335, 0.91293948, 0.958499  , 0.93949734, 0.96538048]),
 0.9064604856501518,
 0.07272364437910625)

0.9315069358401142,
 0.044345068769381366 0 2 6

0.9180564468510812,
 0.05889198351691508 baseline

0.9232962337585114,
 0.058257317090901546

0.9287111177172779,
 0.04434090003756869 2

In [136]:
# Fit the selected estimator on the full training set, then predict class
# probabilities for the test sessions.
final_model = search.best_estimator_
final_model.fit(X_train, y)
logit_test_pred_proba = final_model.predict_proba(X_test)

In [137]:
# The hand-crafted features are the last columns of X_train, so their weights are
# the trailing entries of the coefficient vector.
n_extra_features = len(feature_names)
extra_coefs = search.best_estimator_.coef_.flatten()[-n_extra_features:]
pd.DataFrame({'feature': feature_names, 'coef': extra_coefs})

Unnamed: 0,feature,coef
0,morning,-1.674105
1,day,1.807669
2,evening,-0.81907
3,year,-1.386719
4,year_month,-1.386718
5,weekday,-0.443467
6,month,0.085334
7,duration,-0.504429


In [138]:
# Newer scikit-learn releases replaced get_feature_names() with get_feature_names_out();
# use whichever this installation provides. The *_out variant returns an array, so it
# is converted to a list before the hand-crafted feature names are appended.
if hasattr(vectorizer, 'get_feature_names_out'):
    ngram_names = list(vectorizer.get_feature_names_out())
else:
    ngram_names = vectorizer.get_feature_names()

# Top 30 model weights, n-gram features first and hand-crafted features last.
eli5.show_weights(estimator=search.best_estimator_, feature_names=ngram_names + feature_names, top=30)

Weight?,Feature
+3.319,vk.com
+3.161,i1.ytimg.com
+2.846,youwatch.org
+2.536,www.melty.fr
+2.463,www.info-jeunes.net
+2.379,www.audienceinsights.net
+2.364,www.youtube.com
+2.097,r4---sn-gxo5uxg-jqbe.googlevideo.com
+1.886,r1---sn-gxo5uxg-jqbe.googlevideo.com
+1.808,day


In [139]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Save predictions as a CSV submission file.

    Rows are numbered 1..n in the order of ``predicted_labels``; the index
    column is labelled ``index_label`` and the value column ``target``.
    """
    n_rows = predicted_labels.shape[0]
    row_ids = np.arange(1, n_rows + 1)
    submission = pd.DataFrame(predicted_labels, index=row_ids, columns=[target])
    submission.to_csv(out_file, index_label=index_label)

In [140]:
# Column 1 of predict_proba is the probability of the positive class (target == 1).
positive_class_proba = logit_test_pred_proba[:, 1]
write_to_submission_file(positive_class_proba, 'result.csv')