sgdclassifier | bag of sites | 0.83341

In [222]:
# from __future__ import division, print_function
# отключим всякие предупреждения Anaconda
import warnings
warnings.filterwarnings('ignore')
import os
import pickle
import numpy as np
import pandas as pd
# from scipy.sparse import csr_matrix
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler

from scipy.sparse import hstack
import eli5
import calendar

In [None]:
# !unzip capstone_user_identification.zip

### Обработка входных данных

In [50]:
# PATH_TO_DATA = '../capstone_user_identification'
PATH_TO_DATA = '../input/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2'

times = ['time%s' % i for i in range(1, 11)]
sites = ['site%s' % i for i in range(1, 11)]
train_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_sessions.csv'), index_col='session_id', parse_dates=times)
test_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_sessions.csv'), index_col='session_id', parse_dates=times)
train_df.shape, test_df.shape

((253561, 21), (82797, 20))

In [51]:
with open(os.path.join(PATH_TO_DATA, 'site_dic.pkl'), 'rb') as f:
    site2id  = pickle.load(f)
id2site = {v:k for (k, v) in site2id.items()}
id2site[0] = 'unknown'

In [52]:
train_df.sort_values(by='time1', inplace=True)

In [56]:
train_df_sites = train_df[sites].fillna(0).astype('int').apply(lambda row: ' '.join([id2site[i] for i in row]), axis=1).tolist()
test_df_sites = test_df[sites].fillna(0).astype('int').apply(lambda row: ' '.join([id2site[i] for i in row]), axis=1).tolist()

In [100]:
vectorizer_params = {
    'ngram_range': (1, 2), 
#     'max_features': 50000,
    'tokenizer': lambda s: s.split()
}

In [101]:
%%time
# vectorizer = TfidfVectorizer(**vectorizer_params)
vectorizer = CountVectorizer(**vectorizer_params)
X_train_sparse = vectorizer.fit_transform(train_df_sites)
X_test_sparse = vectorizer.transform(test_df_sites)
y = train_df['target']
X_train_sparse.shape, X_test_sparse.shape

CPU times: user 10.2 s, sys: 61.3 ms, total: 10.3 s
Wall time: 10.3 s


((253561, 363770), (82797, 363770))

In [102]:
scaler = StandardScaler()
duration_train = (train_df[times].max(axis=1) - train_df[times].min(axis=1))\
                    .astype('timedelta64[ms]').astype(int).values.reshape(-1, 1)
scaler.fit(duration_train)

StandardScaler()

In [276]:
def features(df):
    # intraday features
    hour = df['time1'].apply(lambda ts: ts.hour).values
    morning = ((hour >= 7) & (hour <= 11)).astype('int').reshape(-1, 1)
    day = ((hour >= 12) & (hour <= 18)).astype('int').reshape(-1, 1)
    evening = ((hour >= 19) & (hour <= 23)).astype('int').reshape(-1, 1)
#     night = ((hour >= 0) & (hour <= 6)).astype('int').reshape(-1, 1)
    
    # week features
    weekday = df.time1.apply(lambda ts: ts.weekday()).values.reshape(-1, 1)
#     is_monday = np.isin(weekday, [0])
#     is_wednesday = np.isin(weekday, [2])
#     is_sunday = np.isin(weekday, [6])
#     is_weekend = np.isin(weekday, [0, 2, 6])
#     weekdays = pd.get_dummies(weekday.flatten()).to_numpy().reshape(7, -1, 1)
    
    # month features
#     month = df.time1.apply(lambda ts: ts.month).values.reshape(-1, 1)

    # year features
    year = df['time1'].apply(lambda t: 100 * t.year).values.reshape(-1, 1) / 1e5
#     year_month = df['time1'].apply(lambda t: 100 * t.year + t.month).values.reshape(-1, 1) / 1e5

    # session features
    duration = (df[times].max(axis=1) - df[times].min(axis=1))\
                    .astype('timedelta64[ms]').astype(int).values.reshape(-1, 1)
    duration = scaler.transform(duration)
#     unique_sites = df[sites].apply(lambda sites: sites.nunique(), axis=1).values.reshape(-1, 1)
    
    # 
    feature_names =  ['morning', 'day', 'evening', 'year', 'weekday', 'duration'] # + list(calendar.day_name)
    result = np.array([morning, day, evening, year, weekday, duration]) # is_wednesday, is_monday, is_sunday
#     result = np.concatenate((result, weekdays), axis=0)
    
    return result, feature_names

In [277]:
%%time
X_train_features, feature_names = features(train_df)
X_test_features, _ = features(test_df)
X_train = hstack([X_train_sparse, *X_train_features])
X_test = hstack([X_test_sparse, *X_test_features])

CPU times: user 8.76 s, sys: 147 ms, total: 8.91 s
Wall time: 8.92 s


### Обучение модели

In [268]:
model = SGDClassifier(loss='log', random_state=17, n_jobs=-1)

In [280]:
grid = {
    'penalty': ['l1', 'l2'],
    'alpha': np.logspace(-1, 2, 4)
}

In [289]:
%%time
from sklearn.model_selection import GridSearchCV

search = GridSearchCV(model, param_grid=grid, cv=time_split, scoring='roc_auc')
search.fit(X_train, y)

CPU times: user 45.5 s, sys: 20.5 ms, total: 45.5 s
Wall time: 42.9 s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
             estimator=SGDClassifier(loss='log', n_jobs=-1, random_state=17),
             param_grid={'alpha': array([  0.1,   1. ,  10. , 100. ]),
                         'penalty': ['l1', 'l2']},
             scoring='roc_auc')

In [290]:
search.best_params_

{'alpha': 0.1, 'penalty': 'l2'}

In [291]:
search.best_score_

0.7067298688922502

In [285]:
%%time
time_split = TimeSeriesSplit(n_splits=10)

cv_scores = cross_val_score(search.best_estimator_, X_train, y, cv=time_split, scoring='roc_auc', n_jobs=-1)

CPU times: user 361 ms, sys: 299 ms, total: 660 ms
Wall time: 5.64 s


In [286]:
cv_scores, cv_scores.mean(), cv_scores.std()

(array([0.46242008, 0.49133259, 0.59190528, 0.74933785, 0.65257027,
        0.83426146, 0.32625859, 0.88772218, 0.74440288, 0.84770174]),
 0.6587912916122842,
 0.17779449423431953)

0.9315069358401142,
 0.044345068769381366 0 2 6

0.9180564468510812,
 0.05889198351691508 baseline

0.9232962337585114,
 0.058257317090901546

0.9287111177172779,
 0.04434090003756869 2

In [271]:
model.fit(X_train, y)
logit_test_pred_proba = model.predict_proba(X_test)

In [272]:
pd.DataFrame({'feature': feature_names, 'coef': model.coef_.flatten()[-len(feature_names):]})

Unnamed: 0,feature,coef
0,morning,-1.912632
1,day,1.139276
2,evening,-0.493524
3,year,-2.564895
4,weekday,-0.429454
5,duration,-0.188917
6,unique_sites,0.071069


In [244]:
eli5.show_weights(estimator=model, feature_names=vectorizer.get_feature_names() + feature_names, top=30)

Weight?,Feature
+1.201,youwatch.org
+1.142,day
+1.045,is_monday
+0.857,vk.com
+0.798,cid-ed6c3e6a5c6608a4.users.storage.live.com
+0.783,www.audienceinsights.net
+0.744,www.melty.fr
+0.718,fr.glee.wikia.com
+0.699,www.banque-chalus.fr
+0.655,www.video.tt


In [245]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    # turn predictions into data frame and save as csv file
    predicted_df = pd.DataFrame(predicted_labels,
                               
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [246]:
write_to_submission_file(logit_test_pred_proba[:, 1], 'result.csv')