In [1]:
# from scipy.sparse import csr_matrix

### Входные данные

#### Загрузка

In [2]:
# !unzip capstone_user_identification.zip

In [3]:
# from __future__ import division, print_function
# отключим всякие предупреждения Anaconda
import warnings
warnings.filterwarnings('ignore')

import os
import pandas as pd
import pickle

In [4]:
# PATH_TO_DATA = '../capstone_user_identification'
PATH_TO_DATA = '../input/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2'

times = ['time%s' % i for i in range(1, 11)]
sites = ['site%s' % i for i in range(1, 11)]
df_train = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_sessions.csv'), index_col='session_id', parse_dates=times)
df_test = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_sessions.csv'), index_col='session_id', parse_dates=times)
df_train.shape, df_test.shape

((253561, 21), (82797, 20))

Словарь сайтов

In [5]:
with open(os.path.join(PATH_TO_DATA, 'site_dic.pkl'), 'rb') as f:
    site2id  = pickle.load(f)
id2site = {v:k for (k, v) in site2id.items()}
id2site[0] = 'unknown'

#### Обработка

In [6]:
df_train.sort_values(by='time1', inplace=True)

y_train = df_train["target"].astype('int').values

### Feature engineering

In [20]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
# from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler

import numpy as np

#### Sites

In [8]:
class SiteFeaturesPreparator(BaseEstimator, TransformerMixin):
    """
    Fill NaN with zero values;
    Prepare a (Count)Vectorizer friendly 2D-list from data.
    """
    def __init__(self):
        super().__init__()
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        sites = ['site%s' % i for i in range(1, 11)]
        # Convert dataframe rows to strings
#         return X[sites].fillna(0).astype('int')#.apply(lambda row: ' '.join([self.id2site[i] for i in row]), axis=1).tolist()
        return X[sites].fillna(0).astype('int').apply(lambda row: ' '.join([str(site_id) for site_id in row]), axis=1).tolist()

In [9]:
vectorizer_pipeline = Pipeline([
    ("preparator", SiteFeaturesPreparator()),
    ("vectorizer", CountVectorizer(ngram_range=(1, 2), max_features=20000))  # tokenizer=lambda s: s.split()
#     ("vectorizer", TfidfVectorizer(ngram_range=(1, 10), max_features=70000))
])

In [10]:
%%time
sites_train_vectorized = vectorizer_pipeline.fit_transform(df_train)
sites_train_vectorized

CPU times: user 7.98 s, sys: 147 ms, total: 8.13 s
Wall time: 8.13 s


<253561x20000 sparse matrix of type '<class 'numpy.int64'>'
	with 2498250 stored elements in Compressed Sparse Row format>

In [11]:
# feature_names = ['morning', 'day', 'evening', 'weekday', 'year', 'session_duration']
# + list(calendar.day_name)

#### Times

In [64]:
class TimesFeaturesAdder(BaseEstimator, TransformerMixin):
    """
    Add new attributes to training and test set.
    """ 
    def fit(self, X, y=None):
        return self 
    
    def transform(self, X, y=None):
        # intraday features
        hour = X['time1'].apply(lambda ts: ts.hour)
        minutes = X['time1'].apply(lambda ts: ts.minute)
        hour_minutes = hour + minutes / 60
        dummy_hours = pd.get_dummies(hour)
        
        morning = ((hour >= 7) & (hour <= 11)).astype('int')
        day = ((hour >= 12) & (hour <= 18)).astype('int')
        evening = ((hour >= 19) & (hour <= 23)).astype('int')
        
        sin_hour = np.sin(2 * np.pi * hour / 24)
        cos_hour = np.cos(2 * np.pi * hour / 24)
        
        # date
        month = X['time1'].apply(lambda ts: ts.month)
        year = X['time1'].apply(lambda ts: ts.year)
        year_month = year + month
        
        features_df = pd.DataFrame({
            'hour': hour,
            'morning': morning,
            'day': day,
            'evening': evening,
            'sin_hour': sin_hour,
            'cos_hour': cos_hour,
            'minutes': minutes,
            'hour_minutes': hour_minutes,
            'month': month,
            'year': year,
            'year_month': year_month,
        })

        return pd.concat([features_df, dummy_hours], axis=1)
    
all_features = ['hour', 'morning', 'day', 'evening', 'sin_hour', 'cos_hour', 
                'minutes', 'hour_minutes', 'month', 'year', 'year_month'] + list(range(7, 24))

In [65]:
%%time
time_features_pipeline = Pipeline([
    ("times_adder", TimesFeaturesAdder()),
    ("scaler", StandardScaler())
])

times_train = time_features_pipeline.fit_transform(df_train)
times_train = pd.DataFrame(times_train, columns=all_features)
times_train

CPU times: user 4.99 s, sys: 145 ms, total: 5.13 s
Wall time: 5.13 s


Unnamed: 0,hour,morning,day,evening,sin_hour,cos_hour,minutes,hour_minutes,month,year,...,14,15,16,17,18,19,20,21,22,23
0,-1.357366,1.039061,-0.98128,-0.171577,1.406483,0.644886,-1.400885,-1.487757,-0.943567,-1.547749,...,-0.3474,-0.305463,-0.28249,-0.230859,-0.124952,-0.07817,-0.068957,-0.082278,-0.076284,-0.072862
1,-1.357366,1.039061,-0.98128,-0.171577,1.406483,0.644886,0.451667,-1.318638,-0.943567,-1.547749,...,-0.3474,-0.305463,-0.28249,-0.230859,-0.124952,-0.07817,-0.068957,-0.082278,-0.076284,-0.072862
2,-1.357366,1.039061,-0.98128,-0.171577,1.406483,0.644886,1.204266,-1.249933,-0.943567,-1.547749,...,-0.3474,-0.305463,-0.28249,-0.230859,-0.124952,-0.07817,-0.068957,-0.082278,-0.076284,-0.072862
3,-1.357366,1.039061,-0.98128,-0.171577,1.406483,0.644886,1.204266,-1.249933,-0.943567,-1.547749,...,-0.3474,-0.305463,-0.28249,-0.230859,-0.124952,-0.07817,-0.068957,-0.082278,-0.076284,-0.072862
4,-1.357366,1.039061,-0.98128,-0.171577,1.406483,0.644886,1.204266,-1.249933,-0.943567,-1.547749,...,-0.3474,-0.305463,-0.28249,-0.230859,-0.124952,-0.07817,-0.068957,-0.082278,-0.076284,-0.072862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253556,3.390349,-0.962408,-0.98128,5.828302,-0.399203,5.131687,0.220098,3.416696,-0.137395,0.646100,...,-0.3474,-0.305463,-0.28249,-0.230859,-0.124952,-0.07817,-0.068957,-0.082278,-0.076284,13.724647
253557,3.390349,-0.962408,-0.98128,5.828302,-0.399203,5.131687,0.277990,3.421981,-0.137395,0.646100,...,-0.3474,-0.305463,-0.28249,-0.230859,-0.124952,-0.07817,-0.068957,-0.082278,-0.076284,13.724647
253558,3.390349,-0.962408,-0.98128,5.828302,-0.399203,5.131687,0.509559,3.443121,-0.137395,0.646100,...,-0.3474,-0.305463,-0.28249,-0.230859,-0.124952,-0.07817,-0.068957,-0.082278,-0.076284,13.724647
253559,3.390349,-0.962408,-0.98128,5.828302,-0.399203,5.131687,0.509559,3.443121,-0.137395,0.646100,...,-0.3474,-0.305463,-0.28249,-0.230859,-0.124952,-0.07817,-0.068957,-0.082278,-0.076284,13.724647


In [None]:
#         # season features
#         summer = ((month >= 6) & (month <= 8)).astype('int')
        
#         # day of the week features
#         weekday = X['time1'].apply(lambda ts: ts.weekday()).astype('int')
#         #     is_monday = np.isin(weekday, [0])
#         #     is_wednesday = np.isin(weekday, [2])
#         #     is_sunday = np.isin(weekday, [6])
#         #     is_weekend = np.isin(weekday, [0, 2, 6])
#         #     weekdays = pd.get_dummies(weekday.flatten()).to_numpy().reshape(7, -1, 1)
        
#         session_duration = (X[times].max(axis=1) - X[times].min(axis=1)).astype('timedelta64[ms]').astype(int) #** 0.2

In [None]:
# class ScaledAttributesAdder(BaseEstimator, TransformerMixin):
#     """
#     Add new features, that should be scaled.
#     """
#     def fit(self, X, y=None):
#         return self
    
#     def transform(self, X, y=None):
#         # session time features
#         times = ['time%s' % i for i in range(1, 11)]
#         sites = ['site%s' % i for i in range(1, 11)]
        
#         # session duration: take to the power of 1/5 to normalize the distribution
#         session_duration = (X[times].max(axis=1) - X[times].min(axis=1)).astype('timedelta64[ms]').astype(int) #** 0.2
        
#         # number of sites visited in a session
#         number_of_sites = X[times].isnull().sum(axis=1).apply(lambda x: 10 - x)
        
#         # average time spent on one site during a session
#         time_per_site = (session_duration / number_of_sites) #** 0.2
        
# #         unique_sites = X[sites].apply(lambda sites: sites.nunique(), axis=1)
        
#         X = np.c_[session_duration.values]
#         return X

In [None]:
# scaled_attributes_pipeline = Pipeline([
#     ("adder", ScaledAttributesAdder()),
#     ("scaler", StandardScaler())
# ])

# feature_pipeline = FeatureUnion(transformer_list=[
#     ('attributes_pipeline', attributes_pipeline),
#     ('scaled_attributes_pipeline', scaled_attributes_pipeline)
# ])

### Feature selection

In [22]:
from scipy.sparse import hstack

##############################################################

In [68]:
selected_features = ['morning', 'day', 'evening', 14] #+ list(range(7, 23))

X_train = hstack([sites_train_vectorized, times_train[selected_features]])
X_train

<253561x20004 sparse matrix of type '<class 'numpy.float64'>'
	with 3512494 stored elements in COOrdinate format>

##############################################################

### Модель

In [24]:
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

#### Обучение и подбор гиперпараметров

In [25]:
model = SGDClassifier(loss='log', random_state=2179, n_jobs=-1)
time_split = TimeSeriesSplit(n_splits=10)

In [26]:
# grid = {
#     'penalty': ['l2'],
#     'alpha': np.linspace(3e-05, 5e-05, 11)
# }

In [23]:
# %%time

# search = GridSearchCV(model, param_grid=grid, cv=time_split, scoring='roc_auc')
# search.fit(X_train, y_train)

In [24]:
# search.best_params_, search.best_score_

##############################################################

In [69]:
%%time

cv_scores = cross_val_score(model, X_train, y_train, cv=time_split, scoring='roc_auc', n_jobs=-1)
print(cv_scores.mean(), cv_scores.std())
cv_scores

0.9323758313310435 0.036240728279154644
CPU times: user 164 ms, sys: 39.4 ms, total: 203 ms
Wall time: 1.86 s


array([0.90759192, 0.83555746, 0.94217808, 0.94683687, 0.92353782,
       0.95534013, 0.9556413 , 0.93516266, 0.96607876, 0.95583332])

##############################################################

Опции:

- CountVectorizer ngram_range max_features
    - (1, 1)
        - max-k: 0.8530563468012738, 0.08925871896536618
        - 20000: 0.8531100483866421, 0.08919480787349321
        - 10000: 0.8539501616208991, 0.08875887000416063
        - 5000: 0.8529069996789713, 0.08888832394047236
    - (1, 2)
        - max-k: 0.8577666221636842, 0.09371826647019514
        - 50000: 0.858414628277273, 0.09308624515067462
        - 20000: 0.8600043484189314, 0.0901820258132247
        - 10000: 0.8565645409290898, 0.09086786443064755
    - (1, 3)
        - max-k: 0.8560039171386127, 0.09631952839070929
        - 50000: 0.859467798898789, 0.09245656399159023
        - 20000: 0.8598701871410264 0.09139402713364686
        - 10000: 0.8560423530991228 0.09031323488865993
    - (1, 4)
        - max-k: 0.8533973079027204 0.10052454183974614
        - 20000: 0.8455330939583249 0.08401555400318593
    - (1, 9)
        - 70000: .8604293420872524 0.09704219048563271
    - (1, 10)
        - 20000: 0.8419623744960409 0.0864989715893941
        - 50000: 0.859496836616332 0.09749267442904218
        - 70000: 0.8604687084516304 0.0970295359566777
        - 100000: 0.8596446298744558 0.09781978585361331
- TfidfVectorizer ngram_range max_features
    - (1, 1)
        - max-k: 0.8391530980059445, 0.08120401396593267
        - 20000: 0.8399308212634944 0.08111430205756767
    - (1, 2)
        - max-k: 0.8294502665752119, 0.08421253009084147
        - 20000: 0.8398228277779543 0.07824510376730294
    - (1, 3)
        - max-k: 0.8230380243367905, 0.08660127058036823
    - (1, 10):
        - 70000: 0.8359176499242273 0.08504483506085675
        
Выбираем CountVectorizer(ngram_range=(1, 2), max_features=20000)

0.9197500019625826

0.9315069358401142,
 0.044345068769381366 0 2 6

0.9180564468510812,
 0.05889198351691508 baseline

0.9232962337585114,
 0.058257317090901546

0.9287111177172779,
 0.04434090003756869 2

### Тест

In [70]:
# import eli5
# import calendar

def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    # turn predictions into data frame and save as csv file
    predicted_df = pd.DataFrame(predicted_labels,
                               
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [73]:
%%time

sites_test_vectorized = vectorizer_pipeline.transform(df_test)
times_test = time_features_pipeline.transform(df_test)
times_test = pd.DataFrame(times_test, columns=all_features)
X_test = hstack([sites_test_vectorized, times_test[selected_features]])
X_test

CPU times: user 4.1 s, sys: 44.9 ms, total: 4.15 s
Wall time: 4.15 s


<82797x20004 sparse matrix of type '<class 'numpy.float64'>'
	with 1084813 stored elements in COOrdinate format>

In [74]:
# search.best_estimator_.fit(X_train, y_train)
# logit_test_pred_proba = search.best_estimator_.predict_proba(X_test)
model.fit(X_train, y_train)
logit_test_pred_proba = model.predict_proba(X_test)

In [None]:
# pd.DataFrame({'feature': feature_names, 'coef': search.best_estimator_.coef_.flatten()[-len(feature_names):]})

In [None]:
# eli5.show_weights(estimator=search.best_estimator_, feature_names=vectorizer_pipeline['vectorizer'].get_feature_names() \
#                   + feature_names, top=30)

In [75]:
write_to_submission_file(logit_test_pred_proba[:, 1], 'result.csv')