In [1]:
# from scipy.sparse import csr_matrix

### Входные данные

#### Загрузка

In [2]:
# !unzip capstone_user_identification.zip

In [1]:
# from __future__ import division, print_function
# отключим всякие предупреждения Anaconda
import warnings
warnings.filterwarnings('ignore')

import os
import pandas as pd
import pickle

In [40]:
PATH_TO_DATA = '../capstone_user_identification'
# PATH_TO_DATA = '../input/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2'

times = ['time%s' % i for i in range(1, 11)]
sites = ['site%s' % i for i in range(1, 11)]
df_train = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_sessions.csv'), index_col='session_id', parse_dates=times)
df_test = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_sessions.csv'), index_col='session_id', parse_dates=times)
df_train.shape, df_test.shape

((253561, 21), (82797, 20))

In [3]:
# df_train['year'] = df_train['time1'].apply(lambda ts: ts.year)
# df_train['month'] = df_train['time1'].apply(lambda ts: ts.month)

# df_train = df_train[df_train.year == 2014]

# df_train = df_train[~((df_train.year == 2013) & (df_train.month.isin(range(5))))]

Словарь сайтов

In [6]:
with open(os.path.join(PATH_TO_DATA, 'site_dic.pkl'), 'rb') as f:
    site2id  = pickle.load(f)
id2site = {v:k for (k, v) in site2id.items()}
id2site[0] = 'unknown'

#### Обработка

In [41]:
df_train.sort_values(by='time1', inplace=True)

y_train = df_train["target"].astype('int').values

### Feature engineering

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
# from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler

import numpy as np
import calendar

#### Sites

In [9]:
class SiteFeaturesPreparator(BaseEstimator, TransformerMixin):
    """
    Fill NaN with zero values;
    Prepare a (Count)Vectorizer friendly 2D-list from data.
    """
    def __init__(self):
        super().__init__()
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        sites = ['site%s' % i for i in range(1, 11)]
        # Convert dataframe rows to strings
#         return X[sites].fillna(0).astype('int')#.apply(lambda row: ' '.join([self.id2site[i] for i in row]), axis=1).tolist()
        return X[sites].fillna(0).astype('int').apply(lambda row: ' '.join([str(site_id) for site_id in row]), axis=1).tolist()

In [42]:
vectorizer_pipeline = Pipeline([
    ("preparator", SiteFeaturesPreparator()),
    ("vectorizer", CountVectorizer(ngram_range=(1, 2), max_features=20000))  # tokenizer=lambda s: s.split()
#     ("vectorizer", TfidfVectorizer(ngram_range=(1, 10), max_features=70000))
])

In [43]:
%%time
sites_train_vectorized = vectorizer_pipeline.fit_transform(df_train)
sites_train_vectorized

CPU times: user 7.16 s, sys: 110 ms, total: 7.27 s
Wall time: 7.27 s


<253561x20000 sparse matrix of type '<class 'numpy.int64'>'
	with 2498250 stored elements in Compressed Sparse Row format>

In [12]:
class SitesFeaturesAdder(BaseEstimator, TransformerMixin):
    """
    Add new attributes to training and test set.
    """ 
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        sites = ['site%s' % i for i in range(1, 11)]
        
#         n_sites = X[sites].apply(lambda session: session.count(), axis=1)
        n_unique = X[sites].fillna(0).apply(lambda session: session.nunique(), axis=1)
        dummy_n_unique = pd.get_dummies(n_unique)
        
        site_counts = {}
        for session in X[sites].fillna(0).values:
            for site in session:
                if site in site_counts:
                    site_counts[site] += 1
                else:
                    site_counts[site] = 1
        site_counts.pop(0)
        top_n = [k for k, _ in sorted(site_counts.items(), key=lambda item: item[1], reverse=True)][:10]
        has_top10 = X[sites].fillna(0).apply(lambda session: session.isin(top_n).any().astype('int'), axis=1)
        
        features_df = pd.DataFrame({
#             'n_sites': n_sites,
            'n_unique': n_unique,
            'has_top10': has_top10,
        })
        
        return pd.concat([features_df, dummy_n_unique], axis=1)
    
all_site_features = ['n_unique', 'has_top10'] + [f'{i}_unique' for i in range(1, 11)]  # has_top10 n_sites

In [44]:
%%time
site_features_pipeline = Pipeline([
    ("sites_adder", SitesFeaturesAdder()),
    ("scaler", StandardScaler())
])

sites_train = site_features_pipeline.fit_transform(df_train)
sites_train = pd.DataFrame(sites_train, columns=all_site_features, index=df_train.index)
sites_train

CPU times: user 1min 12s, sys: 1 s, total: 1min 13s
Wall time: 1min 12s


Unnamed: 0_level_0,n_unique,has_top10,1_unique,2_unique,3_unique,4_unique,5_unique,6_unique,7_unique,8_unique,9_unique,10_unique
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
21669,-1.126296,-1.400389,-0.214779,-0.302506,3.211641,-0.323591,-0.380664,-0.408702,-0.421505,-0.393215,-0.318561,-0.210156
54843,-1.126296,-1.400389,-0.214779,-0.302506,3.211641,-0.323591,-0.380664,-0.408702,-0.421505,-0.393215,-0.318561,-0.210156
77292,0.112680,-1.400389,-0.214779,-0.302506,-0.311367,-0.323591,-0.380664,2.446772,-0.421505,-0.393215,-0.318561,-0.210156
114021,-0.300312,-1.400389,-0.214779,-0.302506,-0.311367,-0.323591,2.626988,-0.408702,-0.421505,-0.393215,-0.318561,-0.210156
146670,0.112680,-1.400389,-0.214779,-0.302506,-0.311367,-0.323591,-0.380664,2.446772,-0.421505,-0.393215,-0.318561,-0.210156
...,...,...,...,...,...,...,...,...,...,...,...,...
12224,0.112680,0.714087,-0.214779,-0.302506,-0.311367,-0.323591,-0.380664,2.446772,-0.421505,-0.393215,-0.318561,-0.210156
164438,0.525672,0.714087,-0.214779,-0.302506,-0.311367,-0.323591,-0.380664,-0.408702,2.372451,-0.393215,-0.318561,-0.210156
12221,0.938664,0.714087,-0.214779,-0.302506,-0.311367,-0.323591,-0.380664,-0.408702,-0.421505,2.543140,-0.318561,-0.210156
156968,0.525672,-1.400389,-0.214779,-0.302506,-0.311367,-0.323591,-0.380664,-0.408702,2.372451,-0.393215,-0.318561,-0.210156


#### Times

In [188]:
class TimesFeaturesAdder(BaseEstimator, TransformerMixin):
    """
    Add new attributes to training and test set.
    """ 
    def fit(self, X, y=None):
        return self 
    
    def transform(self, X, y=None):
        # intraday features
        hour = X['time1'].apply(lambda ts: ts.hour)
        minutes = X['time1'].apply(lambda ts: ts.minute)
        minutes_10 = (minutes >= 0) & (minutes < 10)
        minutes_20 = (minutes >= 10) & (minutes < 20)
        minutes_30 = (minutes >= 20) & (minutes < 30)
        minutes_40 = (minutes >= 30) & (minutes < 40)
        minutes_50 = (minutes >= 40) & (minutes < 50)
        minutes_60 = (minutes >= 50) & (minutes < 60)
        hour_minutes = hour + minutes / 60
        hour_minutes_16 = (hour_minutes > 16) & (hour_minutes < 17)
        dummy_hours = pd.get_dummies(hour)
        
        morning = ((hour >= 7) & (hour <= 11)).astype('int')
        midday = ((hour >= 12) & (hour <= 18)).astype('int')
        evening = ((hour >= 19) & (hour <= 23)).astype('int')
        
        sin_hour_minutes = np.sin(2 * np.pi * hour_minutes / 24)
        cos_hour_minutes = np.cos(2 * np.pi * hour_minutes / 24)
        
        # date
        day = X['time1'].apply(lambda ts: ts.day)
        day_3 = (day == 3)
        day_23 = (day == 23)
        month = X['time1'].apply(lambda ts: ts.month)
#         dummy_months = pd.get_dummies(month)
        year = X['time1'].apply(lambda ts: ts.year)
        year_month = year + month
        summer = ((month >= 6) & (month <= 8)).astype('int')
        october = (month == 10)
        
        weekday = X['time1'].apply(lambda ts: ts.weekday())
        dummy_weekdays = pd.get_dummies(data=weekday)
        weekend = np.isin(weekday, [5, 6])
        
        times = ['time%s' % i for i in range(1, 11)]
        session_duration = (X[times].max(axis=1) - X[times].min(axis=1)).astype('timedelta64[ms]').astype(int) #** 0.2
        
        features_df = pd.DataFrame({
            'hour': hour,
            'morning': morning,
            'midday': midday,
            'evening': evening,
            'sin_hour_minutes': sin_hour_minutes,
            'cos_hour_minutes': cos_hour_minutes,
            'minutes': minutes,
            'minutes_10': minutes_10,
            'minutes_20': minutes_20,
            'minutes_30': minutes_30,
            'minutes_40': minutes_40,
            'minutes_50': minutes_50,
            'minutes_60': minutes_60,
            'hour_minutes': hour_minutes,
            'hour_minutes_16': hour_minutes_16,
            'day': day,
            'day_3': day_3,
            'day_23': day_23,
            'october': october,
            'year': year,
            'year_month': year_month,
            'summer': summer,
            'weekday': weekday,
            'weekend': weekend,
            'session_duration': session_duration,
        })

        return pd.concat([features_df, dummy_hours, dummy_weekdays], axis=1)  # dummy_months
    
all_time_features = ['hour', 'morning', 'midday', 'evening', 'sin_hour_minutes', 'cos_hour_minutes',
                'minutes', 'minutes_10', 'minutes_20', 'minutes_30', 'minutes_40', 'minutes_50', 'minutes_60', 
                     'hour_minutes', 'hour_minutes_16', 'day', 'day_3', 'day_23', 'october', 'year', 'year_month',
                'summer', 'weekday', 'weekend', 'session_duration'] + list(range(7, 24)) + list(calendar.day_name) #\
#                                     + list(calendar.month_name)[1:]

In [189]:
%%time
time_features_pipeline = Pipeline([
    ("times_adder", TimesFeaturesAdder()),
    ("scaler", StandardScaler())
])

times_train = time_features_pipeline.fit_transform(df_train)
times_train = pd.DataFrame(times_train, columns=all_time_features, index=df_train.index)
times_train

CPU times: user 6.95 s, sys: 181 ms, total: 7.13 s
Wall time: 7.1 s


Unnamed: 0_level_0,hour,morning,midday,evening,sin_hour_minutes,cos_hour_minutes,minutes,minutes_10,minutes_20,minutes_30,...,21,22,23,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,-1.357366,1.039061,-0.98128,-0.171577,1.573352,0.527614,-1.400885,2.220236,-0.45577,-0.452612,...,-0.082278,-0.076284,-0.072862,-0.436072,-0.487314,-0.532230,-0.459143,-0.440082,3.879327,-0.172561
54843,-1.357366,1.039061,-0.98128,-0.171577,1.440845,0.204134,0.451667,-0.450403,-0.45577,-0.452612,...,-0.082278,-0.076284,-0.072862,-0.436072,-0.487314,-0.532230,-0.459143,-0.440082,3.879327,-0.172561
77292,-1.357366,1.039061,-0.98128,-0.171577,1.379766,0.082374,1.204266,-0.450403,-0.45577,-0.452612,...,-0.082278,-0.076284,-0.072862,-0.436072,-0.487314,-0.532230,-0.459143,-0.440082,3.879327,-0.172561
114021,-1.357366,1.039061,-0.98128,-0.171577,1.379766,0.082374,1.204266,-0.450403,-0.45577,-0.452612,...,-0.082278,-0.076284,-0.072862,-0.436072,-0.487314,-0.532230,-0.459143,-0.440082,3.879327,-0.172561
146670,-1.357366,1.039061,-0.98128,-0.171577,1.379766,0.082374,1.204266,-0.450403,-0.45577,-0.452612,...,-0.082278,-0.076284,-0.072862,-0.436072,-0.487314,-0.532230,-0.459143,-0.440082,3.879327,-0.172561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12224,3.390349,-0.962408,-0.98128,5.828302,-0.026963,4.820082,0.220098,-0.450403,-0.45577,-0.452612,...,-0.082278,-0.076284,13.724647,-0.436072,-0.487314,1.878888,-0.459143,-0.440082,-0.257777,-0.172561
164438,3.390349,-0.962408,-0.98128,5.828302,-0.019830,4.821512,0.277990,-0.450403,-0.45577,-0.452612,...,-0.082278,-0.076284,13.724647,-0.436072,-0.487314,1.878888,-0.459143,-0.440082,-0.257777,-0.172561
12221,3.390349,-0.962408,-0.98128,5.828302,0.008734,4.826691,0.509559,-0.450403,-0.45577,-0.452612,...,-0.082278,-0.076284,13.724647,-0.436072,-0.487314,1.878888,-0.459143,-0.440082,-0.257777,-0.172561
156968,3.390349,-0.962408,-0.98128,5.828302,0.008734,4.826691,0.509559,-0.450403,-0.45577,-0.452612,...,-0.082278,-0.076284,13.724647,-0.436072,-0.487314,1.878888,-0.459143,-0.440082,-0.257777,-0.172561


In [49]:
class TimeDiffsFeaturesAdder(BaseEstimator, TransformerMixin):
    """
    Add new attributes to training and test set.
    """ 
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        def get_time_diff(row): 
            time_length = row.shape[0] - 1 
            time_diff = [0] * time_length 
            i = 0
            while (i < time_length) and pd.notnull(row[i+1]):
                time_diff[i] = (row[i+1] - row[i]) / np.timedelta64(1, 's') 
                i += 1 
            return time_diff
        
        time_diff = []
        times = ['time%s' % i for i in range(1, 11)]
        for row in df_train[times].values:
            time_diff.append(get_time_diff(row))    
        
        return time_diff

In [50]:
%%time
time_diffs_features_pipeline = Pipeline([
    ("times_adder", TimeDiffsFeaturesAdder()),
#     ("scaler", StandardScaler())
])

time_diffs_train = time_diffs_features_pipeline.fit_transform(df_train)
diff_names = [f'time_diff_{i}' for i in range(1, 10)]
time_diffs_train = pd.DataFrame(time_diffs_train, columns=diff_names, index=df_train.index)
time_diffs_train

CPU times: user 14.1 s, sys: 48.4 ms, total: 14.1 s
Wall time: 14.1 s


Unnamed: 0_level_0,time_diff_1,time_diff_2,time_diff_3,time_diff_4,time_diff_5,time_diff_6,time_diff_7,time_diff_8,time_diff_9
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
21669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54843,0.0,1784.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
77292,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
114021,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
146670,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
12224,1.0,3.0,0.0,1.0,0.0,1.0,0.0,3.0,3.0
164438,1.0,1.0,1.0,0.0,58.0,13.0,43.0,30.0,31.0
12221,2.0,3.0,5.0,4.0,2.0,11.0,0.0,0.0,1.0
156968,0.0,2.0,0.0,1.0,1.0,0.0,27.0,1.0,45.0


In [19]:
# class ScaledAttributesAdder(BaseEstimator, TransformerMixin):
#     """
#     Add new features, that should be scaled.
#     """
#     def fit(self, X, y=None):
#         return self
    
#     def transform(self, X, y=None):
#         # session time features
#         times = ['time%s' % i for i in range(1, 11)]
#         sites = ['site%s' % i for i in range(1, 11)]
        
#         # session duration: take to the power of 1/5 to normalize the distribution
#         session_duration = (X[times].max(axis=1) - X[times].min(axis=1)).astype('timedelta64[ms]').astype(int) #** 0.2
        
#         # number of sites visited in a session
#         number_of_sites = X[times].isnull().sum(axis=1).apply(lambda x: 10 - x)
        
#         # average time spent on one site during a session
#         time_per_site = (session_duration / number_of_sites) #** 0.2
        
# #         unique_sites = X[sites].apply(lambda sites: sites.nunique(), axis=1)
        
#         X = np.c_[session_duration.values]
#         return X

In [20]:
# feature_pipeline = FeatureUnion(transformer_list=[
#     ('attributes_pipeline', attributes_pipeline),
#     ('scaled_attributes_pipeline', scaled_attributes_pipeline)
# ])

### Feature selection

In [18]:
from scipy.sparse import hstack

##### Отберём строки за 2014 год

In [22]:
# times_train['target'] = y_train

In [23]:
# times_train.year.value_counts()

2013: -1.5477491026974153

2014: 0.6460995507975896

In [24]:
# times_train_2014 = times_train[times_train.year == 0.6460995507975896]
# sites_train_2014 = sites_train.loc[times_train_2014.index]

In [25]:
# times_train.target.value_counts(), times_train_2014.target.value_counts()

In [26]:
# 251264 / 177645, 2297 / 1241

Индексация csr-матрицы с нуля. Подготовим индексы для отбора нужных строк.

In [27]:
# times_train_reset_index = times_train.reset_index()

In [28]:
# times_train_reset_index_2014 = times_train_reset_index[times_train_reset_index.year == 0.6460995507975896]

In [29]:
# sites_train_vectorized[times_train_reset_index_2014.index]

In [30]:
# def select_2014(times, sites, sites_vectorized):
#     times_reset_index = times.reset_index()
    
#     times_2014 = times_reset_index[times_reset_index.year == 0.6460995507975896]
#     sites_2014 = sites.loc[times_2014.index]
    
#     times_reset_index_2014 = times_reset_index[times_reset_index.year == 0.6460995507975896]
#     sites_vectorized_2014 = sites_vectorized[times_reset_index_2014.index]
    
#     return times_2014, sites_2014, sites_vectorized_2014

In [31]:
# times_train_2014, sites_train_2014, sites_train_vectorized_2014 = select_2014(times_train, sites_train, sites_train_vectorized)

In [127]:
print(all_time_features)

['hour', 'morning', 'midday', 'evening', 'sin_hour_minutes', 'cos_hour_minutes', 'minutes', 'minutes_10', 'minutes_20', 'minutes_30', 'minutes_40', 'minutes_50', 'minutes_60', 'hour_minutes', 'day', 'october', 'year', 'year_month', 'summer', 'weekday', 'weekend', 'session_duration', 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']


In [52]:
print(all_site_features)

['n_unique', 'has_top10', '1_unique', '2_unique', '3_unique', '4_unique', '5_unique', '6_unique', '7_unique', '8_unique', '9_unique', '10_unique']


##############################################################

In [190]:
# selected_time_features = ['morning', 'day', 'evening', 14, 15, 'year', 'Monday', 'Sunday', 'session_duration']
# selected_site_features = ['7_unique']

# selected_time_features = ['morning', 'day', 'evening', 14, 15, 18, 'Wednesday', 'Friday', 'session_duration'] \
#                                 + list(range(7, 12))
# selected_site_features = ['1_unique', '4_unique', 'has_top10']

selected_time_features = ['morning', 'midday', 'evening', 'session_duration', 14, 15, 'Monday', 'Sunday', 'year', 'day_3', 
                          'day_21']
selected_site_features = ['7_unique']

X_train = hstack([sites_train_vectorized, times_train[selected_time_features], 
                  sites_train[selected_site_features]]) # time_diffs_train[['time_diff_1', 'time_diff_2']]
X_train

<253561x20012 sparse matrix of type '<class 'numpy.float64'>'
	with 5540982 stored elements in COOrdinate format>

February March April October

##############################################################

### Модель

In [23]:
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

#### Обучение и подбор гиперпараметров

In [60]:
model = SGDClassifier(loss='log', random_state=2179, n_jobs=-1)  # class_weight='balanced'
time_split = TimeSeriesSplit(n_splits=10)

In [57]:
grid = {
    'penalty': ['l2', 'l1'],
    'alpha': np.linspace(3e-05, 5e-05, 11)
}

In [156]:
%%time

search = GridSearchCV(model, param_grid=grid, cv=time_split, scoring='roc_auc')
search.fit(X_train, y_train)

search.best_params_, search.best_score_

CPU times: user 5min 10s, sys: 3.26 s, total: 5min 14s
Wall time: 2min 43s


({'alpha': 3.6e-05, 'penalty': 'l2'}, 0.9379969621239443)

0.9234912596145423

0.9429067044711336

##############################################################

In [191]:
%%time

cv_scores = cross_val_score(model, X_train, y_train, cv=time_split, scoring='roc_auc', n_jobs=-1)
# error_score='raise'
print(cv_scores.mean(), cv_scores.std())
cv_scores

0.938267552368661 0.04026401986386452
CPU times: user 181 ms, sys: 50.5 ms, total: 232 ms
Wall time: 1.33 s


array([0.91616599, 0.84761044, 0.96526542, 0.95278538, 0.9533779 ,
       0.97019795, 0.88226456, 0.96019612, 0.96642744, 0.96838434])

##############################################################

Опции:

- CountVectorizer ngram_range max_features
    - (1, 1)
        - max-k: 0.8530563468012738, 0.08925871896536618
        - 20000: 0.8531100483866421, 0.08919480787349321
        - 10000: 0.8539501616208991, 0.08875887000416063
        - 5000: 0.8529069996789713, 0.08888832394047236
    - (1, 2)
        - max-k: 0.8577666221636842, 0.09371826647019514
        - 50000: 0.858414628277273, 0.09308624515067462
        - 20000: 0.8600043484189314, 0.0901820258132247
        - 10000: 0.8565645409290898, 0.09086786443064755
    - (1, 3)
        - max-k: 0.8560039171386127, 0.09631952839070929
        - 50000: 0.859467798898789, 0.09245656399159023
        - 20000: 0.8598701871410264 0.09139402713364686
        - 10000: 0.8560423530991228 0.09031323488865993
    - (1, 4)
        - max-k: 0.8533973079027204 0.10052454183974614
        - 20000: 0.8455330939583249 0.08401555400318593
    - (1, 9)
        - 70000: .8604293420872524 0.09704219048563271
    - (1, 10)
        - 20000: 0.8419623744960409 0.0864989715893941
        - 50000: 0.859496836616332 0.09749267442904218
        - 70000: 0.8604687084516304 0.0970295359566777
        - 100000: 0.8596446298744558 0.09781978585361331
- TfidfVectorizer ngram_range max_features
    - (1, 1)
        - max-k: 0.8391530980059445, 0.08120401396593267
        - 20000: 0.8399308212634944 0.08111430205756767
    - (1, 2)
        - max-k: 0.8294502665752119, 0.08421253009084147
        - 20000: 0.8398228277779543 0.07824510376730294
    - (1, 3)
        - max-k: 0.8230380243367905, 0.08660127058036823
    - (1, 10):
        - 70000: 0.8359176499242273 0.08504483506085675
        
Выбираем CountVectorizer(ngram_range=(1, 2), max_features=20000)

### Тест

In [27]:
# import eli5
# import calendar

def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    # turn predictions into data frame and save as csv file
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [157]:
%%time

sites_test_vectorized = vectorizer_pipeline.transform(df_test)

sites_test = site_features_pipeline.transform(df_test)
sites_test = pd.DataFrame(sites_test, columns=all_site_features)

times_test = time_features_pipeline.transform(df_test)
times_test = pd.DataFrame(times_test, columns=all_time_features)

# times_test_2014, sites_test_2014, sites_test_vectorized_2014 = select_2014(times_test, sites_test, sites_test_vectorized)

X_test = hstack([sites_test_vectorized, times_test[selected_time_features], sites_test[selected_site_features]])
X_test

CPU times: user 28 s, sys: 684 ms, total: 28.7 s
Wall time: 28.3 s


<82797x20011 sparse matrix of type '<class 'numpy.float64'>'
	with 1664392 stored elements in COOrdinate format>

In [158]:
search.best_estimator_.fit(X_train, y_train)
logit_test_pred_proba = search.best_estimator_.predict_proba(X_test)
# model.fit(X_train, y_train_2014)
# logit_test_pred_proba = model.predict_proba(X_test)

In [None]:
# pd.DataFrame({'feature': feature_names, 'coef': search.best_estimator_.coef_.flatten()[-len(feature_names):]})

In [None]:
# eli5.show_weights(estimator=search.best_estimator_, feature_names=vectorizer_pipeline['vectorizer'].get_feature_names() \
#                   + feature_names, top=30)

In [159]:
write_to_submission_file(logit_test_pred_proba[:, 1], 'result.csv')