In [1]:
import functions as f
# Import libraries and set desired options
import os
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import hstack
import eli5
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
import seaborn as sns
from IPython.display import display_html

In [2]:
# Directory the train/test CSVs and the site dictionary pickle are loaded from
PATH_TO_DATA = '../data/'
# Passed as random_state to the logistic regression below
SEED = 17

# 1. Making the first baseline 
### a) CountVectorizer

In these baselines, site ids are converted into site names using the provided dictionary.

In [53]:
def prepare_sparse_features(path_to_train, path_to_test, path_to_site_dict,
                           Tfidf, vectorizer_params):
    """Load the session CSVs and vectorize each session's visited sites as text.

    Parameters
    ----------
    path_to_train, path_to_test : str
        CSV files read with ``session_id`` as the index and ``time1``..``time10``
        parsed as dates.
    path_to_site_dict : str
        Pickle file holding the site -> id mapping.
    Tfidf : bool
        Truthy selects ``TfidfVectorizer``; anything else selects ``CountVectorizer``.
    vectorizer_params : dict
        Keyword arguments forwarded unchanged to the chosen vectorizer.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, vectorizer, train_times, test_times)``.
        Training rows are ordered by ``time1``; the test frame keeps file order.
    """
    time_cols = ['time%s' % k for k in range(1, 11)]
    site_cols = ['site%s' % k for k in range(1, 11)]

    # Only the training frame is sorted chronologically
    train_df = pd.read_csv(path_to_train, index_col='session_id',
                           parse_dates=time_cols)
    test_df = pd.read_csv(path_to_test, index_col='session_id',
                          parse_dates=time_cols)
    train_df = train_df.sort_values(by='time1')

    # Read the site -> id mapping and invert it; id 0 is treated as "unknown".
    # NOTE: pickle.load can execute arbitrary code, so only point this at a trusted file.
    with open(path_to_site_dict, 'rb') as dict_file:
        site_to_id = pickle.load(dict_file)
    id_to_site = {site_id: site for (site, site_id) in site_to_id.items()}
    id_to_site[0] = 'unknown'

    def sessions_as_text(frame):
        # One space-joined string of site names per session; missing sites become id 0.
        # Names instead of ids are less efficient but make model weights easier to read.
        def row_to_text(row):
            return ' '.join([id_to_site[site_id] for site_id in row])

        site_ids = frame[site_cols].fillna(0).astype('int')
        return site_ids.apply(row_to_text, axis=1).tolist()

    train_sessions = sessions_as_text(train_df)
    test_sessions = sessions_as_text(test_df)

    # Tokenization is governed entirely by vectorizer_params (e.g. a whitespace-only
    # tokenizer keeps 'mail.google.com' as a single token).
    vectorizer_cls = TfidfVectorizer if Tfidf else CountVectorizer
    vectorizer = vectorizer_cls(**vectorizer_params)
    X_train = vectorizer.fit_transform(train_sessions)
    X_test = vectorizer.transform(test_sessions)
    y_train = train_df['target'].astype('int').values

    # Site visit times are returned for further feature engineering
    train_times = train_df[time_cols]
    test_times = test_df[time_cols]

    return X_train, X_test, y_train, vectorizer, train_times, test_times

In [54]:
%%time
# Baseline v1: Tfidf=False, so sessions are vectorized with CountVectorizer
X_train_sites_v1, X_test_sites_v1, y_train, vectorizer_v1, train_times, test_times = prepare_sparse_features(
    path_to_train=os.path.join(PATH_TO_DATA, 'train_sessions.csv'),
    path_to_test=os.path.join(PATH_TO_DATA, 'test_sessions.csv'),
    path_to_site_dict=os.path.join(PATH_TO_DATA, 'site_dic.pkl'),
    Tfidf = False,
    vectorizer_params={'ngram_range': (1, 5), 
                       'max_features': 50000,
                       # split on whitespace only, so dotted site names stay single tokens
                       'tokenizer': lambda s: s.split()}
)

Wall time: 48.3 s


In [55]:
# Sanity check: shapes of the v1 train and test matrices
print(X_train_sites_v1.shape, X_test_sites_v1.shape)

(253561, 50000) (82797, 50000)


In [56]:
%%time
# Baseline v2: same inputs and parameters as v1, but Tfidf=True selects TfidfVectorizer.
# y_train, train_times and test_times are re-bound here from the same files.
X_train_sites_v2, X_test_sites_v2, y_train, vectorizer_v2, train_times, test_times = prepare_sparse_features(
    path_to_train=os.path.join(PATH_TO_DATA, 'train_sessions.csv'),
    path_to_test=os.path.join(PATH_TO_DATA, 'test_sessions.csv'),
    path_to_site_dict=os.path.join(PATH_TO_DATA, 'site_dic.pkl'),
    Tfidf = True,
    vectorizer_params={'ngram_range': (1, 5), 
                       'max_features': 50000,
                       # split on whitespace only, so dotted site names stay single tokens
                       'tokenizer': lambda s: s.split()}
)

Wall time: 49.6 s


In [57]:
# Sanity check: shapes of the v2 train and test matrices
print(X_train_sites_v2.shape, X_test_sites_v2.shape)

(253561, 50000) (82797, 50000)


In [58]:
# Time-ordered CV splitter with 10 splits, reused by every cross-validation and grid search below
time_split = TimeSeriesSplit(n_splits=10)

In [59]:
# Untuned baseline classifier (C=1); also serves as the estimator handed to the grid search
logit = LogisticRegression(C=1, random_state=SEED, solver='liblinear')

In [60]:
%%time

# ROC AUC per time-series fold for the CountVectorizer (v1) site features.
# NOTE(review): the remark previously attached to this call said it "hangs with n_jobs > 1",
# yet n_jobs=-1 is what is passed — confirm which setting is intended.
cv_scores1 = cross_val_score(logit, X_train_sites_v1, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=-1)

Wall time: 1min 16s


In [61]:
# Per-fold scores, their mean and their standard deviation for the v1 baseline
cv_scores1, cv_scores1.mean(), cv_scores1.std()

(array([0.83945399, 0.64233622, 0.8761074 , 0.95673657, 0.8467993 ,
        0.88187478, 0.92623237, 0.85574333, 0.92765285, 0.90708443]),
 0.8660021244361671,
 0.08301101232591739)

In [62]:
%%time

# ROC AUC per time-series fold for the TfidfVectorizer (v2) site features.
# NOTE(review): the remark previously attached to this call said it "hangs with n_jobs > 1",
# yet n_jobs=-1 is what is passed — confirm which setting is intended.
cv_scores2 = cross_val_score(logit, X_train_sites_v2, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=-1)

Wall time: 10.8 s


In [63]:
# Per-fold scores, their mean and their standard deviation for the v2 baseline
cv_scores2, cv_scores2.mean(), cv_scores2.std()

(array([0.83124023, 0.65993466, 0.85673565, 0.92824237, 0.84777348,
        0.88954524, 0.88829128, 0.8771044 , 0.92023038, 0.92624125]),
 0.8625338952306734,
 0.07455716136277274)

In [64]:
# Fold-by-fold comparison: True where the count-based (v1) baseline beats the TF-IDF (v2) one
cv_scores1 > cv_scores2

array([ True, False,  True,  True, False, False,  True, False,  True,
       False])

# 2. Add new attributes to both training sets

In [65]:
def add_time_features(times, X_sparse, add_hour = False):
    """Append part-of-day indicator columns (and optionally the start hour) to a matrix.

    Parameters
    ----------
    times : pandas.DataFrame
        Must contain a datetime column ``time1``; its hour defines every new feature.
    X_sparse : sparse matrix
        Existing features; the new columns are stacked to its right.
    add_hour : bool, default False
        When true, also append the start hour divided by 24.

    Returns
    -------
    X : sparse matrix
        ``X_sparse`` followed by ``morning``, ``day``, ``evening``, ``night`` and,
        if requested, ``hour``.
    feature_names : list of str
        Names of the appended columns, in column order.
    """
    # Hour when a session begins (vectorized accessor instead of a per-row lambda)
    hour = times['time1'].dt.hour

    def hour_between(first_hour, last_hour):
        # 0/1 column flagging sessions that begin within [first_hour, last_hour]
        in_range = (hour >= first_hour) & (hour <= last_hour)
        return in_range.astype('int').values.reshape(-1, 1)

    # Part of the day when a session begins
    morning = hour_between(7, 11)
    day = hour_between(12, 18)
    evening = hour_between(19, 23)
    night = hour_between(0, 6)

    # list of the objects to add to the sparse matrix
    objects_to_hstack = [X_sparse, morning, day, evening, night]
    feature_names = ['morning', 'day', 'evening', 'night']

    if add_hour:
        # scale the hour by dividing by 24
        objects_to_hstack.append(hour.values.reshape(-1, 1) / 24)
        feature_names.append('hour')

    # Stacking new time features to the sparse matrix
    X = hstack(objects_to_hstack)
    return X, feature_names

In [66]:
%%time
# Count-based (v1) site features plus the part-of-day indicators
X_train_v1_with_times1, new_feat_names = add_time_features(train_times, X_train_sites_v1)
X_test_v1_with_times1, _ = add_time_features(test_times, X_test_sites_v1)


# TF-IDF (v2) site features plus the same indicators; new_feat_names is re-bound to an equal list
X_train_v2_with_times1, new_feat_names = add_time_features(train_times, X_train_sites_v2)
X_test_v2_with_times1, _ = add_time_features(test_times, X_test_sites_v2)

Wall time: 7.39 s


In [67]:
def train_and_valid(model, X_train, y_train, X_test, site_feature_names=None, 
                      new_feature_names=None, cv=time_split, scoring='roc_auc',
                      top_n_features_to_show=30, submission_file_name='submission.csv'):
    """Cross-validate ``model``, refit it on all of ``X_train`` and display its top weights.

    Parameters
    ----------
    model : estimator
        Cross-validated first, then fitted in place on ``X_train``/``y_train``.
    X_train, y_train
        Training matrix and labels.
    X_test
        Accepted for signature compatibility; this version makes no predictions.
    site_feature_names : sequence of str, optional
        Names of the leading (site) columns. The default is ``None`` rather than a
        call on a global vectorizer, because a default expression is evaluated once,
        when the ``def`` runs, and would fail if that global does not exist.
        When ``None``, no names are handed to eli5.
    new_feature_names : list of str, optional
        Names of the trailing engineered columns; their coefficients are printed.
    cv, scoring
        Forwarded to ``cross_val_score``.
    top_n_features_to_show : int
        Passed to ``eli5.show_weights`` as ``top``.
    submission_file_name : str
        Accepted for signature compatibility; nothing is written by this version.

    Returns
    -------
    numpy.ndarray
        The per-fold cross-validation scores.
    """
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, 
                            scoring=scoring, n_jobs=4)
    print('CV scores', cv_scores)
    print('CV mean: {}, CV std: {}'.format(cv_scores.mean(), cv_scores.std()))
    model.fit(X_train, y_train)
    
    if site_feature_names is None:
        all_feature_names = None
    elif new_feature_names:
        # list() keeps this a concatenation even if the names arrive as an array
        all_feature_names = list(site_feature_names) + list(new_feature_names)
    else: 
        all_feature_names = site_feature_names
    
    display_html(eli5.show_weights(estimator=model, 
                  feature_names=all_feature_names, top=top_n_features_to_show))
    
    if new_feature_names:
        print('New feature weights:')
    
        # The slice takes the last coefficients, so the new features must be
        # the trailing columns of X_train
        print(pd.DataFrame({'feature': new_feature_names, 
                        'coef': model.coef_.flatten()[-len(new_feature_names):]}))
    
    return cv_scores

In [68]:
# Evaluate the count-based (v1) site features with the part-of-day indicators appended
cv_scores_v1_1 = train_and_valid(model=logit, X_train=X_train_v1_with_times1, y_train=y_train, 
                               X_test=X_test_v1_with_times1, 
                               site_feature_names=vectorizer_v1.get_feature_names(),
                               new_feature_names=new_feat_names,
                               cv=time_split, submission_file_name='subm3.csv')

CV scores [0.87872847 0.7520995  0.92828645 0.9746255  0.90668159 0.94070162
 0.95171061 0.92777855 0.94741928 0.94159507]
CV mean: 0.9149626651898177, CV std: 0.05965144601121151


Weight?,Feature
+4.578,www.express.co.uk
+4.372,cid-ed6c3e6a5c6608a4.users.storage.live.com
+3.410,tru.am
+3.216,browser-update.org
+2.671,www.springboardplatform.com
+2.664,www.regarder-film-gratuit.com
+2.335,glee.hypnoweb.net
+2.264,s.radio-canada.ca
+2.161,youtube.fr
+2.136,mcetv.fr


New feature weights:
   feature      coef
0  morning -3.755702
1      day  0.417087
2  evening -2.747076
3    night  0.000000


In [69]:
# Evaluate the TF-IDF (v2) site features with the part-of-day indicators appended
cv_scores_v2_1 = train_and_valid(model=logit, X_train=X_train_v2_with_times1, y_train=y_train, 
                               X_test=X_test_v2_with_times1, 
                               site_feature_names=vectorizer_v2.get_feature_names(),
                               new_feature_names=new_feat_names,
                               cv=time_split, submission_file_name='subm3.csv')

# LB score : 0.94535 

CV scores [0.88170019 0.81426901 0.91861327 0.96171682 0.91532998 0.95227476
 0.92804771 0.94016178 0.94665449 0.95277436]
CV mean: 0.9211542365590741, CV std: 0.04206443092499144


Weight?,Feature
+5.218,youwatch.org
+5.093,cid-ed6c3e6a5c6608a4.users.storage.live.com
+4.971,www.express.co.uk
+4.914,vk.com
+4.640,fr.glee.wikia.com
+4.542,www.info-jeunes.net
+4.382,www.melty.fr
+4.183,www.audienceinsights.net
+4.104,www.banque-chalus.fr
+3.823,r4---sn-gxo5uxg-jqbe.googlevideo.com


New feature weights:
   feature      coef
0  morning -3.211819
1      day  0.526636
2  evening -2.721465
3    night  0.000000


In [70]:
# Fold-by-fold comparison: True where TF-IDF (v2) beats counts (v1) once time features are added
cv_scores_v2_1 > cv_scores_v1_1

array([ True,  True, False, False,  True,  True, False,  True, False,
        True])

In [97]:
def new_time_features_1(times, X_sparse, scaler=None):
    """Append duration and calendar features derived from ``times`` to a matrix.

    Parameters
    ----------
    times : pandas.DataFrame
        Datetime columns of one session per row; must include ``time1``.
    X_sparse : sparse matrix
        Existing features; the new columns are stacked to its right.
    scaler : object with ``fit_transform``/``transform``, optional
        Scaler for the duration column. With the default ``None`` a fresh
        ``StandardScaler`` is fitted on this call's data (the original behavior).
        Pass one scaler object to the training call and then the same object to the
        test call so test durations are standardized with the training statistics:
        a scaler that already has a ``scale_`` attribute is only applied, not refitted.

    Returns
    -------
    X : sparse matrix
        ``X_sparse`` followed by ``duration``, ``day_of_week``, ``month``,
        ``summer`` and ``year_month``.
    feature_names : list of str
        Names of the appended columns, in column order.
    """
    # Session duration: latest minus earliest timestamp in milliseconds, raised to the 0.2 power
    durations = (times.max(axis=1) - times.min(axis=1)).astype('timedelta64[ms]').astype(int) **0.2
    duration_column = durations.values.reshape(-1, 1)
    if scaler is None:
        scaler = StandardScaler()
    if hasattr(scaler, 'scale_'):
        # already fitted elsewhere: reuse its statistics
        dur_scaled = scaler.transform(duration_column)
    else:
        dur_scaled = scaler.fit_transform(duration_column)
    
    # day of the week, month and a summer (June-August) flag of the session start
    session_start = times['time1']
    day_of_week = session_start.dt.weekday.values.reshape(-1, 1)
    month = session_start.dt.month.values.reshape(-1, 1)
    summer = ((month >= 6) & (month <= 8)).astype('int')
    
    # linear trend: time in the form YYYYMM, divided by 1e5 to scale this feature
    year_month = (100 * session_start.dt.year + session_start.dt.month).values.reshape(-1, 1) / 1e5
    
    objects_to_hstack = [X_sparse, dur_scaled, day_of_week, month, summer, year_month]
    feature_names = ['duration', 'day_of_week', 'month', 'summer', 'year_month']
    
    X = hstack(objects_to_hstack)
    return X, feature_names

In [98]:
# Stack duration/calendar features onto the TF-IDF + part-of-day matrices.
# NOTE(review): called like this, each call standardizes durations with a scaler fitted on
# its own data, so the test matrix is scaled with test statistics — confirm this is intended.
X_train_v2_with_times2, new_feat_names_1 = new_time_features_1(train_times, X_train_v2_with_times1)
X_test_v2_with_times2, _ = new_time_features_1(test_times, X_test_v2_with_times1)

In [99]:
# Number of feature names returned by add_time_features (the part-of-day indicators)
len(new_feat_names)

4

In [105]:
# A helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Write predictions to CSV, one row per prediction, indexed 1..n.

    ``predicted_labels`` needs a ``shape`` attribute; its values go into a single
    column named ``target``, and the index column header is ``index_label``.
    """
    n_predictions = predicted_labels.shape[0]
    # Row labels start at 1, not 0
    row_ids = np.arange(1, n_predictions + 1)
    submission = pd.DataFrame(predicted_labels, index=row_ids, columns=[target])
    submission.to_csv(out_file, index_label=index_label)

In [106]:
def train_and_valid(model, X_train, y_train, X_test, site_feature_names=None, 
                      new_feature_names=None, cv=time_split, scoring='roc_auc',
                      top_n_features_to_show=30, submission_file_name='submission.csv'):
    """Cross-validate ``model``, refit it, display its top weights and write test predictions.

    Parameters
    ----------
    model : estimator
        Cross-validated first, then fitted in place on ``X_train``/``y_train``.
    X_train, y_train
        Training matrix and labels.
    X_test
        Matrix whose positive-class probabilities are written to the submission file.
    site_feature_names : sequence of str, optional
        Names of the leading (site) columns. The default is ``None`` rather than a
        call on a global vectorizer, because a default expression is evaluated once,
        when the ``def`` runs, and would fail if that global does not exist.
        When ``None``, no names are handed to eli5.
    new_feature_names : list of str, optional
        Names of the trailing engineered columns; their coefficients are printed.
    cv, scoring
        Forwarded to ``cross_val_score``.
    top_n_features_to_show : int
        Passed to ``eli5.show_weights`` as ``top``.
    submission_file_name : str
        Destination handed to ``write_to_submission_file``.

    Returns
    -------
    numpy.ndarray
        The per-fold cross-validation scores.
    """
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, 
                            scoring=scoring, n_jobs=4)
    print('CV scores', cv_scores)
    print('CV mean: {}, CV std: {}'.format(cv_scores.mean(), cv_scores.std()))
    model.fit(X_train, y_train)
    
    if site_feature_names is None:
        all_feature_names = None
    elif new_feature_names:
        # list() keeps this a concatenation even if the names arrive as an array
        all_feature_names = list(site_feature_names) + list(new_feature_names)
    else: 
        all_feature_names = site_feature_names
    
    display_html(eli5.show_weights(estimator=model, 
                  feature_names=all_feature_names, top=top_n_features_to_show))
    
    if new_feature_names:
        print('New feature weights:')
    
        # The slice takes the last coefficients, so the new features must be
        # the trailing columns of X_train
        print(pd.DataFrame({'feature': new_feature_names, 
                        'coef': model.coef_.flatten()[-len(new_feature_names):]}))
    
    # Second column of predict_proba, written out by the helper
    test_pred = model.predict_proba(X_test)[:, 1]
    write_to_submission_file(test_pred, submission_file_name) 
    
    return cv_scores

In [107]:
# Evaluate TF-IDF + part-of-day + duration/calendar features; the name list concatenates
# both sets in the order their columns were stacked
cv_scores_v2_2 = train_and_valid(model=logit, X_train=X_train_v2_with_times2, y_train=y_train, 
                               X_test=X_test_v2_with_times2, 
                               site_feature_names=vectorizer_v2.get_feature_names(),
                               new_feature_names=new_feat_names + new_feat_names_1,
                               cv=time_split, submission_file_name='subm_alice_1_raw.csv')

CV scores [0.72721963 0.81225685 0.90934938 0.9614563  0.91757658 0.95986206
 0.92788684 0.95219127 0.95795891 0.96884404]
CV mean: 0.9094601871932205, CV std: 0.07484250050384401


Weight?,Feature
+5.135,youwatch.org
+5.009,vk.com
+4.974,www.express.co.uk
+4.947,cid-ed6c3e6a5c6608a4.users.storage.live.com
+4.734,www.info-jeunes.net
+4.436,www.melty.fr
+4.351,fr.glee.wikia.com
+4.278,www.audienceinsights.net
+4.048,www.banque-chalus.fr
+3.969,api.bing.com


New feature weights:
       feature      coef
0      morning -1.649414
1          day  2.055541
2      evening -1.736857
3        night  0.000000
4     duration -0.210780
5  day_of_week -0.368635
6        month  0.108070
7       summer -2.118289
8   year_month -2.743998


In [108]:
# Fold-by-fold comparison: True where the duration/calendar features improved on part-of-day alone
cv_scores_v2_2 > cv_scores_v2_1

array([False, False, False, False,  True,  True, False,  True,  True,
        True])

In [109]:
# LB score: 0.95285 !!!

# 3. Tuning params

In [110]:
# 20 candidate values of C, log-spaced from 1e-2 to 1e2
c_values = np.logspace(-2, 2, 20)

# Grid search over C only, scored by ROC AUC on the same time-series splits used above
logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                  scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=1)

In [111]:
# Matrix used for tuning: TF-IDF site features + part-of-day + duration/calendar features
X_train_final = X_train_v2_with_times2

In [112]:
%%time
# Run the grid search over C
logit_grid_searcher.fit(X_train_final, y_train); 

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   47.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 10.9min finished


Wall time: 11min 9s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': array([1.00000e-02, 1.62378e-02, 2.63665e-02, 4.28133e-02, 6.95193e-02,
       1.12884e-01, 1.83298e-01, 2.97635e-01, 4.83293e-01, 7.84760e-01,
       1.27427e+00, 2.06914e+00, 3.35982e+00, 5.45559e+00, 8.85867e+00,
       1.43845e+01, 2.33572e+01, 3.79269e+01, 6.15848e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [113]:
# Best mean CV score found by the search and the C that achieved it
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.9131969354044924, {'C': 3.359818286283781})

In [114]:
# Estimator carrying the best C found by the grid search
final_model = logit_grid_searcher.best_estimator_

In [118]:
# Re-score the tuned model and write its submission. Feature names come from vectorizer_v2
# because X_train_final is built on the TF-IDF (v2) site matrix; the bare name `vectorizer`
# is not defined by any cell of this notebook.
cv_scores_final = train_and_valid(model=final_model, X_train=X_train_final, y_train=y_train, 
                               X_test=X_test_v2_with_times2, 
                               site_feature_names=vectorizer_v2.get_feature_names(),
                               new_feature_names=new_feat_names + new_feat_names_1,
                               cv=time_split, submission_file_name='subm_alice_1_final.csv')

CV scores [0.75140875 0.79892399 0.91806782 0.96774999 0.92010316 0.95733631
 0.94266369 0.94726406 0.96189886 0.96655273]
CV mean: 0.9131969354044924, CV std: 0.07174750312385479


Weight?,Feature
+11.148,www.express.co.uk
+9.755,cid-ed6c3e6a5c6608a4.users.storage.live.com
+7.032,tru.am
+6.335,browser-update.org
+5.999,fr.glee.wikia.com
+5.815,api.bing.com
+5.808,www.banque-chalus.fr
+5.627,youwatch.org
+5.480,vk.com
+5.398,www.info-jeunes.net


New feature weights:
       feature      coef
0      morning -1.462071
1          day  2.330922
2      evening -2.329141
3        night  0.000000
4     duration -0.165413
5  day_of_week -0.373164
6        month  0.124287
7       summer -2.970617
8   year_month -3.156360


In [119]:
# Fold-by-fold comparison: True where the tuned model beats the C=1 model on the same features
cv_scores_final > cv_scores_v2_2

array([ True, False,  True,  True,  True, False,  True, False,  True,
       False])