In [1]:
import functions as f
# Import libraries and set desired options
import os
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import hstack
import eli5
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
import seaborn as sns
from IPython.display import display_html

In [2]:
# Directory (relative to the notebook) that the data file paths below are built from.
PATH_TO_DATA = '../data/'
# Seed passed as `random_state` to the model so that runs are repeatable.
SEED = 17

In [3]:
def _sessions_to_text(sessions_df, site_columns, id2site):
    """Render every session row as one whitespace-separated string of site names."""
    # Missing visits are NaN in the frame; they become id 0, which id2site maps to 'unknown'.
    site_ids = sessions_df[site_columns].fillna(0).astype('int').values.tolist()
    return [' '.join(id2site[site_id] for site_id in row) for row in site_ids]


def prepare_sparse_features(path_to_train, path_to_test, path_to_site_dict,
                           vectorizer_params):
    """Load the session CSVs and turn site visits into TF-IDF features.

    Parameters
    ----------
    path_to_train, path_to_test : str
        CSV files indexed by ``session_id`` with ``site1..site10`` and
        ``time1..time10`` columns; the train file must also have ``target``.
    path_to_site_dict : str
        Pickle file holding the site-name -> site-id dictionary.
    vectorizer_params : dict
        Keyword arguments forwarded unchanged to ``TfidfVectorizer``. Sessions
        are passed to it as strings of site names separated by single spaces,
        so a whitespace tokenizer keeps names such as 'mail.google.com' intact.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, vectorizer, train_times, test_times)``:
        the two TF-IDF matrices, the integer target array, the fitted
        vectorizer, and the ``time1..time10`` columns of each frame.
        Training rows (and therefore ``X_train``, ``y_train`` and
        ``train_times``) are sorted by ``time1``; test rows keep file order.
    """
    times = ['time%s' % i for i in range(1, 11)]
    sites = ['site%s' % i for i in range(1, 11)]

    train_df = pd.read_csv(path_to_train, index_col='session_id', parse_dates=times)
    test_df = pd.read_csv(path_to_test, index_col='session_id', parse_dates=times)

    # Only the training rows are put in chronological order.
    train_df = train_df.sort_values(by='time1')

    # SECURITY: pickle.load can execute arbitrary code -- only point this at a trusted file.
    with open(path_to_site_dict, 'rb') as dict_file:
        site2id = pickle.load(dict_file)
    # Inverse id -> site mapping; id 0 stands for "unknown" (missing visit).
    id2site = {site_id: site for site, site_id in site2id.items()}
    id2site[0] = 'unknown'

    # Sessions are represented with site names rather than ids: less efficient,
    # but it makes the model weights directly readable.
    train_sessions = _sessions_to_text(train_df, sites, id2site)
    test_sessions = _sessions_to_text(test_df, sites, id2site)

    # The vocabulary is learned from the training sessions only.
    vectorizer = TfidfVectorizer(**vectorizer_params)
    X_train = vectorizer.fit_transform(train_sessions)
    X_test = vectorizer.transform(test_sessions)
    y_train = train_df['target'].astype('int').values

    # Visit times are returned for later feature engineering.
    train_times, test_times = train_df[times], test_df[times]

    return X_train, X_test, y_train, vectorizer, train_times, test_times

In [5]:
%%time
# Build TF-IDF features over n-grams of 1 to 5 site names, with the vocabulary capped
# at 50000 entries. The tokenizer splits on whitespace only, so dotted host names
# stay single tokens.
X_train, X_test, y_train, vectorizer, train_times, test_times = prepare_sparse_features(
    path_to_train=os.path.join(PATH_TO_DATA, 'train_sessions.csv'),
    path_to_test=os.path.join(PATH_TO_DATA, 'test_sessions.csv'),
    path_to_site_dict=os.path.join(PATH_TO_DATA, 'site_dic.pkl'),
    vectorizer_params={'ngram_range': (1, 5), 
                       'max_features': 50000,
                       'tokenizer': lambda s: s.split()}
)

Wall time: 50.3 s


In [6]:
# Sanity check: train and test matrices must have the same number of feature columns.
print(X_train.shape, X_test.shape)

(253561, 50000) (82797, 50000)


In [68]:
def add_old_time_features(times, X_sparse, scaler=None):
    """Append the baseline time-derived features to a sparse session matrix.

    Parameters
    ----------
    times : pandas.DataFrame
        Visit timestamps of each session. ``time1`` must be a datetime column
        (the ``.dt`` accessor is used on it); the session duration is taken as
        the row-wise max minus min over all columns.
    X_sparse : sparse matrix
        Existing features, one row per row of ``times``.
    scaler : object with ``fit_transform``/``transform``, optional
        Scaler for the duration feature. When omitted, a fresh
        ``StandardScaler`` is fitted on this call's data (the original
        behaviour). Pass one shared scaler to the train call and then to the
        test call to scale test durations with the training statistics: an
        unfitted scaler is fitted in place, a fitted one is only applied.

    Returns
    -------
    (X, feature_names)
        ``X`` is ``X_sparse`` with nine extra columns stacked on the right;
        ``feature_names`` lists those nine columns in order.
    """
    start = times['time1']
    hour = start.dt.hour

    # Part-of-day indicator columns derived from the hour of the first visit.
    morning = ((hour >= 7) & (hour <= 11)).astype('int').values.reshape(-1, 1)
    day = ((hour >= 12) & (hour <= 18)).astype('int').values.reshape(-1, 1)
    evening = ((hour >= 19) & (hour <= 23)).astype('int').values.reshape(-1, 1)
    night = ((hour >= 0) & (hour <= 6)).astype('int').values.reshape(-1, 1)

    # Session duration in whole milliseconds, compressed with a 0.2 power.
    # Floor division by a Timedelta gives integers without depending on how a
    # particular pandas/platform combination casts timedelta64 to int.
    span = times.max(axis=1) - times.min(axis=1)
    durations = (span // pd.Timedelta(milliseconds=1)) ** 0.2
    dur_column = durations.values.reshape(-1, 1)
    if scaler is None:
        dur_scaled = StandardScaler().fit_transform(dur_column)
    elif hasattr(scaler, 'scale_'):
        # Already fitted (e.g. on the training data): reuse its statistics.
        dur_scaled = scaler.transform(dur_column)
    else:
        dur_scaled = scaler.fit_transform(dur_column)

    # Calendar features of the first visit.
    day_of_week = start.dt.weekday.values.reshape(-1, 1)
    month = start.dt.month.values.reshape(-1, 1)
    summer = ((month >= 6) & (month <= 8)).astype('int')

    # Linear trend: time in the form YYYYMM, divided by 1e5 to scale the feature.
    year_month = (100 * start.dt.year + start.dt.month).values.reshape(-1, 1) / 1e5

    # Column order here must match feature_names below.
    objects_to_hstack = [X_sparse, morning, day, evening, night, dur_scaled,
                         day_of_week, month, summer, year_month]
    feature_names = ['morning', 'day', 'evening', 'night', 'duration',
                     'day_of_week', 'month', 'summer', 'year_month']

    X = hstack(objects_to_hstack)
    return X, feature_names

In [69]:
# Append the baseline time features to both matrices; the feature-name list is the
# same for train and test, so only the train copy is kept.
# NOTE(review): as called here, add_old_time_features fits a new StandardScaler in each
# call, so test durations are standardized with test-set statistics, not the train ones.
X_train_with_times, old_feat_names = add_old_time_features(train_times, X_train)
X_test_with_times, _ = add_old_time_features(test_times, X_test)

In [70]:
# Sanity check: expect 9 more columns than X_train (one per baseline time feature).
X_train_with_times.shape

(253561, 50009)

In [71]:
# 10-split time-series cross-validation; the training rows were sorted by time1 on load,
# so row order is chronological order.
time_split = TimeSeriesSplit(n_splits=10)
# Baseline model shared by the validation runs and the grid search below.
logit = LogisticRegression(C=1, random_state=SEED, solver='liblinear')

In [72]:
def train_and_valid(model, X_train, y_train, X_test, site_feature_names=None,
                      new_feature_names=None, cv=time_split, scoring='roc_auc',
                      top_n_features_to_show=30, submission_file_name='submission.csv'):
    """Cross-validate ``model``, refit it on all training data and show its weights.

    Parameters
    ----------
    model : estimator
        Fitted in place on ``(X_train, y_train)`` after cross-validation; must
        expose ``coef_`` when ``new_feature_names`` is given.
    X_train, y_train
        Training features and targets.
    X_test, submission_file_name
        Accepted for signature parity with ``train_and_predict``; not used here
        (no predictions are made and no file is written).
    site_feature_names : sequence of str, optional
        Names of the leading (site) columns. When omitted they are read from
        the notebook-level ``vectorizer`` at call time.
    new_feature_names : sequence of str, optional
        Names of the extra columns; they are assumed to be the LAST columns of
        ``X_train`` when their coefficients are printed.
    cv, scoring
        Forwarded to ``cross_val_score``.
    top_n_features_to_show : int
        Number of weights shown in the eli5 table.

    Returns
    -------
    numpy.ndarray
        The per-split cross-validation scores.
    """
    if site_feature_names is None:
        # Resolved at call time so the names always match the current vectorizer.
        # Newer scikit-learn only offers get_feature_names_out; older releases
        # only offer get_feature_names.
        names_getter = getattr(vectorizer, 'get_feature_names_out', None)
        if names_getter is None:
            names_getter = vectorizer.get_feature_names
        site_feature_names = names_getter()
    # list() so that array-like inputs concatenate instead of broadcasting.
    extra_names = list(new_feature_names) if new_feature_names is not None else []
    all_feature_names = list(site_feature_names) + extra_names

    cv_scores = cross_val_score(model, X_train, y_train, cv=cv,
                                scoring=scoring, n_jobs=4)
    print('CV scores', cv_scores)
    print('CV mean: {}, CV std: {}'.format(cv_scores.mean(), cv_scores.std()))
    model.fit(X_train, y_train)

    display_html(eli5.show_weights(estimator=model,
                                   feature_names=all_feature_names,
                                   top=top_n_features_to_show))

    if extra_names:
        print('New feature weights:')

        print(pd.DataFrame({'feature': extra_names,
                            'coef': model.coef_.flatten()[-len(extra_names):]}))

    return cv_scores

In [73]:
def train_and_predict(model, X_train, y_train, X_test, site_feature_names=None,
                      new_feature_names=None, cv=time_split, scoring='roc_auc',
                      top_n_features_to_show=30, submission_file_name='submission.csv'):
    """Validate and refit ``model``, then write test-set predictions to a CSV.

    The validation, refit and weight display are delegated to
    ``train_and_valid`` (the two functions previously duplicated that code).
    Afterwards the positive-class probability for every row of ``X_test`` is
    written to ``submission_file_name`` via ``write_to_submission_file``,
    which must be defined before this function is called.

    Parameters
    ----------
    model : estimator
        Fitted in place; must provide ``predict_proba``.
    X_train, y_train, X_test
        Training features/targets and test features.
    site_feature_names : sequence of str, optional
        Names of the leading (site) columns. When omitted they are read from
        the notebook-level ``vectorizer`` at call time.
    new_feature_names : sequence of str, optional
        Names of the extra trailing columns.
    cv, scoring, top_n_features_to_show
        Forwarded to ``train_and_valid``.
    submission_file_name : str
        Path of the CSV to write.

    Returns
    -------
    numpy.ndarray
        The per-split cross-validation scores.
    """
    if site_feature_names is None:
        # Resolved at call time; newer scikit-learn only offers
        # get_feature_names_out, older releases only get_feature_names.
        names_getter = getattr(vectorizer, 'get_feature_names_out', None)
        if names_getter is None:
            names_getter = vectorizer.get_feature_names
        site_feature_names = names_getter()

    cv_scores = train_and_valid(model=model, X_train=X_train, y_train=y_train,
                                X_test=X_test,
                                site_feature_names=list(site_feature_names),
                                new_feature_names=new_feature_names,
                                cv=cv, scoring=scoring,
                                top_n_features_to_show=top_n_features_to_show,
                                submission_file_name=submission_file_name)

    # train_and_valid leaves `model` fitted on the full training data.
    test_pred = model.predict_proba(X_test)[:, 1]
    write_to_submission_file(test_pred, submission_file_name)

    return cv_scores

In [74]:
# Baseline run: TF-IDF site features plus the original time features.
# train_and_valid neither predicts on X_test nor writes a submission, so X_test and
# submission_file_name are effectively ignored here.
cv_scores_base = train_and_valid(model=logit, X_train=X_train_with_times, y_train=y_train, 
                               X_test=X_test_with_times, 
                               site_feature_names=vectorizer.get_feature_names(),
                               new_feature_names=old_feat_names,
                               cv=time_split, submission_file_name='subm3.csv')

CV scores [0.72721963 0.81225685 0.90934938 0.9614563  0.91757658 0.95986206
 0.92788684 0.95219127 0.95795891 0.96884404]
CV mean: 0.9094601871932205, CV std: 0.07484250050384401


Weight?,Feature
+5.135,youwatch.org
+5.009,vk.com
+4.974,www.express.co.uk
+4.947,cid-ed6c3e6a5c6608a4.users.storage.live.com
+4.734,www.info-jeunes.net
+4.436,www.melty.fr
+4.351,fr.glee.wikia.com
+4.278,www.audienceinsights.net
+4.048,www.banque-chalus.fr
+3.969,api.bing.com


New feature weights:
       feature      coef
0      morning -1.649414
1          day  2.055541
2      evening -1.736857
3        night  0.000000
4     duration -0.210780
5  day_of_week -0.368635
6        month  0.108070
7       summer -2.118289
8   year_month -2.743998


In [75]:
#X_train_with_times['duration']

In [76]:
def add_new_time_features(times, X_sparse):
    """Append the minute of the session's first visit as one extra column.

    Earlier drafts of this function also computed hour, day of week, a
    weekend flag and year, but none of them were ever stacked; those unused
    computations have been removed so that only the work that reaches the
    output remains.

    Parameters
    ----------
    times : pandas.DataFrame
        Must contain a datetime column ``time1``.
    X_sparse : sparse matrix
        Existing features, one row per row of ``times``.

    Returns
    -------
    (X, feature_names)
        ``X`` is ``X_sparse`` with the minute column stacked on the right;
        ``feature_names`` is ``['minute']``.
    """
    # Column vector so it can be stacked next to the sparse matrix.
    minute = times['time1'].dt.minute.values.reshape(-1, 1)

    # Column order here must match feature_names.
    objects_to_hstack = [X_sparse, minute]
    feature_names = ['minute']

    X = hstack(objects_to_hstack)
    return X, feature_names

In [77]:
# Stack the new feature(s) onto the matrices that already carry the baseline time
# features; the returned names are the same for train and test, so only one copy is kept.
X_train_with_times_1, new_feat_names_1 = add_new_time_features(train_times, X_train_with_times)
X_test_with_times_1, _ = add_new_time_features(test_times, X_test_with_times)

In [78]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Save predictions as a CSV with a 1-based id column.

    ``predicted_labels`` needs a ``.shape`` attribute (e.g. a NumPy array).
    Row ``k`` (counting from 1) is written with id ``k`` under the header
    ``index_label``; the values go under the header ``target``.
    """
    n_rows = predicted_labels.shape[0]
    row_ids = np.arange(1, n_rows + 1)
    submission = pd.DataFrame(predicted_labels, index=row_ids, columns=[target])
    submission.to_csv(out_file, index_label=index_label)

In [79]:
# Same validation with the extra 'minute' column appended after the baseline time features.
# NOTE(review): this rebinds cv_scores_base, so the scores of the earlier baseline run
# are no longer available under that name for comparison.
cv_scores_base = train_and_valid(model=logit, X_train=X_train_with_times_1, y_train=y_train, 
                               X_test=X_test_with_times_1, 
                               site_feature_names=vectorizer.get_feature_names(),
                               new_feature_names=old_feat_names + new_feat_names_1,
                               cv=time_split, submission_file_name='subm_alice_2_2.csv')

CV scores [0.72675348 0.81234396 0.90834686 0.95838218 0.91310022 0.96101917
 0.92884558 0.95208859 0.95821263 0.96873899]
CV mean: 0.9087831664302033, CV std: 0.07480551554713125


Weight?,Feature
+5.126,youwatch.org
+5.011,vk.com
+4.977,www.express.co.uk
+4.954,cid-ed6c3e6a5c6608a4.users.storage.live.com
+4.730,www.info-jeunes.net
+4.440,www.melty.fr
+4.335,fr.glee.wikia.com
+4.302,www.audienceinsights.net
+4.080,www.banque-chalus.fr
+3.972,api.bing.com


New feature weights:
       feature      coef
0      morning -1.654926
1          day  2.051596
2      evening -1.743013
3        night  0.000000
4     duration -0.210470
5  day_of_week -0.369546
6        month  0.108145
7       summer -2.121872
8   year_month -2.774654
9       minute  0.002782


In [45]:
# 20 candidate values of C, log-spaced from 1e-2 to 1e2.
c_values = np.logspace(-2, 2, 20)

# Search over C with the same time-ordered CV and ROC AUC scoring used above.
logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                  scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=1)

In [46]:
# Feature matrix used for the grid search: site TF-IDF + baseline time features + minute.
X_train_final = X_train_with_times_1

In [47]:
%%time
# 10 folds x 20 candidates = 200 model fits; the trailing ';' suppresses the estimator repr.
logit_grid_searcher.fit(X_train_final, y_train); 

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [48]:
# best_score_ / best_params_ only exist once fit() has run to completion; if the search
# was interrupted, this line raises AttributeError.
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

AttributeError: 'GridSearchCV' object has no attribute 'best_score_'