### Good ideas

- bag of sites 1,2,3-grams
- time-aware cross-validation

### Notes

In [68]:
# X_train.todense() # MemoryError: Unable to allocate 78.6 GiB for an array with shape (253561, 41592) and data type int64

## Task

In [1]:
# import libraries
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, TimeSeriesSplit
from sklearn.metrics import accuracy_score, roc_auc_score

from scipy.sparse import hstack
import pickle

In [2]:
import warnings
# Suppress every Python warning for the rest of the session. This keeps the
# cell outputs short, but it also hides library deprecation notices and any
# warnings raised while fitting models; comment it out when debugging.
warnings.filterwarnings('ignore')

In [3]:
# Define a `%head` alias that prints the first 10 lines of a file through
# PowerShell's Get-Content, so it only works where `powershell` is available.
%alias head powershell -command "& {Get-Content %s -Head 10}"

In [4]:
# helper function for writing predictions
def write_to_submission_file(predicted_labels, out_file, target='target', index_label='session_id'):
    """Write predictions to a CSV submission file.

    Rows are numbered 1..n in the order the predictions are given, and that
    number is written as the index column of the CSV.

    Parameters
    ----------
    predicted_labels : array-like of shape (n,) or (n, 1)
        Predicted values, one per row. NumPy arrays, lists and pandas Series
        are accepted; a Series' own index is ignored (positional order wins).
    out_file : str or file-like
        Destination handed to ``DataFrame.to_csv``.
    target : str, default 'target'
        Header of the prediction column.
    index_label : str, default 'session_id'
        Header of the index column.
    """
    # Convert up front: a plain list has no ``.shape``, and a Series would be
    # re-aligned on its own index by the DataFrame constructor (yielding NaNs).
    values = np.asarray(predicted_labels)
    row_ids = np.arange(1, values.shape[0] + 1)
    predicted_df = pd.DataFrame(values, index=row_ids, columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

### Data Downloading and Transformation

First, read the training and test sets.

In [78]:
# Read both session files with `session_id` as the row index; only `time1`
# is parsed as a datetime on load. The training rows are put in chronological
# order (by the time of the first visit) right away.
train_df = pd.read_csv(
    './data/train_sessions.csv',
    index_col='session_id',
    parse_dates=['time1'],
).sort_values(by='time1')
test_df = pd.read_csv(
    './data/test_sessions.csv',
    index_col='session_id',
    parse_dates=['time1'],
)

# Integer target labels, in the same (time-sorted) row order as train_df
y_train = train_df['target'].astype('int')

In [79]:
train_df.shape, test_df.shape

((253561, 21), (82797, 20))

In [80]:
y_train.value_counts()

0    251264
1      2297
Name: target, dtype: int64

In [81]:
# Look at the training set
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,,,,,,...,,,,,,,,,,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,,...,,,,,,,,,,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [82]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 253561 entries, 21669 to 204762
Data columns (total 21 columns):
site1     253561 non-null int64
time1     253561 non-null datetime64[ns]
site2     250098 non-null float64
time2     250098 non-null object
site3     246919 non-null float64
time3     246919 non-null object
site4     244321 non-null float64
time4     244321 non-null object
site5     241829 non-null float64
time5     241829 non-null object
site6     239495 non-null float64
time6     239495 non-null object
site7     237297 non-null float64
time7     237297 non-null object
site8     235224 non-null float64
time8     235224 non-null object
site9     233084 non-null float64
time9     233084 non-null object
site10    231052 non-null float64
time10    231052 non-null object
target    253561 non-null int64
dtypes: datetime64[ns](1), float64(9), int64(2), object(9)
memory usage: 42.6+ MB


In [83]:
# Load the pickled website dictionary (site name -> integer id).
# NOTE: pickle.load can execute arbitrary code; only unpickle trusted files.
with open('./data/site_dic.pkl', mode='rb') as input_file:
    site_dict = pickle.load(input_file)

# Turn the dictionary into a one-column frame indexed by the site id
sites_dict = pd.DataFrame(
    {'site': list(site_dict)},
    index=list(site_dict.values()),
)
print('Websites total:', len(sites_dict))
sites_dict.head()

Websites total: 48371


Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


**Transform the data into a format that can be fed into `CountVectorizer`**

In [84]:
# Column names site1 .. site10
sites = ['site{}'.format(pos) for pos in range(1, 11)]

# Write each session as one space-separated line of integer site ids
# (missing visits become 0), without index or header rows.
(train_df[sites]
 .fillna(0)
 .astype('int')
 .to_csv('train_sessions_text.txt', sep=' ', index=False, header=False))
(test_df[sites]
 .fillna(0)
 .astype('int')
 .to_csv('test_sessions_text.txt', sep=' ', index=False, header=False))
train_df.shape, test_df.shape

((253561, 21), (82797, 20))

In [85]:
%head train_sessions_text.txt

56 55 0 0 0 0 0 0 0 0
56 55 56 55 0 0 0 0 0 0
946 946 951 946 946 945 948 784 949 946
945 948 949 948 945 946 947 945 946 946
947 950 948 947 950 952 946 951 946 947
952 947 953 946 947 946 953 955 946 947
953 947 946 953 955 947 953 946 953 1033
946 947 954 953 946 954 946 956 957 956
946 956 946 946 955 954 946 946 946 948
948 946 948 784 49 53 812 982 52 52


**Fit `CountVectorizer` and transform data with it**

In [86]:
%%time
# Treat every session (one line of site ids) as a document and build a bag of
# 1-, 2- and 3-grams of site ids, capped at 50,000 features.
# NOTE(review): with CountVectorizer's default token_pattern only tokens of two
# or more characters are kept, so the '0' padding never becomes a feature, but
# neither do the single-digit site ids 1-9. Confirm this is intended.
cv = CountVectorizer(ngram_range=(1, 3), max_features=50000)
with open('train_sessions_text.txt') as input_train_file:
    # The vocabulary is learned from the training sessions only ...
    X_train = cv.fit_transform(input_train_file)
with open('test_sessions_text.txt') as input_test_file:
    # ... and reused unchanged to encode the test sessions.
    X_test = cv.transform(input_test_file)

print(X_train.shape, X_test.shape)

(253561, 50000) (82797, 50000)
Wall time: 26.7 s


### Training the first model

In [87]:
# Baseline classifier. The solver is pinned to 'liblinear', the one shown in
# this notebook's recorded fit output, so the results do not depend on which
# default solver the installed scikit-learn version uses.
logit = LogisticRegression(C=1, random_state=17, max_iter=500, solver='liblinear')

In [88]:
%%time
# 5-fold cross-validated ROC AUC for the baseline model.
# NOTE(review): an integer `cv` presumably gives plain (stratified) k-fold,
# which ignores the chronological order of the sessions. Compare with the
# time series cross-validation section further down.
cv_scores = cross_val_score(estimator=logit, X=X_train, y=y_train, cv=5, scoring='roc_auc')

Wall time: 1min 16s


In [89]:
cv_scores, cv_scores.mean()

(array([ 0.89839966,  0.81991253,  0.86607294,  0.90081044,  0.910932  ]),
 0.87922551345871847)

In [90]:
%%time
# Refit the baseline model on all training sessions (bag-of-sites features only)
logit.fit(X_train, y_train)

Wall time: 28.4 s


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [57]:
# CV 0.885
# logit_test_pred = logit.predict_proba(X_test)[:, 1]
# write_to_submission_file(predicted_labels=logit_test_pred, out_file='logit_subm1.txt') # 0.90703 ROC AUC Public LB

In [38]:
# CV 0.962 CV without sorting
# logit_test_pred = logit.predict_proba(X_test)[:, 1]
# write_to_submission_file(predicted_labels=logit_test_pred, out_file='logit_subm2.txt') # 0.90703 ROC AUC Public LB

In [65]:
# CV 0.879 + 2-grams + 3-grams ???
# logit_test_pred = logit.predict_proba(X_test)[:, 1]
# write_to_submission_file(predicted_labels=logit_test_pred, out_file='logit_subm4.txt') # 0.91288

### Time series cross-validation

In [91]:
# Time-aware splitter with 10 folds. The training rows were sorted by `time1`,
# so each fold trains on an initial stretch of sessions and validates on the
# block that follows it (the fold sizes are listed in the next cell).
time_split = TimeSeriesSplit(n_splits=10)

In [92]:
# Shapes of the (train, validation) index arrays produced by every split
[(train_idx.shape, valid_idx.shape) for train_idx, valid_idx in time_split.split(X_train)]

[((23051,), (23051,)),
 ((46102,), (23051,)),
 ((69153,), (23051,)),
 ((92204,), (23051,)),
 ((115255,), (23051,)),
 ((138306,), (23051,)),
 ((161357,), (23051,)),
 ((184408,), (23051,)),
 ((207459,), (23051,)),
 ((230510,), (23051,))]

**Perform time series cross-validation with logistic regression**

In [93]:
# Re-create the classifier for the time-aware experiments. Unlike the earlier
# baseline, `max_iter` is not set here, so the library default applies.
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')

In [94]:
%%time
cv_scores = cross_val_score(logit, X_train, y_train, cv=time_split, scoring='roc_auc', n_jobs=-1)

Wall time: 1min 6s


In [95]:
cv_scores, cv_scores.mean()

(array([ 0.83141992,  0.64671142,  0.87992077,  0.9631551 ,  0.84221742,
         0.87840646,  0.94476054,  0.85321691,  0.92987691,  0.90752702]),
 0.8677212449964109)

**Train logistic regression with all training data, make predictions for test set and form a submission file**

In [71]:
# Fit on the complete training set before predicting for the test sessions
logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
# CV 0.866
# logit_test_pred = logit.predict_proba(X_test)[:, 1]
# write_to_submission_file(logit_test_pred, 'logit_subm3.csv') # 0.90804

In [72]:
# CV 0.868 + n-grams (1,2,3)
# logit_test_pred = logit.predict_proba(X_test)[:, 1]
# write_to_submission_file(logit_test_pred, 'logit_subm5.csv') # 0.91288

**Now we'll add some time features: indicators of morning, day, evening and night**

In [73]:
def add_time_features(df, X_sparse):
    """Append four part-of-day indicator columns to a sparse feature matrix.

    The hour of each row's ``time1`` value is turned into four 0/1 columns,
    appended to the right of ``X_sparse`` in this order:

    * morning: hours 7-11
    * day:     hours 12-18
    * evening: hours 19-23
    * night:   hours 0-6

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time1`` column holding datetimes (or values that
        ``pd.to_datetime`` can parse), one row per row of ``X_sparse``.
    X_sparse : scipy sparse matrix of shape (n_rows, n_features)
        Existing features; not modified.

    Returns
    -------
    scipy sparse matrix of shape (n_rows, n_features + 4)
        ``X_sparse`` with the four indicator columns stacked on the right.
    """
    # Vectorised hour extraction instead of calling a Python lambda per row.
    hour = pd.to_datetime(df['time1']).dt.hour

    # `between` is inclusive on both ends, matching the >= / <= bounds above.
    part_of_day = np.column_stack([
        hour.between(7, 11).astype('int').values,    # morning
        hour.between(12, 18).astype('int').values,   # day
        hour.between(19, 23).astype('int').values,   # evening
        hour.between(0, 6).astype('int').values,     # night
    ])
    return hstack([X_sparse, part_of_day])

In [96]:
%%time
# Extend the bag-of-sites matrices with the part-of-day indicators.
# `fillna(0)` hands a filled copy to the helper, which itself only reads
# the `time1` column.
X_train_new = add_time_features(train_df.fillna(0), X_train)
X_test_new = add_time_features(test_df.fillna(0), X_test)

Wall time: 3.79 s


In [97]:
X_train_new.shape, X_test_new.shape

((253561, 50004), (82797, 50004))

**Performing time series cross-validation, we see an improvement in ROC AUC**

In [98]:
%%time
cv_scores = cross_val_score(logit, X_train_new, y_train, cv=time_split, scoring='roc_auc', n_jobs=-1)

Wall time: 1min 13s


In [99]:
cv_scores, cv_scores.mean()

(array([ 0.87652191,  0.75129605,  0.93062022,  0.978644  ,  0.90399606,
         0.93831555,  0.96249405,  0.92731303,  0.9488597 ,  0.94043603]),
 0.91584966011188196)

**Making a new submission, we notice a leaderboard score improvement as well. Correlated CV and LB improvements are a good justification that the added features are useful and that the CV scheme is correct.**

In [100]:
# Fit on the complete training set, including the part-of-day indicators
logit.fit(X_train_new, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [101]:
# CV 0.916
# Keep column 1 of predict_proba: the probability of the positive class
# (target == 1), which is what gets written to the submission file.
logit_test_pred = logit.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred, 'logit_subm6.csv') # 0.93842

**Now we tune the regularization parameter `C`**

In [103]:
# Ten candidate values for C, evenly spaced on a log scale from 1e-2 to 1e2
c_values = np.logspace(-2, 2, num=10)

# Search over C, scoring each candidate by ROC AUC on the chronological folds
logit_grid_searcher = GridSearchCV(
    logit,
    {'C': c_values},
    cv=time_split,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=4,
)

In [104]:
%%time
# Runs every (C, fold) combination: 10 candidates x 10 folds = 100 fits
logit_grid_searcher.fit(X_train_new, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   30.7s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 12.1min finished


Wall time: 12min 19s


GridSearchCV(cv=TimeSeriesSplit(n_splits=10), error_score='raise',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'C': array([  1.00000e-02,   2.78256e-02,   7.74264e-02,   2.15443e-01,
         5.99484e-01,   1.66810e+00,   4.64159e+00,   1.29155e+01,
         3.59381e+01,   1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=4)

In [105]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.91737737315316847, {'C': 0.21544346900318834})

In [106]:
# CV 0.917
# The searcher was built with refit=True (see its repr above), so predicting
# through it uses the best-C model refit on the whole training set.
# Column 1 of predict_proba is the probability of the positive class.
logit_test_pred = logit_grid_searcher.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred, 'logit_subm7.csv') # 0.94242