In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [3]:
# a helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                            target='target', index_label='session_id'):
    predicted_df = pd.DataFrame(predicted_labels,
                               index=np.arange(1, predicted_labels.shape[0] + 1),
                               columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

## Read training and test sets, sort train by session start time

In [4]:
train_df = pd.read_csv('../../../mlcourse_data/alice_competition/train_sessions.csv', 
                       index_col='session_id')
test_df = pd.read_csv('../../../mlcourse_data/alice_competition/test_sessions.csv',
                     index_col='session_id')

In [5]:
train_df.shape, test_df.shape

((253561, 21), (82797, 20))

In [6]:
# Convert time1, ..., time10 columns to datetime type
times = ['time%s' % i for i in range(1,11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# Sort the data by time
train_df = train_df.sort_values(by='time1')

train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [7]:
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites].fillna(0).astype('int').to_csv('train_sessions_text.csv', sep=' ', index=None, header=None)
test_df[sites].fillna(0).astype('int').to_csv('test_sessions_text.csv', sep=' ', index=None, header=None)

In [8]:
train_df['target'].value_counts(normalize=True)

0    0.990941
1    0.009059
Name: target, dtype: float64

## Bag of Words

In [17]:
cv = CountVectorizer(ngram_range=(1,1))

In [18]:
%%time
with open('train_sessions_text.csv') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('test_sessions_text.csv') as inp_test_file:
    X_test = cv.transform(inp_test_file)

Wall time: 5.38 s


In [19]:
print(X_train.shape, X_test.shape)

(253561, 41592) (82797, 41592)


In [20]:
y_train = train_df['target'].astype('int')
y_train.shape

(253561,)

## Logistic Regression

In [21]:
logit = LogisticRegression(C=1, random_state=17)

In [22]:
%%time
cv_scores = cross_val_score(logit, X_train, y=y_train, cv=5, scoring='roc_auc', n_jobs=-1)
cv_scores

Wall time: 24.3 s


array([0.91381471, 0.829748  , 0.87640714, 0.89222801, 0.9135833 ])

In [23]:
cv_scores.mean()

0.8851562299835702

In [16]:
%%time
logit.fit(X_train, y_train)

Wall time: 9.06 s


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=17, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
test_pred_logit1 = logit.predict_proba(X_test)[:,1]
test_pred_logit1

array([2.42979669e-03, 5.10617111e-09, 1.88435239e-08, ...,
       8.84391281e-03, 4.74572372e-04, 2.26319367e-05])

In [18]:
# CV 0.885
write_to_submission_file(test_pred_logit1, '../../../mlcourse_data/alice_competition/submissions/logit_subm1.txt')
# .90804 ROC AUC Public LB

### Time features

In [24]:
def add_time_features(df, X_sparse):
    hour = df.apply(lambda ts: ts.hour)
    morning = ((hour >= 7) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    X = hstack([X_sparse, morning.values.reshape(-1, 1), 
                day.values.reshape(-1, 1), evening.values.reshape(-1, 1), 
                night.values.reshape(-1, 1)])
    return X

In [25]:
train_df['time1'].head()

session_id
21669    2013-01-12 08:05:57
54843    2013-01-12 08:37:23
77292    2013-01-12 08:50:13
114021   2013-01-12 08:50:17
146670   2013-01-12 08:50:20
Name: time1, dtype: datetime64[ns]

In [26]:
X_train_with_time = add_time_features(train_df['time1'].fillna(0), X_train)
X_test_with_time = add_time_features(test_df['time1'].fillna(0), X_test)

In [27]:
X_train.shape, X_train_with_time.shape

((253561, 41592), (253561, 41596))

In [29]:
%%time
logit_with_time = LogisticRegression(C=1, random_state=17)
cv_scores_with_time = cross_val_score(logit_with_time, X_train_with_time, y=y_train, cv=5, scoring='roc_auc', n_jobs=-1)
print(cv_scores_with_time)
print(cv_scores_with_time.mean())

[0.9251619  0.9053068  0.93167483 0.94366243 0.94800487]
0.9307621655838034
Wall time: 18.9 s


In [30]:
%%time
logit_with_time.fit(X_train_with_time, y_train)

Wall time: 7.57 s


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=17, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:
test_pred_logit_with_time = logit_with_time.predict_proba(X_test_with_time)[:,1]

In [60]:
# CV 0.931
write_to_submission_file(test_pred_logit_with_time, '../../../mlcourse_data/alice_competition/submissions/subm_logit_with_time.txt')
# .93565 ROC AUC Public LB

## Time series cross-validation

### Simple Logit with BOW

In [33]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV

In [34]:
cv = CountVectorizer(ngram_range=(1,3), max_features=50000)

In [35]:
%%time
with open('train_sessions_text.csv') as inp_train_file:
    X_train2 = cv.fit_transform(inp_train_file)
with open('test_sessions_text.csv') as inp_test_file:
    X_test2 = cv.transform(inp_test_file)

Wall time: 23.5 s


In [36]:
X_train2.shape, X_test2.shape

((253561, 50000), (82797, 50000))

In [38]:
time_split = TimeSeriesSplit(n_splits=10)

In [39]:
[(el[0].shape, el[1].shape) for el in time_split.split(X_train)]

[((23051,), (23051,)),
 ((46102,), (23051,)),
 ((69153,), (23051,)),
 ((92204,), (23051,)),
 ((115255,), (23051,)),
 ((138306,), (23051,)),
 ((161357,), (23051,)),
 ((184408,), (23051,)),
 ((207459,), (23051,)),
 ((230510,), (23051,))]

In [40]:
logit2 = LogisticRegression(C=1, random_state=17, solver='liblinear', n_jobs=-1)

In [41]:
cv_score2 =cross_val_score(logit2, X_train2, y_train, scoring='roc_auc', cv=time_split, n_jobs=-1)

In [42]:
cv_score2, cv_score2.mean()

(array([0.83141992, 0.64671094, 0.87991757, 0.9631551 , 0.84221316,
        0.87840646, 0.94475732, 0.85321988, 0.92987836, 0.90752868]),
 0.8677207383703991)

In [43]:
logit2.fit(X_train2, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=-1, penalty='l2', random_state=17,
                   solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [46]:
# Time Series CV 0.868
test_pred_logit2 = logit2.predict_proba(X_test2)[:,1]
write_to_submission_file(test_pred_logit2, '../../../mlcourse_data/alice_competition/submissions/subm_logit2.txt')
# .91288 ROC AUC Public LB

### Logit with BOW + time features

In [47]:
X_train_with_time2 = add_time_features(train_df['time1'].fillna(0), X_train2)
X_test_with_time2 = add_time_features(test_df['time1'].fillna(0), X_test2)

In [49]:
X_train2.shape, X_train_with_time2.shape

((253561, 50000), (253561, 50004))

In [50]:
%%time
cv_scores_with_time2 = cross_val_score(logit2, X_train_with_time2, y_train, scoring='roc_auc', n_jobs=-1, cv=time_split)

Wall time: 53.1 s


In [51]:
cv_scores_with_time2, cv_scores_with_time2.mean()

(array([0.87652191, 0.75129621, 0.93061782, 0.978644  , 0.90398896,
        0.93831379, 0.96249244, 0.92731291, 0.94886187, 0.94043537]),
 0.9158485288135818)

In [53]:
C_values = np.logspace(-2,2,10)

logit_grid_searcher = GridSearchCV(logit2, param_grid={'C': C_values}, scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=1)

In [54]:
%%time
logit_grid_searcher.fit(X_train_with_time2, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 10.1min finished


Wall time: 10min 19s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
             error_score='raise-deprecating',
             estimator=LogisticRegression(C=1, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=-1, penalty='l2',
                                          random_state=17, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': array([1.00000000e-02, 2.78255940e-02, 7.74263683e-02, 2.15443469e-01,
       5.99484250e-01, 1.66810054e+00, 4.64158883e+00, 1.29154967e+01,
       3.59381366e+01, 1.00000000e+02])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=Fa

In [56]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.9173783399548194, {'C': 0.21544346900318834})

In [61]:
# Time Series CV 0.917
test_pred_logit_with_time2 = logit_grid_searcher.predict_proba(X_test_with_time2)[:,1]
write_to_submission_file(test_pred_logit_with_time2, '../../../mlcourse_data/alice_competition/submissions/subm_logit_with_time2.txt')
# .94242 ROC AUC Public LB