In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [3]:
# a helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label='session_id'):
    """Write predictions to a CSV submission file.

    Rows are numbered 1..n in the index column, in the order in which the
    predictions are given.

    Parameters
    ----------
    predicted_labels : array-like
        One prediction per row (a 1-D sequence or an (n, 1) array). Lists and
        pandas Series are accepted; a Series' own index is ignored.
    out_file : str or file-like
        Destination, passed straight to ``DataFrame.to_csv``.
    target : str, default 'target'
        Header of the prediction column.
    index_label : str, default 'session_id'
        Header of the index column.
    """
    # Coerce to ndarray first: a list has no ``.shape``, and a Series would be
    # re-aligned against the new 1..n index instead of being taken by position.
    predicted_labels = np.asarray(predicted_labels)
    predicted_df = pd.DataFrame(predicted_labels,
                                index=np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

### Read training and test sets, sort train by session start time

In [4]:
# Load the raw session tables; session_id becomes the row index of each frame.
train_path = '../../../mlcourse_data/alice_competition/train_sessions.csv'
test_path = '../../../mlcourse_data/alice_competition/test_sessions.csv'
train_df, test_df = (pd.read_csv(csv_path, index_col='session_id')
                     for csv_path in (train_path, test_path))

In [5]:
train_df['time1'].head()

session_id
1    2014-02-20 10:02:45
2    2014-02-22 11:19:50
3    2013-12-16 16:40:17
4    2014-03-28 10:52:12
5    2014-02-28 10:53:05
Name: time1, dtype: object

In [6]:
# Parse the ten timestamp columns (time1 ... time10) of both frames as datetimes
times = ['time{}'.format(i) for i in range(1, 11)]
for sessions in (train_df, test_df):
    sessions[times] = sessions[times].apply(pd.to_datetime)

# Order the training sessions chronologically by their first timestamp
train_df = train_df.sort_values(by='time1')

train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


### Transform the data into a format that can be fed into CountVectorizer

In [7]:
# Dump every session as one line of ten space-separated site IDs
# (missing visits are written as 0), without index or header.
sites = ['site{}'.format(i) for i in range(1, 11)]
text_dumps = [
    (train_df, '../../../mlcourse_data/alice_competition/train_sessions_text.txt'),
    (test_df, '../../../mlcourse_data/alice_competition/test_sessions_text.txt'),
]
for sessions, text_path in text_dumps:
    site_ids = sessions[sites].fillna(0).astype('int')
    site_ids.to_csv(text_path, sep=' ', index=None, header=None)

In [8]:
!head -5 ../../../mlcourse_data/alice_competition/train_sessions_text.txt

56 55 0 0 0 0 0 0 0 0
56 55 56 55 0 0 0 0 0 0
946 946 951 946 946 945 948 784 949 946
945 948 949 948 945 946 947 945 946 946
947 950 948 947 950 952 946 951 946 947


### Fit CountVectorizer and transform data with it

In [9]:
%%time
# Bag-of-sites encoding: every line of the text dumps (one session) becomes a
# row of token counts.
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of
# two or more characters, so the "0" padding is dropped -- and so would be any
# single-digit site IDs. Confirm this is intended.
cv = CountVectorizer()

# Learn the vocabulary from the training sessions only.
with open('../../../mlcourse_data/alice_competition/train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
# Encode the test sessions with the vocabulary fitted above.
with open('../../../mlcourse_data/alice_competition/test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)

print(X_train.shape, X_test.shape)

(253561, 41592) (82797, 41592)
CPU times: user 2.2 s, sys: 56.8 ms, total: 2.25 s
Wall time: 2.26 s


### Save train targets into a separate vector

In [10]:
y_train = train_df['target'].astype('int')

### Train logistic regression

In [11]:
# Pin the solver explicitly. The scores recorded in this notebook were obtained
# with the old default (liblinear); newer scikit-learn releases default to
# lbfgs, which would change the results and ignore random_state.
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')


In [12]:
%%time
# 5-fold cross-validated ROC AUC of the logistic regression on the site counts.
# NOTE(review): train_df was sorted by time1 before X_train was built, and an
# integer cv on a classifier is not a chronological split, so some folds
# validate on sessions older than the ones they were trained on. Consider a
# time-aware splitter (e.g. TimeSeriesSplit) -- confirm which scheme is wanted.
cv_scores = cross_val_score(logit, X_train, y=y_train, cv=5, scoring='roc_auc')

CPU times: user 46.2 s, sys: 318 ms, total: 46.6 s
Wall time: 11.9 s


In [13]:
cv_scores

array([0.9138141 , 0.82974653, 0.87639947, 0.892229  , 0.91358382])

In [14]:
cv_scores.mean()

0.8851545834530248

In [15]:
%%time
logit.fit(X_train, y_train)

CPU times: user 12.1 s, sys: 98.2 ms, total: 12.2 s
Wall time: 3.12 s


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [16]:
test_pred_logit1 = logit.predict_proba(X_test)[:, 1]
test_pred_logit1[:5]

array([2.42976477e-03, 5.10570240e-09, 1.88447109e-08, 3.00172231e-08,
       3.34389372e-05])

In [17]:
# CV 0.885
write_to_submission_file(test_pred_logit1, '../../../mlcourse_data/alice_competition/submissions/logit_subm1.txt')
# .90804 ROC AUC Public LB

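### Time Features
Binary indicators derived from the hour when the session started (`time1`):
 - morning (hours 7-11)
 - day (hours 12-18)
 - evening (hours 19-23)
 - night (hours 0-6)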

In [18]:
def add_time_features(df, X_sparse):
    """Append four binary part-of-day columns to a sparse feature matrix.

    The hour of ``df['time1']`` is bucketed into morning (7-11), day (12-18),
    evening (19-23) and night (0-6); all bounds are inclusive.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``time1`` column.
    X_sparse : scipy sparse matrix
        Feature matrix with one row per row of ``df``, in the same order.

    Returns
    -------
    scipy sparse matrix
        ``X_sparse`` followed by the columns morning, day, evening, night
        (in that order), as produced by ``scipy.sparse.hstack``.
    """
    # Vectorized accessor instead of a per-row Python lambda.
    hour = df['time1'].dt.hour
    part_of_day = [
        hour.between(7, 11),    # morning
        hour.between(12, 18),   # day
        hour.between(19, 23),   # evening
        hour.between(0, 6),     # night
    ]
    # hstack needs 2-D blocks, so each indicator becomes an (n, 1) column.
    indicator_columns = [flag.astype('int').values.reshape(-1, 1)
                         for flag in part_of_day]
    return hstack([X_sparse] + indicator_columns)

In [19]:
%%time
X_train_with_time = add_time_features(train_df, X_train)
X_test_with_time = add_time_features(test_df, X_test)

CPU times: user 1.29 s, sys: 81.1 ms, total: 1.37 s
Wall time: 721 ms


In [21]:
X_train_with_time.shape, X_test_with_time.shape

((253561, 41596), (82797, 41596))

In [22]:
%%time
cv_scores = cross_val_score(logit, X_train_with_time, y=y_train, cv=5, scoring='roc_auc')

CPU times: user 39.9 s, sys: 248 ms, total: 40.1 s
Wall time: 10.1 s


In [23]:
cv_scores.mean()

0.930763091739961

In [24]:
%%time
logit.fit(X_train_with_time, y_train)

CPU times: user 8.69 s, sys: 53.6 ms, total: 8.75 s
Wall time: 2.21 s


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [25]:
test_pred_logit2 = logit.predict_proba(X_test_with_time)[:, 1]

In [27]:
# CV 0.931
write_to_submission_file(test_pred_logit2, '../../../mlcourse_data/alice_competition/submissions/logit_subm2.txt')
# .93565 ROC AUC Public LB