### Notes

In [68]:
# X_train.todense() # MemoryError: Unable to allocate 78.6 GiB for an array with shape (253561, 41592) and data type int64

### Task

In [83]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split, cross_validate
from sklearn.metrics import accuracy_score, roc_auc_score
from scipy.sparse import hstack
import pickle

In [42]:
# Suppress every Python warning for the rest of the session to keep cell output
# readable. NOTE(review): this also hides sklearn convergence and deprecation
# warnings — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

In [43]:
# Register a `%head` line magic that prints the first 10 lines of a file via
# PowerShell's Get-Content (requires `powershell` to be available on PATH).
%alias head powershell -command "& {Get-Content %s -Head 10}"

In [44]:
# helper function for writing predictions
def write_to_submission_file(predicted_labels, out_file, target='target', index_label='session_id'):
    """Write predictions to a CSV submission file.

    Rows are labelled 1..n in the order the predictions are given.

    Parameters
    ----------
    predicted_labels : array-like
        Predictions, one per row. May be a NumPy array or any plain
        sequence (list, tuple) accepted by ``pd.DataFrame``.
    out_file : str or file-like
        Destination passed straight to ``DataFrame.to_csv``.
    target : str, default 'target'
        Header of the prediction column.
    index_label : str, default 'session_id'
        Header of the 1-based row-id column.
    """
    # len() rather than .shape[0] so plain sequences work as well as arrays
    n_rows = len(predicted_labels)
    predicted_df = pd.DataFrame(predicted_labels, index=np.arange(1, n_rows + 1), columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

**Read training and test sets, sort the training set by session start time**

In [107]:
# Timestamp columns time1, ..., time10
times = ['time%d' % s for s in range(1, 11)]

# Load both splits, then parse the timestamp columns into datetime type
train_df = pd.read_csv('./data/train_sessions.csv')
test_df = pd.read_csv('./data/test_sessions.csv')
for sessions in (train_df, test_df):
    sessions[times] = sessions[times].apply(pd.to_datetime)

# Order the training sessions chronologically by time1
train_df = train_df.sort_values(by='time1')

# Labels are extracted after sorting so they stay aligned with train_df rows
y_train = train_df['target'].astype('int')

In [108]:
# Class balance of the target
y_train.value_counts()

0    251264
1      2297
Name: target, dtype: int64

In [109]:
# Look at the training set (pandas truncates the display to the first and last rows)
train_df

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
21668,21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54842,54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77291,77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114020,114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146669,146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12223,12224,50,2014-04-30 23:33:48,50.0,2014-04-30 23:33:49,48.0,2014-04-30 23:33:52,49.0,2014-04-30 23:33:52,48.0,...,2014-04-30 23:33:53,52.0,2014-04-30 23:33:54,49.0,2014-04-30 23:33:54,303.0,2014-04-30 23:33:57,304.0,2014-04-30 23:34:00,0
164437,164438,4207,2014-04-30 23:34:15,753.0,2014-04-30 23:34:16,753.0,2014-04-30 23:34:17,52.0,2014-04-30 23:34:18,50.0,...,2014-04-30 23:35:16,3346.0,2014-04-30 23:35:29,3359.0,2014-04-30 23:36:12,3346.0,2014-04-30 23:36:42,38.0,2014-04-30 23:37:13,0
12220,12221,52,2014-04-30 23:38:08,3346.0,2014-04-30 23:38:10,784.0,2014-04-30 23:38:13,784.0,2014-04-30 23:38:18,3346.0,...,2014-04-30 23:38:24,3324.0,2014-04-30 23:38:35,7330.0,2014-04-30 23:38:35,3594.0,2014-04-30 23:38:35,3329.0,2014-04-30 23:38:36,0
156967,156968,3328,2014-04-30 23:38:36,3324.0,2014-04-30 23:38:36,3599.0,2014-04-30 23:38:38,3413.0,2014-04-30 23:38:38,753.0,...,2014-04-30 23:38:40,3599.0,2014-04-30 23:38:40,3359.0,2014-04-30 23:39:07,3359.0,2014-04-30 23:39:08,3346.0,2014-04-30 23:39:53,0


In [110]:
# Column dtypes and non-null counts of the training set
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 253561 entries, 21668 to 204761
Data columns (total 22 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   session_id  253561 non-null  int64         
 1   site1       253561 non-null  int64         
 2   time1       253561 non-null  datetime64[ns]
 3   site2       250098 non-null  float64       
 4   time2       250098 non-null  datetime64[ns]
 5   site3       246919 non-null  float64       
 6   time3       246919 non-null  datetime64[ns]
 7   site4       244321 non-null  float64       
 8   time4       244321 non-null  datetime64[ns]
 9   site5       241829 non-null  float64       
 10  time5       241829 non-null  datetime64[ns]
 11  site6       239495 non-null  float64       
 12  time6       239495 non-null  datetime64[ns]
 13  site7       237297 non-null  float64       
 14  time7       237297 non-null  datetime64[ns]
 15  site8       235224 non-null  float64       
 16

In [111]:
# First rows of the test set (not sorted by time, unlike train_df)
test_df.head()

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,site6,time6,site7,time7,site8,time8,site9,time9,site10,time10
0,1,29,2014-10-04 11:19:53,35.0,2014-10-04 11:19:53,22.0,2014-10-04 11:19:54,321.0,2014-10-04 11:19:54,23.0,...,2211.0,2014-10-04 11:19:54,6730.0,2014-10-04 11:19:54,21.0,2014-10-04 11:19:54,44582.0,2014-10-04 11:20:00,15336.0,2014-10-04 11:20:00
1,2,782,2014-07-03 11:00:28,782.0,2014-07-03 11:00:53,782.0,2014-07-03 11:00:58,782.0,2014-07-03 11:01:06,782.0,...,782.0,2014-07-03 11:01:10,782.0,2014-07-03 11:01:23,782.0,2014-07-03 11:01:29,782.0,2014-07-03 11:01:30,782.0,2014-07-03 11:01:53
2,3,55,2014-12-05 15:55:12,55.0,2014-12-05 15:55:13,55.0,2014-12-05 15:55:14,55.0,2014-12-05 15:56:15,55.0,...,55.0,2014-12-05 15:56:17,55.0,2014-12-05 15:56:18,55.0,2014-12-05 15:56:19,1445.0,2014-12-05 15:56:33,1445.0,2014-12-05 15:56:36
3,4,1023,2014-11-04 10:03:19,1022.0,2014-11-04 10:03:19,50.0,2014-11-04 10:03:20,222.0,2014-11-04 10:03:21,202.0,...,3374.0,2014-11-04 10:03:22,50.0,2014-11-04 10:03:22,48.0,2014-11-04 10:03:22,48.0,2014-11-04 10:03:23,3374.0,2014-11-04 10:03:23
4,5,301,2014-05-16 15:05:31,301.0,2014-05-16 15:05:32,301.0,2014-05-16 15:05:33,66.0,2014-05-16 15:05:39,67.0,...,69.0,2014-05-16 15:05:40,70.0,2014-05-16 15:05:40,68.0,2014-05-16 15:05:40,71.0,2014-05-16 15:05:40,167.0,2014-05-16 15:05:44


In [112]:
# Load the pickled websites dictionary.
# NOTE: pickle.load can execute arbitrary code — only open files from a trusted source.
with open('./data/site_dic.pkl', mode='rb') as input_file:
    site_dict = pickle.load(input_file)

# Reshape it into a one-column frame: dictionary values become the index,
# dictionary keys fill the 'site' column
sites_dict = pd.DataFrame({'site': list(site_dict.keys())}, index=list(site_dict.values()))
print(u'Websites total:', len(sites_dict))
sites_dict.head()

Websites total: 48371


Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


**Transform the data into a format which can be fed into `CountVectorizer`**

In [113]:
# Site-id columns site1, ..., site10
sites = ['site%d' % s for s in range(1, 11)]

# Write each session as one space-separated line of integer site ids;
# missing visits (NaN) are written as 0. No header row and no index column.
for sessions, text_path in ((train_df, 'train_sessions_text.txt'),
                            (test_df, 'test_sessions_text.txt')):
    sessions[sites].fillna(0).astype('int').to_csv(text_path, sep=' ', index=False, header=False)

In [114]:
# Show the first 10 lines of the generated text file (uses the %head alias)
%head train_sessions_text.txt

56 55 0 0 0 0 0 0 0 0
56 55 56 55 0 0 0 0 0 0
946 946 951 946 946 945 948 784 949 946
945 948 949 948 945 946 947 945 946 946
947 950 948 947 950 952 946 951 946 947
952 947 953 946 947 946 953 955 946 947
953 947 946 953 955 947 953 946 953 1033
946 947 954 953 946 954 946 956 957 956
946 956 946 946 955 954 946 946 946 948
948 946 948 784 49 53 812 982 52 52


**Fit `CountVectorizer` and transform data with it**

In [128]:
%%time
cv = CountVectorizer() # idea: add n-grams
with open('train_sessions_text.txt') as input_train_file:
    X_train = cv.fit_transform(input_train_file)
with open('test_sessions_text.txt') as input_test_file:
    X_test = cv.transform(input_test_file)
    
print(X_train.shape, X_test.shape)

(253561, 41592) (82797, 41592)
Wall time: 3.36 s


### Train Logistic regression

In [129]:
# Baseline classifier shared by the cross-validation and fitting cells;
# max_iter=500 allows more solver iterations than sklearn's default of 100.
logit = LogisticRegression(C=1, random_state=17, max_iter=500)

In [130]:
%%time
cv_scores = cross_val_score(estimator=logit, X=X_train, y=y_train, cv=5, scoring='roc_auc')
print(cv_scores)
cv_scores.mean()

[0.91384326 0.8319114  0.87616935 0.8912072  0.91287018]
Wall time: 28.2 s


0.8852002773592662

In [101]:
%%time
# Fit on the full training set; this fitted model produces the test predictions
logit.fit(X_train, y_train)

Wall time: 2.86 s


LogisticRegression(C=1, random_state=17)

In [102]:
# Keep the second column of predict_proba: the probability of the positive class (target == 1)
test_pred_logit1 = logit.predict_proba(X_test)[:, 1]
test_pred_logit1

array([2.29625029e-03, 3.44054063e-09, 1.11962356e-08, ...,
       8.71038556e-03, 4.32088081e-04, 1.86131406e-05])

In [57]:
# CV 0.885
# write_to_submission_file(predicted_labels=test_pred_logit1, out_file='logit_subm1.txt') # 0.90703 ROC AUC Public LB

In [103]:
# CV 0.962
# write_to_submission_file(predicted_labels=test_pred_logit1, out_file='logit_subm2.txt') # 0.90703 ROC AUC Public LB

Without sorting the training set by `time1`: CV 0.963

### Time features
- morning (hours 7–11)
- day (hours 12–18)
- evening (hours 19–23)
- night (hours 0–6)

In [65]:
def add_time_features(time1_series, X_sparse):
    """Append four time-of-day indicator columns to a sparse feature matrix.

    The hour of each timestamp is mapped to 0/1 indicators which are
    appended, in this order, to the right of ``X_sparse``:
    morning (7-11), day (12-18), evening (19-23), night (0-6).
    The ranges are inclusive and together cover every hour of the day.

    Parameters
    ----------
    time1_series : pandas.Series
        One timestamp per row of ``X_sparse``. datetime64 values are used
        as-is; other values (e.g. timestamp strings) are parsed with
        ``pd.to_datetime``. Rows are matched by position, not by index.
        Missing timestamps (NaT) get 0 in all four indicator columns.
    X_sparse : scipy sparse matrix
        Feature matrix with as many rows as ``time1_series``.

    Returns
    -------
    scipy sparse matrix
        ``X_sparse`` with the four indicator columns stacked on the right.
    """
    # Vectorized hour extraction instead of a Python-level apply per row
    hour = pd.to_datetime(time1_series).dt.hour

    # Inclusive (first_hour, last_hour) ranges, in output-column order:
    # morning, day, evening, night
    hour_ranges = ((7, 11), (12, 18), (19, 23), (0, 6))
    indicators = [
        hour.between(first, last).astype('int').values.reshape(-1, 1)
        for first, last in hour_ranges
    ]

    X = hstack([X_sparse] + indicators)
    return X

In [80]:
%%time
X_train_with_time = add_time_features(train_df.time1.fillna(0), X_train)
X_test_with_time = add_time_features(test_df.time1.fillna(0), X_test)

Wall time: 1.58 s


In [81]:
# Shapes after appending the time-of-day columns
X_train_with_time.shape, X_test_with_time.shape

((253561, 41596), (82797, 41596))

In [82]:
%%time
cv_scores = cross_val_score(estimator=logit, X=X_train_with_time, y=y_train, cv=5, scoring='roc_auc')
print(cv_scores)
cv_scores.mean()

[0.9248067  0.90742585 0.93214609 0.94361812 0.94776403]
Wall time: 12.3 s


0.9311521580955209