In [22]:
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import seaborn as sns
sns.set()

In [23]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Write predictions to a CSV submission file.

    Parameters
    ----------
    predicted_labels : array-like with a ``shape`` attribute
        One prediction per row; row ``k`` (0-based) is written with id ``k + 1``.
    out_file : str or file-like
        Destination handed straight to ``DataFrame.to_csv``.
    target : str
        Header of the prediction column.
    index_label : str
        Header of the id column.
    """
    n_rows = predicted_labels.shape[0]
    # Ids are 1-based and consecutive: 1, 2, ..., n_rows.
    row_ids = np.arange(1, n_rows + 1)
    submission = pd.DataFrame(predicted_labels, index=row_ids, columns=[target])
    submission.to_csv(out_file, index_label=index_label)

In [24]:
# Column names of the ten timestamps and the ten site ids of a session.
times = ['time%s' % i for i in range(1, 11)]
sites = ['site%s' % i for i in range(1, 11)]

# Read both tables with the timestamp columns parsed as datetimes; the training
# rows are ordered by the first timestamp of each session.
train_df = pd.read_csv(
    'train_sessions.csv', index_col='session_id', parse_dates=times
).sort_values(by='time1')
test_df = pd.read_csv('test_sessions.csv', index_col='session_id', parse_dates=times)

# Dump the site ids as space-separated integers, one session per line and with
# neither index nor header; missing sites are written as 0.
for frame, text_path in ((train_df, 'train_sessions_text.txt'),
                         (test_df, 'test_sessions_text.txt')):
    site_ids = frame[sites].fillna(0).astype('int')
    site_ids.to_csv(text_path, sep=' ', index=None, header=None)

In [25]:
%%time
# Integer labels as a plain array, taken from train_df in its current row order.
y_train = train_df['target'].astype('int').values

# Counts of site-id 1- to 3-grams, capped at 50000 features.
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of two
# or more characters, so the 0 padding and single-digit site ids are not
# counted - confirm this is intended.
cv = CountVectorizer(max_features=50000, ngram_range=(1, 3))

# The vocabulary is learned from the training sessions only ...
with open('train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)

# ... and then applied unchanged to the test sessions.
with open('test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)

Wall time: 28.5 s


In [26]:
# Target as a pandas Series (this rebinds the y_train name used by later cells).
y_train = train_df['target']

# Number of training rows, i.e. the position where the test rows start in full_df.
idx_split = len(train_df)

# Training rows (without the target column) stacked on top of the test rows.
full_df = pd.concat([train_df.drop(columns='target'), test_df])

In [27]:
def cross_val_score_lr(X_train, y_train, C = 1):
    """Cross-validate a liblinear logistic regression on time-ordered folds.

    Parameters
    ----------
    X_train : feature matrix accepted by ``cross_val_score``
    y_train : labels aligned with the rows of ``X_train``
    C : float
        Inverse regularization strength passed to ``LogisticRegression``.

    Returns
    -------
    The array of per-fold ROC AUC scores (10 folds from ``TimeSeriesSplit``).
    """
    return cross_val_score(
        LogisticRegression(C=C, random_state=17, solver='liblinear'),
        X_train,
        y_train,
        cv=TimeSeriesSplit(n_splits=10),
        scoring='roc_auc',
        n_jobs=1,
    )

In [28]:
%%time
# Baseline: cross-validate on the n-gram count matrix alone, with the default C = 1.
scores = cross_val_score_lr(X_train, y_train)

Wall time: 1min 28s


In [29]:
# Per-fold ROC AUC of the baseline run, followed by the mean over the folds.
scores, scores.mean()

(array([0.83141992, 0.64671142, 0.87992077, 0.9631551 , 0.84221742,
        0.87840646, 0.94476054, 0.85321691, 0.92987691, 0.90752702]),
 0.8677212449964109)

### Adding new features ###

In [1]:
def add_time_features(df, X_sparse, neutral_hours=(10, 12, 13, 16, 17, 18)):
    """Append session-time features to a sparse feature matrix.

    The following columns are appended to ``X_sparse``, in this order:

    * 24 start-hour columns (hour 0 .. hour 23): column ``i`` is 1 for rows
      whose ``time1`` falls in hour ``i``; for all other rows it is 0 when
      ``i`` is listed in ``neutral_hours`` and -1 when it is not;
    * ``year * 100 + month`` of ``time1``, standardized;
    * weekday of ``time1`` (Monday = 0), unscaled;
    * session duration (latest minus earliest timestamp over the ``times``
      columns), standardized.

    Parameters
    ----------
    df : pandas.DataFrame
        Sessions frame containing ``time1`` and every column named in the
        module-level ``times`` list, with one row per row of ``X_sparse``.
    X_sparse : scipy sparse matrix
        Existing features; left unmodified.
    neutral_hours : iterable of int, optional
        Hours whose indicator column uses 0 instead of -1 as its "off" value.
        The default reproduces the previously hard-coded list.

    Returns
    -------
    scipy sparse matrix
        ``X_sparse`` with 27 extra columns stacked on the right.

    Notes
    -----
    A new ``StandardScaler`` is fit on every call, so each frame passed in is
    standardized with its own mean and variance.
    NOTE(review): if train and test are meant to share one scaling, the scalers
    have to be fit on the training frame and reused - confirm which is intended.
    """
    # Accepts both datetime64 columns and object columns holding Timestamps.
    start = pd.to_datetime(df['time1'])

    # Hour indicators via broadcasting: an (n_rows, 1) column of start hours is
    # compared against the 24 hour ids instead of 24 row-wise apply passes.
    hour_ids = np.arange(24)
    off_value = np.where(np.isin(hour_ids, list(neutral_hours)), 0, -1)
    start_hour = start.dt.hour.to_numpy()
    hour_block = np.where(start_hour[:, None] == hour_ids, 1, off_value)

    # Year and month folded into one increasing number (e.g. 201402), then standardized.
    year_month = (start.dt.year * 100 + start.dt.month).to_numpy(dtype='float64')
    scaled_month = StandardScaler().fit_transform(year_month.reshape(-1, 1))

    weekday = start.dt.weekday.to_numpy().reshape(-1, 1)

    # Convert the timedelta to float seconds explicitly rather than relying on
    # the scaler to coerce a timedelta column; standardization is unaffected by
    # the unit, so the values match up to floating-point rounding.
    duration = df[times].max(axis=1) - df[times].min(axis=1)
    duration_sec = duration.dt.total_seconds().to_numpy().reshape(-1, 1)
    scaled_duration = StandardScaler().fit_transform(duration_sec)

    X = hstack([X_sparse,
                hour_block,
                scaled_month,
                weekday,
                scaled_duration
               ])

    return X

In [46]:
%%time
# Extend the n-gram matrices with the time-based columns.
# NOTE(review): add_time_features fits a new StandardScaler on each call, so the
# test matrix is standardized with test-set statistics rather than the training
# ones - confirm this is intended.
X_train_new = add_time_features(train_df, X_train)
X_test_new = add_time_features(test_df, X_test)

Wall time: 1min 8s


In [38]:
%%time
# Cross-validate the extended feature matrix.
# NOTE(review): how this C value was chosen is not shown in the notebook (it
# equals 10 ** (-2 / 3), so presumably it came from a log-spaced search) - confirm.
# NOTE(review): the execution counts are out of order (the feature-building cell
# is In [46], this one In [38]); re-run top to bottom to make sure the recorded
# scores match the current features.
scores = cross_val_score_lr(X_train_new, y_train, C=0.21544346900318834)

Wall time: 1min 22s


In [39]:
# Per-fold ROC AUC with the time features added, followed by the mean over the folds.
scores, scores.mean()

(array([0.70637378, 0.79189891, 0.97583175, 0.96016022, 0.93665852,
        0.97460733, 0.91460119, 0.95727549, 0.97530845, 0.97908812]),
 0.9171803768067142)

In [50]:
# Train on the whole extended training matrix with the C used in the
# cross-validation run; fit() returns the estimator itself.
logit = LogisticRegression(
    solver='liblinear', random_state=17, C=0.21544346900318834
).fit(X_train_new, y_train)

# Column 1 of predict_proba holds the probability of the second class in logit.classes_.
y_test = logit.predict_proba(X_test_new)[:, 1]

# Scores recorded by the author: 0.9171803768067142 (own CV), 0.95378 (public LB).
write_to_submission_file(y_test, 'subm.csv')

In [51]:
# Last 30 coefficients of the fitted model. add_time_features appends 27 columns
# (24 hour columns, scaled year-month, weekday, scaled duration), so the first 3
# values shown belong to n-gram features and the remaining 27 to the time features.
logit.coef_[0][-30:]

array([-0.00413191, -0.00299359, -0.00512731,  1.22616832,  1.22616832,
        1.22616832,  1.22616832,  1.22616832,  1.22616832,  1.22616832,
        0.51980673, -1.2406601 , -0.53062623, -2.67722814, -1.33270465,
        1.32094346,  0.98424056, -1.1741403 , -0.24360761,  2.62668123,
        2.47454044,  2.49684017,  0.09223302,  0.33191412, -0.12119003,
        0.08746918,  0.19498534, -0.54908495, -0.33852789, -0.241325  ])