In [157]:
import warnings
import pickle

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack, vstack
from __future__ import division, print_function
from sklearn.metrics import roc_auc_score
from sklearn.base import clone
from tqdm import tqdm 


warnings.filterwarnings('ignore')
PATH_TO_DATA = 'data/' 

In [2]:
time_cols = ['time%d' % i for i in range(1, 11)]
site_cols = ['site%d' % i for i in range(1, 11)]

train_df = pd.read_csv(PATH_TO_DATA + 'train_sessions.csv', index_col='session_id', parse_dates=time_cols)
test_df = pd.read_csv(PATH_TO_DATA + 'test_sessions.csv', index_col='session_id', parse_dates=time_cols)

with open(PATH_TO_DATA + 'site_dic.pkl', 'rb') as site_file:
     sites_dict = pickle.load(site_file)
        
id_sites_dict = {v: k for k, v in sites_dict.items()}

In [3]:
def split_data(X_data, y_data):
    grouped = train_df[['target']].groupby(by='target')
    
    train_ids = []
    valid_ids = []
    
    for g in tqdm(grouped.groups.keys()):
        train_shape = int(grouped.get_group(g).shape[0] * 0.7)

        ids_to_train = grouped.get_group(g).index[:train_shape]
        ids_to_valid = grouped.get_group(g).index[train_shape:]

        train_ids.extend(ids_to_train)
        valid_ids.extend(ids_to_valid)
        
    train_ids = np.array(train_ids) - 1
    valid_ids = np.array(valid_ids) - 1
        
    return X_data.tocsc()[train_ids], y_data[train_ids], X_data.tocsc()[valid_ids], y_data[valid_ids]

In [6]:
def get_dense_matrix(matrix):
    site_ids = list(id_sites_dict)
    X = matrix.values
    
    i = 0
    data = list()
    col = list()
    rows = list()
    for row in tqdm(X):
        unique, counts = np.unique(row, return_counts=True)
        dic = dict(zip(unique, counts))
        for k in dic:
            if k != 0:
                data.append(dic[k])
                rows.append(i)
                col.append(k-1)
            
        i += 1
    X_sparse = csr_matrix((data, (rows, col)), shape=(X.shape[0], len(site_ids)))
    return X_sparse

In [7]:
def score(model, X_train, y_train, X_valid, y_valid):
    model.fit(X_train, y_train)
    valid_score = model.predict_proba(X_valid)
    print(roc_auc_score(y_valid, valid_score[:, 1:]))

In [8]:
def write_to_submission_file(predicted_labels, out_file, target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels, index = np.arange(1, predicted_labels.shape[0] + 1), columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [9]:
def make_submission(model, X_train, y_train, X_test):
    print(X_train.shape)
    print(X_test.shape)
    model.fit(X_train, y_train)
    test_pred_proba = model.predict_proba(X_test)
    write_to_submission_file(test_pred_proba[:, 1:], 'result.csv')

In [10]:
def exptact_time_features(data):

    day_offset = 24
    month_offset = day_offset + 7
    morning_offset = month_offset + 12
    evening_offset = morning_offset + 1
    row_size = evening_offset + 2
    values = []

    for _, row in tqdm(data.iterrows()):
        
        time = row[time_cols[0]]

        r = np.zeros(row_size)
        r[time.hour] += 1
        r[day_offset + time.dayofweek] += 1
        r[month_offset + time.month] += 1
        r[morning_offset] = time.hour < 11
        r[evening_offset] = time.hour > 19
        values.append(r[1:])
        
    return csr_matrix(values)

In [11]:
def unique(data):
    return csr_matrix([[sum(1 for s in np.unique(row.values) if s != 0)] for _, row in tqdm(data.iterrows())])

In [12]:
str_train = [' '.join([id_sites_dict[idx] for idx in row.values if idx in id_sites_dict]) for _, row in train_df[site_cols].iterrows()]
str_test = [' '.join([id_sites_dict[idx] for idx in row.values if idx in id_sites_dict]) for _, row in test_df[site_cols].iterrows()]

In [11]:
%%time
tfidf = TfidfVectorizer(ngram_range = (1, 2)).fit(np.array(str_train))
X_train_idf = tfidf.transform(np.array(str_train))
X_test_idf = tfidf.transform(np.array(str_test))

Wall time: 37.3 s


In [12]:
X_tmp_train = hstack((X_train_idf, 
                      exptact_time_features(train_df[time_cols]),
                      unique(train_df[site_cols].fillna(0).astype('int'))))

X_tmp_test = hstack((X_test_idf, 
                     exptact_time_features(test_df[time_cols]),
                     unique(test_df[site_cols].fillna(0).astype('int'))))

253561it [00:19, 13046.28it/s]
253561it [00:19, 13014.67it/s]
82797it [00:06, 12494.42it/s]
82797it [00:07, 11125.20it/s]


In [13]:
X_train, y_train, X_valid, y_valid = split_data(X_tmp_train, train_df['target'].values.astype('int64'))

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 39.19it/s]


((177491, 129384), (177491,), (76070, 129384), (76070,))

In [14]:
%%time
logit_c_values = np.logspace(-4, 2, 10)

skf = StratifiedKFold(n_splits=3, random_state=17)

logit_grid_searcher = LogisticRegressionCV(Cs=logit_c_values, cv=skf, n_jobs=-1)
logit_grid_searcher.fit(X_train, y_train)

Wall time: 4min 45s


In [15]:
logit_mean_cv_scores = next (iter (logit_grid_searcher.scores_.values())).mean(axis=0)
pd.Series(logit_mean_cv_scores, index=logit_grid_searcher.Cs_).sort_values(ascending=False)

21.544347     0.995014
100.000000    0.994929
4.641589      0.994867
1.000000      0.994011
0.215443      0.993081
0.046416      0.991966
0.010000      0.990946
0.002154      0.990946
0.000464      0.990946
0.000100      0.990946
dtype: float64

In [16]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X_train, y_train, X_valid, y_valid)

0.990094285571
Wall time: 28.6 s


In [17]:
%%time
y = train_df['target'].values.astype('int64')
make_submission(LogisticRegression(C=21.544347, n_jobs=-1), X_tmp_train, y, X_tmp_test)

(253561, 129384)
(82797, 129384)
Wall time: 52.6 s


In [15]:
X_train_sparse = get_dense_matrix(train_df[site_cols].fillna(0).astype('int'))
X_train_time_features = exptact_time_features(train_df[time_cols])
X_Train_unique = unique(train_df[site_cols].fillna(0).astype('int'))

100%|███████████████████████████████████████████████████████████████████████| 253561/253561 [00:08<00:00, 28589.36it/s]
253561it [00:19, 13235.60it/s]
253561it [00:23, 10843.80it/s]


In [16]:
X_tmp_train = hstack((X_train_sparse, X_train_time_features, X_Train_unique))

In [17]:
X_train, y_train, X_valid, y_valid = split_data(X_tmp_train, train_df['target'].values.astype('int64'))

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 38.07it/s]


In [18]:
pd.Series(y_train).value_counts()

0    175884
1      1607
dtype: int64

In [19]:
pd.Series(y_valid).value_counts()

0    75380
1      690
dtype: int64

In [29]:
y = train_df['target'].values.astype('int64')
X_train, X_valid, y_train, y_valid = train_test_split(X_tmp_train, y, train_size =0.7, stratify=y)

In [30]:
pd.Series(y_train).value_counts()

0    175884
1      1608
dtype: int64

In [31]:
pd.Series(y_valid).value_counts()

0    75380
1      689
dtype: int64

In [37]:
def score(model, X, y, train_size=0.7, random_states=[1, 13, 42]):
    result = []
    
    for rs in random_states:
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=train_size, stratify=y, random_state=rs)
        m = clone(model, safe=True)
        m.fit(X_train, y_train)
        valid_score = m.predict_proba(X_valid)
        result.append(roc_auc_score(y_valid, valid_score[:, 1:]))
        
    return result

In [32]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X_train, y_train, X_valid, y_valid)

0.988449523864
Wall time: 25.8 s


In [38]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X_tmp_train, y)

Wall time: 1min 20s


[0.98693736928830067, 0.98689524117957172, 0.98942301434704705]

In [41]:
tmp = train_df[time_cols].head(3)

In [44]:
tmp

Unnamed: 0_level_0,time1,time2,time3,time4,time5,time6,time7,time8,time9,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2014-02-20 10:02:45,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
2,2014-02-22 11:19:50,2014-02-22 11:19:50,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:52,2014-02-22 11:19:52,2014-02-22 11:20:15,2014-02-22 11:20:16
3,2013-12-16 16:40:17,2013-12-16 16:40:18,2013-12-16 16:40:19,2013-12-16 16:40:19,2013-12-16 16:40:19,2013-12-16 16:40:19,2013-12-16 16:40:20,2013-12-16 16:40:21,2013-12-16 16:40:22,2013-12-16 16:40:24


In [46]:
def extract_year_month(data):
    time = time_cols[0]
    values = [row[time].year * 100 + row[time].month for _, row in tqdm(data.iterrows())]
    series = pd.Series(values)
    return csr_matrix(pd.get_dummies(series))

In [47]:
X_train_year_month = extract_year_month(train_df[time_cols])

253561it [00:21, 11855.27it/s]


In [48]:
X = hstack((X_train_sparse, X_train_time_features, X_Train_unique, X_train_year_month))

In [49]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X, y)

Wall time: 1min 33s


[0.98709390563380672, 0.98705002539624109, 0.98953470774683538]

In [52]:
def extract_part_of_day(data):
    time = time_cols[0]
    values = [row[time].hour // 6 for _, row in tqdm(data.iterrows())]
    series = pd.Series(values)
    return csr_matrix(pd.get_dummies(series))

In [54]:
X_train_part_of_day = extract_part_of_day(train_df[time_cols])

253561it [00:17, 14888.16it/s]


In [55]:
X = hstack((X_train_sparse, X_train_time_features, X_Train_unique, X_train_part_of_day))

In [56]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X, y)

Wall time: 1min 29s


[0.98697543476862848, 0.98692448825322765, 0.98928734951427533]

In [60]:
def extract_weekend(data):
    time = time_cols[0]
    values = [[row[time].dayofweek > 4] for _, row in tqdm(data.iterrows())]
    return csr_matrix(values)

In [61]:
X_train_weekend = extract_weekend(train_df[time_cols])

253561it [00:17, 14645.33it/s]


In [62]:
X = hstack((X_train_sparse, X_train_time_features, X_Train_unique, X_train_weekend))

In [63]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X, y)

Wall time: 1min 21s


[0.98692604784043392, 0.98690698621902528, 0.98941088422433254]

In [64]:
tmp = train_df[time_cols].head(3)

In [65]:
tmp

Unnamed: 0_level_0,time1,time2,time3,time4,time5,time6,time7,time8,time9,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2014-02-20 10:02:45,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
2,2014-02-22 11:19:50,2014-02-22 11:19:50,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:52,2014-02-22 11:19:52,2014-02-22 11:20:15,2014-02-22 11:20:16
3,2013-12-16 16:40:17,2013-12-16 16:40:18,2013-12-16 16:40:19,2013-12-16 16:40:19,2013-12-16 16:40:19,2013-12-16 16:40:19,2013-12-16 16:40:20,2013-12-16 16:40:21,2013-12-16 16:40:22,2013-12-16 16:40:24


In [111]:
def extract_duration(data):
    values = []
    time = time_cols[0]

    for _, row in tqdm(data.iterrows()):

        first = row[time]
        last = first

        for t, check in zip(time_cols, row.values == np.datetime64('NaT')):
            if check:
                break
            else:
                last = row[t]

        values.append([np.log1p(last.minute - first.minute)])

    return csr_matrix(np.nan_to_num(values))

In [112]:
X_train_duration = extract_duration(train_df[time_cols])

253561it [01:24, 2983.45it/s]


In [113]:
X = hstack((X_train_sparse, X_train_time_features, X_Train_unique, X_train_duration))

In [114]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X, y)

Wall time: 1min 46s


[0.98703509379280441, 0.98684183975838335, 0.98953501581344416]

In [115]:
def extract_week(data):
    time = time_cols[0]
    values = []
    
    for _, row in tqdm(data.iterrows()):
        
        r = np.zeros(53)
        r[row[time].week] = 1
        values.append(r)
        
    return csr_matrix(values)

In [116]:
X_train_weeks = extract_week(train_df[time_cols])

253561it [00:33, 7527.85it/s]


In [117]:
X = hstack((X_train_sparse, X_train_time_features, X_Train_unique, X_train_weeks))

In [118]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X, y)

Wall time: 1min 14s


[0.98941709369191244, 0.99013409946931674, 0.99101444793886118]

In [119]:
X = hstack((X_train_sparse, 
            X_train_time_features, 
            X_Train_unique, 
            X_train_year_month, 
            X_train_part_of_day, 
            X_train_weekend,
            X_train_duration,
            X_train_weeks))

In [120]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X, y)

Wall time: 1min 16s


[0.98949889500358323, 0.99018428544527759, 0.99094715463904026]

In [121]:
# def split_train_and_test(data, train_size):
#     return X.tocsc()[train_size:], X.tocsc()[:train_size]

In [124]:
# y = train_df['target'].values.astype('int64')
train_test_sites_df = pd.concat([train_df[site_cols].fillna(0).astype('int'), test_df[site_cols].fillna(0).astype('int')])
train_test_times_df = pd.concat([train_df[time_cols], test_df[time_cols]])

In [125]:
X_tmp_sparse = get_dense_matrix(train_test_sites_df)
X_tmp_time_features = exptact_time_features(train_test_times_df)
X_tmp_unique = unique(train_test_sites_df)

X_tmp_year_month = extract_year_month(train_test_times_df)
X_tmp_part_of_day = extract_part_of_day(train_test_times_df)
X_tmp_weekend = extract_weekend(train_test_times_df)
X_tmp_duration = extract_duration(train_test_times_df)
X_tmp_weeks = extract_week(train_test_times_df)

100%|███████████████████████████████████████████████████████████████████████| 336358/336358 [00:14<00:00, 23861.57it/s]
336358it [00:32, 10480.73it/s]
336358it [00:39, 8479.11it/s] 
336358it [00:39, 8593.59it/s] 
336358it [00:29, 11327.54it/s]
336358it [00:24, 13507.23it/s]
336358it [01:36, 3499.74it/s]
336358it [00:33, 10110.49it/s]


In [136]:
def split_train_and_test(data, train_size):
    return data.tocsc()[:train_size], data.tocsc()[train_size:]

In [138]:
X_tmp.shape, X_train.shape, X_test.shape, train_df.shape[0]

((336358, 48499), (253561, 48499), (82797, 48499), 253561)

In [152]:
X_tmp = hstack((X_tmp_sparse, 
                X_tmp_time_features, 
                X_tmp_unique, 
                X_tmp_year_month, 
#                 X_tmp_part_of_day, 
#                 X_tmp_weekend,
#                 X_tmp_duration,
#                 X_tmp_weeks
               ))

In [153]:
y = train_df['target'].values.astype('int64')
X_train, X_test = split_train_and_test(X_tmp, train_df.shape[0])

In [154]:
%%time
make_submission(LogisticRegression(C=21.544347, n_jobs=-1), X_train, y, X_test)

(253561, 48441)
(82797, 48441)
Wall time: 44.3 s


In [155]:
str_train = [' '.join([id_sites_dict[idx] for idx in row.values if idx in id_sites_dict]) for _, row in tqdm(train_df[site_cols].iterrows())]
str_test = [' '.join([id_sites_dict[idx] for idx in row.values if idx in id_sites_dict]) for _, row in tqdm(test_df[site_cols].iterrows())]

253561it [00:12, 20328.86it/s]
82797it [00:04, 20555.77it/s]


In [183]:
%%time
tfidf = TfidfVectorizer(ngram_range = (1, 5)).fit(np.array(str_train))
X_train_idf = tfidf.transform(np.array(str_train))
X_test_idf = tfidf.transform(np.array(str_test))

Wall time: 2min 11s


In [184]:
X_tmp = hstack((vstack((X_train_idf, X_test_idf)), 
                X_tmp_time_features, 
                X_tmp_unique, 
                X_tmp_year_month, 
                X_tmp_part_of_day, 
                X_tmp_weekend,
                X_tmp_duration,
                X_tmp_weeks
               ))

In [185]:
y = train_df['target'].values.astype('int64')
X_train, X_test = split_train_and_test(X_tmp, train_df.shape[0])

In [161]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X_train, y)

Wall time: 1min 39s


[0.99107908416418244, 0.99274585929596759, 0.99099991104576679]

In [166]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X_train, y)

Wall time: 2min 59s


[0.99118047658674513, 0.99264331162362274, 0.99086363007977774]

In [171]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X_train, y)

Wall time: 4min 42s


[0.99070253049763157, 0.99234645093788942, 0.99042369170850275]

In [177]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X_train, y)

Wall time: 6min 52s


[0.99023887099749275, 0.99203732535030054, 0.98967216321676998]

In [182]:
%%time
make_submission(LogisticRegression(C=21.544347, n_jobs=-1), X_train, y, X_test)

(253561, 2836775)
(82797, 2836775)
Wall time: 4min 48s


In [186]:
%%time
make_submission(LogisticRegression(C=25, n_jobs=-1), X_train, y, X_test)

(253561, 1698311)
(82797, 1698311)
Wall time: 4min 14s


In [188]:
%%time
make_submission(LogisticRegression(C=19, n_jobs=-1), X_train, y, X_test)

(253561, 1698311)
(82797, 1698311)
Wall time: 3min 5s


0.95992

In [189]:
%%time
make_submission(LogisticRegression(C=15, n_jobs=-1), X_train, y, X_test)

(253561, 1698311)
(82797, 1698311)
Wall time: 3min 9s


 0.96001

In [190]:
%%time
make_submission(LogisticRegression(C=5, n_jobs=-1), X_train, y, X_test)

(253561, 1698311)
(82797, 1698311)
Wall time: 1min 52s


0.96000

In [187]:
X_tmp.shape

(336358, 1698311)