In [1]:
import warnings
import pickle

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from __future__ import division, print_function
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import roc_auc_score
from tqdm import tqdm 


warnings.filterwarnings('ignore')
PATH_TO_DATA = 'data/' 

In [2]:
time_cols = ['time%d' % i for i in range(1, 11)]
site_cols = ['site%d' % i for i in range(1, 11)]

train_df = pd.read_csv(PATH_TO_DATA + 'train_sessions.csv', index_col='session_id', parse_dates=time_cols)
test_df = pd.read_csv(PATH_TO_DATA + 'test_sessions.csv', index_col='session_id', parse_dates=time_cols)

with open(PATH_TO_DATA + 'site_dic.pkl', 'rb') as site_file:
     sites_dict = pickle.load(site_file)
        
id_sites_dict = {v: k for k, v in sites_dict.items()}

In [3]:
def split_data(X_data, y_data):
    grouped = train_df[['target']].groupby(by='target')
    
    train_ids = []
    valid_ids = []
    
    for g in tqdm(grouped.groups.keys()):
        train_shape = int(grouped.get_group(g).shape[0] * 0.7)

        ids_to_train = grouped.get_group(g).index[:train_shape]
        ids_to_valid = grouped.get_group(g).index[train_shape:]

        train_ids.extend(ids_to_train)
        valid_ids.extend(ids_to_valid)
        
    train_ids = np.array(train_ids) - 1
    valid_ids = np.array(valid_ids) - 1
        
    return X_data.tocsc()[train_ids], y_data[train_ids], X_data.tocsc()[valid_ids], y_data[valid_ids]

In [4]:
def get_dense_matrix(matrix):
    site_ids = list(id_sites_dict)
    X = matrix.values
    
    i = 0
    data = list()
    col = list()
    rows = list()
    for row in tqdm(X):
        unique, counts = np.unique(row, return_counts=True)
        dic = dict(zip(unique, counts))
        for k in dic:
            if k != 0:
                data.append(dic[k])
                rows.append(i)
                col.append(k-1)
            
        i += 1
    X_sparse = csr_matrix((data, (rows, col)), shape=(X.shape[0], len(site_ids)))
    return X_sparse

In [5]:
def score(model, X_train, y_train, X_valid, y_valid):
    model.fit(X_train, y_train)
    valid_score = model.predict_proba(X_valid)
    print(roc_auc_score(y_valid, valid_score[:, 1:]))

In [6]:
def write_to_submission_file(predicted_labels, out_file, target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels, index = np.arange(1, predicted_labels.shape[0] + 1), columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [7]:
def make_submission(model, X_train, y_train, X_test):
    print(X_train.shape)
    print(X_test.shape)
    model.fit(X_train, y_train)
    test_pred_proba = model.predict_proba(X_test)
    write_to_submission_file(test_pred_proba[:, 1:], 'result.csv')

In [8]:
def exptact_time_features(data):

    day_offset = 24
    month_offset = day_offset + 7
    morning_offset = month_offset + 12
    evening_offset = morning_offset + 1
    row_size = evening_offset + 2
    values = []

    for _, row in tqdm(data.iterrows()):
        
        time = row[time_cols[0]]

        r = np.zeros(row_size)
        r[time.hour] += 1
        r[day_offset + time.dayofweek] += 1
        r[month_offset + time.month] += 1
        r[morning_offset] = time.hour < 11
        r[evening_offset] = time.hour > 19
        values.append(r[1:])
        
    return csr_matrix(values)

In [9]:
def unique(data):
    return csr_matrix([[sum(1 for s in np.unique(row.values) if s != 0)] for _, row in tqdm(data.iterrows())])

In [10]:
str_train = [' '.join([id_sites_dict[idx] for idx in row.values if idx in id_sites_dict]) for _, row in train_df[site_cols].iterrows()]
str_test = [' '.join([id_sites_dict[idx] for idx in row.values if idx in id_sites_dict]) for _, row in test_df[site_cols].iterrows()]

In [11]:
%%time
tfidf = TfidfVectorizer(ngram_range = (1, 2)).fit(np.array(str_train))
X_train_idf = tfidf.transform(np.array(str_train))
X_test_idf = tfidf.transform(np.array(str_test))

Wall time: 37.3 s


In [12]:
X_tmp_train = hstack((X_train_idf, 
                      exptact_time_features(train_df[time_cols]),
                      unique(train_df[site_cols].fillna(0).astype('int'))))

X_tmp_test = hstack((X_test_idf, 
                     exptact_time_features(test_df[time_cols]),
                     unique(test_df[site_cols].fillna(0).astype('int'))))

253561it [00:19, 13046.28it/s]
253561it [00:19, 13014.67it/s]
82797it [00:06, 12494.42it/s]
82797it [00:07, 11125.20it/s]


In [13]:
X_train, y_train, X_valid, y_valid = split_data(X_tmp_train, train_df['target'].values.astype('int64'))

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 39.19it/s]


((177491, 129384), (177491,), (76070, 129384), (76070,))

In [14]:
%%time
logit_c_values = np.logspace(-4, 2, 10)

skf = StratifiedKFold(n_splits=3, random_state=17)

logit_grid_searcher = LogisticRegressionCV(Cs=logit_c_values, cv=skf, n_jobs=-1)
logit_grid_searcher.fit(X_train, y_train)

Wall time: 4min 45s


In [15]:
logit_mean_cv_scores = next (iter (logit_grid_searcher.scores_.values())).mean(axis=0)
pd.Series(logit_mean_cv_scores, index=logit_grid_searcher.Cs_).sort_values(ascending=False)

21.544347     0.995014
100.000000    0.994929
4.641589      0.994867
1.000000      0.994011
0.215443      0.993081
0.046416      0.991966
0.010000      0.990946
0.002154      0.990946
0.000464      0.990946
0.000100      0.990946
dtype: float64

In [16]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X_train, y_train, X_valid, y_valid)

0.990094285571
Wall time: 28.6 s


In [17]:
%%time
y = train_df['target'].values.astype('int64')
make_submission(LogisticRegression(C=21.544347, n_jobs=-1), X_tmp_train, y, X_tmp_test)

(253561, 129384)
(82797, 129384)
Wall time: 52.6 s
