In [340]:
import warnings
import pickle

import pandas as pd
import numpy as np

from sklearn.linear_model import SGDClassifier, LogisticRegression
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import accuracy_score
from tqdm import tqdm 


# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the libraries used here.
warnings.filterwarnings('ignore')
# Directory prefix (with trailing slash) that is prepended to every data file name in this notebook.
PATH_TO_DATA = 'data/' 

In [3]:
# Column names 'time1'..'time10' and 'site1'..'site10'.
time_cols = ['time%d' % i for i in range(1, 11)]
site_cols = ['site%d' % i for i in range(1, 11)]

# Training sessions, indexed by session_id, with the ten time columns parsed as datetimes.
train_df = pd.read_csv(PATH_TO_DATA +'train_sessions_400users.csv', index_col='session_id', parse_dates=time_cols)

# NOTE(review): pickle.load can execute arbitrary code — only load site_dic.pkl from a trusted source.
with open(PATH_TO_DATA + 'site_dic.pkl', 'rb') as site_file:
     sites_dict = pickle.load(site_file)
        
# Inverse of sites_dict: its values become the keys and its keys the values.
# presumably sites_dict maps site name -> integer site id, so this maps id -> name — TODO confirm
id_sites_dict = {v: k for k, v in sites_dict.items()}

In [346]:
def write_to_submission_file(predicted_labels, out_file, target='user_id', index_label="session_id"):
    """Save predictions as a CSV submission file inside ``PATH_TO_DATA``.

    Parameters
    ----------
    predicted_labels : array-like with a ``.shape`` attribute
        One predicted label per row.
    out_file : str
        File name, appended to ``PATH_TO_DATA``.
    target : str
        Name of the single column holding the predictions.
    index_label : str
        Header written for the index column.

    The index written to the file is 1, 2, ..., number of predictions.
    """
    n_predictions = predicted_labels.shape[0]
    row_numbers = np.arange(1, n_predictions + 1)
    submission = pd.DataFrame(predicted_labels, index=row_numbers, columns=[target])
    destination = PATH_TO_DATA + out_file
    submission.to_csv(destination, index_label=index_label)

In [323]:
def split_data(X_data, y_data, train_share=0.7):
    """Split rows into train/validation parts separately for every label.

    For each distinct value in ``y_data`` the first ``int(n * train_share)``
    rows carrying that label (in their original row order) go to the training
    part and the remaining rows of that label go to the validation part.
    Output rows are ordered by label (ascending), then by original position.

    The split is computed from ``y_data`` itself. The previous version ignored
    ``y_data`` and grouped a notebook-global ``new_data`` frame, converting its
    index to row positions with ``- 1``; that only worked when the global
    happened to be aligned with ``X_data`` and had a contiguous 1-based index.

    Parameters
    ----------
    X_data : scipy sparse matrix
        Feature matrix with one row per sample.
    y_data : array-like, shape (n_samples,)
        Label of every row of ``X_data``.
    train_share : float, default 0.7
        Fraction of each label's rows assigned to the training part.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_valid, y_valid)``. The matrices are row
        selections of ``X_data.tocsc()``; the label arrays are numpy arrays.

    Raises
    ------
    ValueError
        If ``X_data`` and ``y_data`` have a different number of rows.
    """
    y_data = np.asarray(y_data)
    n_rows = y_data.shape[0]
    if X_data.shape[0] != n_rows:
        raise ValueError(
            'X_data has %d rows but y_data has %d labels' % (X_data.shape[0], n_rows)
        )

    # Integer code of every row's label plus the number of rows per label.
    _, label_codes, label_sizes = np.unique(y_data, return_inverse=True, return_counts=True)
    label_codes = label_codes.ravel()

    # Stable sort: rows become grouped by label while keeping their original
    # relative order inside each label.
    order = np.argsort(label_codes, kind='mergesort')

    # Position of every (sorted) row inside its own label group: 0, 1, 2, ...
    group_starts = np.cumsum(label_sizes) - label_sizes
    rank_in_group = np.arange(n_rows) - np.repeat(group_starts, label_sizes)

    # Same truncation as int(size * train_share) applied per label.
    train_sizes = (label_sizes * train_share).astype(int)
    in_train = rank_in_group < np.repeat(train_sizes, label_sizes)

    train_ids = order[in_train]
    valid_ids = order[~in_train]

    # Convert once instead of once per returned matrix.
    X_csc = X_data.tocsc()
    return X_csc[train_ids], y_data[train_ids], X_csc[valid_ids], y_data[valid_ids]

In [22]:
def extract_time_features(data, time_col=None):
    """Build sparse hour-of-day features from the session start time.

    Every row of the result has 26 columns:

    * columns 0..23 - one-hot encoding of the start hour (hour ``h`` -> column ``h``);
    * column 24     - 1 when the hour is before 11 (morning flag);
    * column 25     - 1 when the hour is after 19 (evening flag).

    Fixes relative to the previous version: the row was appended through an
    undefined name (``sh``), which raised ``NameError``; and the hour was
    written to index ``hour - 1``, so hour 0 landed on index -1 (the evening
    slot, which was then overwritten) and column 23 was never used.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the datetime column ``time_col``.
    time_col : str, optional
        Name of the datetime column to use. Defaults to ``time_cols[0]``,
        the first time column defined in this notebook.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(len(data), 26)``.

    Raises
    ------
    ValueError
        If the time column contains missing timestamps.
    """
    row_size = 26
    morning_offset = 24
    evening_offset = 25

    if time_col is None:
        time_col = time_cols[0]

    hours = data[time_col].dt.hour
    if hours.isnull().any():
        raise ValueError('column %r contains missing timestamps' % (time_col,))
    hours = hours.values.astype(int)
    n_rows = hours.shape[0]

    features = np.zeros((n_rows, row_size))
    # One-hot hour: exactly one of the first 24 columns is set per row.
    features[np.arange(n_rows), hours] = 1
    features[:, morning_offset] = hours < 11
    features[:, evening_offset] = hours > 19

    return csr_matrix(features)

In [357]:
def get_dense_matrix(matrix):
    """Count site occurrences per row and return them as a sparse matrix.

    Despite the name, the result is a ``scipy.sparse.csr_matrix``. Row ``r``,
    column ``c`` holds how many times the value ``c + 1`` occurs in row ``r``
    of ``matrix``; the value 0 is skipped. The number of columns equals the
    number of entries in the notebook-level ``id_sites_dict``.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Frame whose ``.values`` rows contain site ids.
        # assumes ids are integers in 1..len(id_sites_dict) with 0 as padding — TODO confirm
    """
    n_sites = len(id_sites_dict)
    sessions = matrix.values

    visit_counts = []
    row_positions = []
    col_positions = []

    for row_idx, session in enumerate(tqdm(sessions)):
        site_values, occurrences = np.unique(session, return_counts=True)
        for site_id, n_visits in zip(site_values, occurrences):
            if site_id != 0:
                visit_counts.append(n_visits)
                row_positions.append(row_idx)
                col_positions.append(site_id - 1)

    return csr_matrix(
        (visit_counts, (row_positions, col_positions)),
        shape=(sessions.shape[0], n_sites),
    )

In [53]:
def score(model, X_train, y_train, X_valid, y_valid):
    """Fit ``model`` on the training split and print its validation accuracy.

    The model is fitted in place; nothing is returned.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, predictions)
    print(accuracy)

In [361]:
# Append the time features column-wise to the existing feature matrix, then split into train/validation.
# NOTE(review): X_train_idf and new_data are not defined in any cell shown in this notebook —
# they must come from cells that were removed or run elsewhere; confirm before Restart & Run All.
X_tmp = hstack((X_train_idf, extract_time_features(train_df[time_cols])))
X_train, y_train, X_valid, y_valid = split_data(X_tmp, new_data['user_id'].values.astype('int64'))

# Show the shapes of the four resulting pieces.
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

100%|███████████████████████████████████████████████████████████████████| 400/400 [00:00<00:00, 553.71it/s]


((378287, 36682), (378287,), (162387, 36682), (162387,))

In [362]:
%%time
# Fit an SGD classifier with logistic loss on the training split and print its validation accuracy.
# NOTE(review): loss='log' was renamed to 'log_loss' in newer scikit-learn releases — confirm against the installed version.
score(SGDClassifier(random_state=17, n_jobs=-1, loss='log'), X_train, y_train, X_valid, y_valid)

0.256110402926
Wall time: 1min 45s


In [338]:
# Use the whole feature matrix and all labels (no train/validation split) for the final fit.
# NOTE(review): new_data is not defined in the visible cells.
X = X_tmp
y = new_data['user_id'].values.astype('int64')

In [341]:
%%time
# Fit logistic regression on all rows of X / y; the recorded wall time for this cell is 7h 15min.
# presumably C=2.11111111111 comes from a hyper-parameter search that is not shown — TODO confirm
logit = LogisticRegression(C=2.11111111111, n_jobs=-1)
logit.fit(X, y)

Wall time: 7h 15min


In [342]:
# Load the test sessions with the same options as the training sessions and
# append the time features column-wise to the test feature matrix.
# NOTE(review): X_test_idf is not defined in the visible cells.
test_df = pd.read_csv(PATH_TO_DATA + 'test_sessions_400users.csv', index_col='session_id', parse_dates=time_cols)
X_test = hstack((X_test_idf, extract_time_features(test_df[time_cols])))

In [345]:
# Predict a label for every test row and write the predictions to results.csv under PATH_TO_DATA.
logit_test_pred = logit.predict(X_test)
write_to_submission_file(logit_test_pred.astype(int), 'results.csv')

In [347]:
# NOTE(review): repeats the write from the previous cell with the same arguments, overwriting the same results.csv.
write_to_submission_file(logit_test_pred.astype(int), 'results.csv')

The best result I was able to get is 0.20218, which is 9th place out of 119. Not bad, but we will see how it holds up on the private leaderboard.