In [1]:
import warnings
import pickle

import pandas as pd
import numpy as np

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack, vstack
from sklearn.metrics import accuracy_score
from tqdm import tqdm 


warnings.filterwarnings('ignore')
PATH_TO_DATA = 'data/' 

In [2]:
time_cols = ['time%d' % i for i in range(1, 11)]
site_cols = ['site%d' % i for i in range(1, 11)]

train_df = pd.read_csv(PATH_TO_DATA +'train_sessions_400users.csv', index_col='session_id', parse_dates=time_cols)
test_df = pd.read_csv(PATH_TO_DATA + 'test_sessions_400users.csv', index_col='session_id', parse_dates=time_cols)

with open(PATH_TO_DATA + 'site_dic.pkl', 'rb') as site_file:
     sites_dict = pickle.load(site_file)
        
id_sites_dict = {v: k for k, v in sites_dict.items()}

In [3]:
def write_to_submission_file(predicted_labels, out_file, target='user_id', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels, index = np.arange(1, predicted_labels.shape[0] + 1), columns=[target])
    predicted_df.to_csv(PATH_TO_DATA + out_file, index_label=index_label)

In [4]:
def split_data(X_data, y_data):
    grouped = train_df[['user_id']].groupby(by='user_id')
    
    train_ids = []
    valid_ids = []
    
    for g in tqdm(grouped.groups.keys()):
        train_shape = int(grouped.get_group(g).shape[0] * 0.7)

        ids_to_train = grouped.get_group(g).index[:train_shape]
        ids_to_valid = grouped.get_group(g).index[train_shape:]

        train_ids.extend(ids_to_train)
        valid_ids.extend(ids_to_valid)
        
    train_ids = np.array(train_ids) - 1
    valid_ids = np.array(valid_ids) - 1
        
    return X_data.tocsc()[train_ids], y_data[train_ids], X_data.tocsc()[valid_ids], y_data[valid_ids]

In [16]:
def extract_time_features(data):
#     row_size = 26
#     morning_offset = 24
#     evening_offset = 25
    values = []
    
    for _, row in tqdm(data.iterrows()):
        
        time = row[time_cols[0]]
    
        r = np.zeros(24)
        r[time.hour - 1] += 1
#         r[morning_offset] = time.hour < 11
#         r[evening_offset] = time.hour > 19
        values.append(r)
        
    return csr_matrix(values)

In [6]:
def get_dense_matrix(matrix):
    site_ids = list(id_sites_dict)
    X = matrix.values
    
    i = 0
    data = list()
    col = list()
    rows = list()
    for row in tqdm(X):
        unique, counts = np.unique(row, return_counts=True)
        dic = dict(zip(unique, counts))
        for k in dic:
            if (k == 0):
                continue
            
            data.append(dic[k])
            rows.append(i)
            col.append(k-1)
            
        i += 1
    X_sparse = csr_matrix((data, (rows, col)), shape=(X.shape[0], len(site_ids)))
    return X_sparse

In [7]:
def score(model, X_train, y_train, X_valid, y_valid):
    model.fit(X_train, y_train)
    valid_score = model.predict(X_valid)
    print(accuracy_score(y_valid, valid_score))

In [8]:
def extract_year_month(data):
    time = time_cols[0]
    values = [row[time].year * 100 + row[time].month for _, row in tqdm(data.iterrows())]
    series = pd.Series(values)
    return csr_matrix(pd.get_dummies(series))

In [9]:
def extract_part_of_day(data):
    time = time_cols[0]
    values = [row[time].hour // 6 for _, row in tqdm(data.iterrows())]
    series = pd.Series(values)
    return csr_matrix(pd.get_dummies(series))

In [10]:
def extract_weekend(data):
    time = time_cols[0]
    values = [[row[time].dayofweek > 4] for _, row in tqdm(data.iterrows())]
    return csr_matrix(values)

In [11]:
def extract_duration(data):
    values = []
    time = time_cols[0]

    for _, row in tqdm(data.iterrows()):

        first = row[time]
        last = first

        for t, check in zip(time_cols, row.values == np.datetime64('NaT')):
            if check:
                break
            else:
                last = row[t]

        values.append([np.log1p(last.minute - first.minute)])

    return csr_matrix(np.nan_to_num(values))

In [12]:
def extract_week(data):
    time = time_cols[0]
    values = []
    
    for _, row in tqdm(data.iterrows()):
        
        r = np.zeros(53)
        r[row[time].week] = 1
        values.append(r)
        
    return csr_matrix(values)

In [13]:
def split_train_and_test(data, train_size):
    return data.tocsc()[:train_size], data.tocsc()[train_size:]

In [14]:
train_test_sites_df = pd.concat([train_df[site_cols].fillna(0).astype('int'), test_df[site_cols].fillna(0).astype('int')])
train_test_times_df = pd.concat([train_df[time_cols], test_df[time_cols]])

In [18]:
X_tmp_sparse = get_dense_matrix(train_test_sites_df)
X_tmp_time_features = extract_time_features(train_test_times_df)
# X_tmp_unique = unique(train_test_sites_df)

X_tmp_year_month = extract_year_month(train_test_times_df)
X_tmp_part_of_day = extract_part_of_day(train_test_times_df)
X_tmp_weekend = extract_weekend(train_test_times_df)
X_tmp_duration = extract_duration(train_test_times_df)
X_tmp_weeks = extract_week(train_test_times_df)

100%|███████████████████████████████████████████████████████████████████████| 229266/229266 [00:07<00:00, 30830.70it/s]
229266it [00:15, 15027.50it/s]
229266it [00:18, 12133.23it/s]
229266it [00:14, 16107.96it/s]
229266it [00:15, 14983.87it/s]
229266it [01:02, 3651.85it/s]
229266it [00:23, 9725.33it/s] 


In [19]:
X_tmp_sparse = hstack((X_tmp_sparse, 
                       X_tmp_time_features, 
#                      X_tmp_unique, 
                       X_tmp_year_month, 
                       X_tmp_part_of_day, 
                       X_tmp_weekend,
                       X_tmp_duration,
                       X_tmp_weeks
               ))

In [21]:
y = train_df['user_id'].values.astype('int64')
X_train_sparse, X_test_sparse = split_train_and_test(X_tmp_sparse, train_df.shape[0])

In [30]:
%%time
logit = LogisticRegression(C=2.11111111111, n_jobs=-1)
logit.fit(X_train_sparse, y)

Wall time: 1h 4min 52s


In [31]:
logit_test_pred = logit.predict(X_test_sparse)
write_to_submission_file(logit_test_pred.astype(int), 'results1.csv')

In [22]:
str_train = [' '.join([id_sites_dict[idx] for idx in row.values if idx in id_sites_dict]) for _, row in tqdm(train_df[site_cols].iterrows())]
str_test = [' '.join([id_sites_dict[idx] for idx in row.values if idx in id_sites_dict]) for _, row in tqdm(test_df[site_cols].iterrows())]

182793it [00:07, 25877.78it/s]
46473it [00:02, 21470.21it/s]


In [25]:
%%time
tfidf = TfidfVectorizer(ngram_range = (1, 3)).fit(np.array(str_train))
X_train_idf = tfidf.transform(np.array(str_train))
X_test_idf = tfidf.transform(np.array(str_test))

Wall time: 44.9 s


In [26]:
X_tmp_idf = hstack((vstack((X_train_idf, X_test_idf)), 
                            X_tmp_time_features, 
#                             X_tmp_unique, 
                            X_tmp_year_month, 
                            X_tmp_part_of_day, 
                            X_tmp_weekend,
                            X_tmp_duration,
                            X_tmp_weeks
                           ))

In [27]:
y = train_df['user_id'].values.astype('int64')
X_train_idf, X_test_idf = split_train_and_test(X_tmp_idf, train_df.shape[0])

In [32]:
%%time
logit_idf = LogisticRegression(C=2.11111111111, n_jobs=-1)
logit_idf.fit(X_train_idf, y)

Wall time: 2h 3min 20s


In [33]:
logit_test_pred = logit_idf.predict(X_test_idf)
write_to_submission_file(logit_test_pred.astype(int), 'results2.csv')

In [29]:
X_train_sparse.shape, X_train_idf.shape, y.shape

((182793, 36762), (182793, 308860), (182793,))

In [None]:
X_train_sparse = get_dense_matrix(train_df[site_cols].fillna(0).astype('int'))

In [361]:
X_tmp = hstack((X_train_sparse, extract_time_features(train_df[time_cols])))
X_train, y_train, X_valid, y_valid = split_data(X_tmp, train_df['user_id'].values.astype('int64'))

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

100%|███████████████████████████████████████████████████████████████████| 400/400 [00:00<00:00, 553.71it/s]


((378287, 36682), (378287,), (162387, 36682), (162387,))

In [362]:
%%time
score(SGDClassifier(random_state=17, n_jobs=-1, loss='log'), X_train, y_train, X_valid, y_valid)

0.256110402926
Wall time: 1min 45s


In [338]:
X = X_tmp
y = new_data['user_id'].values.astype('int64')

In [341]:
%%time
logit = LogisticRegression(C=2.11111111111, n_jobs=-1)
logit.fit(X, y)

Wall time: 7h 15min


In [342]:
X_test = hstack((X_test_idf, extract_time_features(test_df[time_cols])))

In [345]:
logit_test_pred = logit.predict(X_test)
write_to_submission_file(logit_test_pred.astype(int), 'results.csv')

In [347]:
write_to_submission_file(logit_test_pred.astype(int), 'results.csv')

The best result which I can get is 0.20218 that is the 9 place out of 119. Not bad, but will see how it will go on private leaderboard. 