In [1]:
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, cross_val_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
train_df = pd.read_csv('c:\\Users\\Vladislav.Klochkov\\Desktop\\mlcourse.ai-master\\data\\train_sessions.csv', 
                       index_col='session_id')
test_df = pd.read_csv('c:\\Users\\Vladislav.Klochkov\\Desktop\\mlcourse.ai-master\\data\\test_sessions.csv', 
                       index_col='session_id')
times = ['time%s' % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

train_df = train_df.sort_values('time1')
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [3]:
train_df.shape, test_df.shape

((253561, 21), (82797, 20))

In [4]:
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites] = train_df[sites].fillna(0).astype('int')
test_df[sites] = test_df[sites].fillna(0).astype('int')

In [5]:
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55,2013-01-12 08:05:57,0,NaT,0,NaT,0,NaT,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0
54843,56,2013-01-12 08:37:23,55,2013-01-12 08:37:23,56,2013-01-12 09:07:07,55,2013-01-12 09:07:09,0,NaT,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0
77292,946,2013-01-12 08:50:13,946,2013-01-12 08:50:14,951,2013-01-12 08:50:15,946,2013-01-12 08:50:15,946,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948,2013-01-12 08:50:16,784,2013-01-12 08:50:16,949,2013-01-12 08:50:17,946,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948,2013-01-12 08:50:17,949,2013-01-12 08:50:18,948,2013-01-12 08:50:18,945,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947,2013-01-12 08:50:19,945,2013-01-12 08:50:19,946,2013-01-12 08:50:19,946,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950,2013-01-12 08:50:20,948,2013-01-12 08:50:20,947,2013-01-12 08:50:21,950,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946,2013-01-12 08:50:21,951,2013-01-12 08:50:22,946,2013-01-12 08:50:22,947,2013-01-12 08:50:22,0


In [6]:
with open('c:\\Users\\Vladislav.Klochkov\\Desktop\\mlcourse.ai-master\\data\\site_dic.pkl', 'rb') as input_file:
    site_dict = pickle.load(input_file)
    
sites_dist_df = pd.DataFrame(list(site_dict.keys()), index=list(site_dict.values()), columns=['site'])
print('всего сайтов: ', sites_dist_df.shape[0])
sites_dist_df.head()

всего сайтов:  48371


Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


In [7]:
y = train_df['target']
full_df = pd.concat([train_df.drop('target', axis=1), test_df])
idx_split = train_df.shape[0]

In [8]:
full_df.shape

(336358, 20)

In [9]:
full_sites = full_df[sites]
full_sites.head()

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,56,55,0,0,0,0,0,0,0,0
54843,56,55,56,55,0,0,0,0,0,0
77292,946,946,951,946,946,945,948,784,949,946
114021,945,948,949,948,945,946,947,945,946,946
146670,947,950,948,947,950,952,946,951,946,947


In [10]:
sites_flatten = full_sites.values.flatten()

In [11]:
sites_flatten

array([  56,   55,    0, ..., 1098, 1098, 1098])

In [12]:
full_sites_sparse = csr_matrix(([1] * sites_flatten.shape[0], sites_flatten, range(0, sites_flatten.shape[0] + 10, 10)))[:, 1:]

In [26]:
print(len(np.array([1] * sites_flatten.shape[0])))
print(len(sites_flatten))
print(len(np.array(range(0, sites_flatten.shape[0] + 10, 10))))

3363580
3363580
336359


In [27]:
print(full_sites_sparse)

  (0, 55)	1
  (0, 54)	1
  (1, 55)	1
  (1, 54)	1
  (1, 55)	1
  (1, 54)	1
  (2, 945)	1
  (2, 945)	1
  (2, 950)	1
  (2, 945)	1
  (2, 945)	1
  (2, 944)	1
  (2, 947)	1
  (2, 783)	1
  (2, 948)	1
  (2, 945)	1
  (3, 944)	1
  (3, 947)	1
  (3, 948)	1
  (3, 947)	1
  (3, 944)	1
  (3, 945)	1
  (3, 946)	1
  (3, 944)	1
  (3, 945)	1
  :	:
  (336355, 6778)	1
  (336355, 29)	1
  (336355, 20)	1
  (336355, 22)	1
  (336355, 6779)	1
  (336356, 5827)	1
  (336356, 22)	1
  (336356, 20)	1
  (336356, 803)	1
  (336356, 20)	1
  (336356, 3349)	1
  (336356, 22)	1
  (336356, 893)	1
  (336356, 20)	1
  (336356, 960)	1
  (336357, 20)	1
  (336357, 1097)	1
  (336357, 1097)	1
  (336357, 1097)	1
  (336357, 1097)	1
  (336357, 1097)	1
  (336357, 1097)	1
  (336357, 1097)	1
  (336357, 1097)	1
  (336357, 1097)	1


In [529]:
def get_auc_lr_valid(X, y, C=1.0, ration=0.9, seed=17):
    train_len = int(ration * X.shape[0])
    
    X_train = X[:train_len, :]
    X_test = X[train_len:, :]
    y_train = y[:train_len]
    y_test = y[train_len:]
    
    logit = LogisticRegression(n_jobs=-1, C=C, random_state=17)
    logit.fit(X_train, y_train)

    train_pred = logit.predict_proba(X_test)[:, 1]
    
    return roc_auc_score(y_test, train_pred)

In [487]:
X_train_sparce = full_sites_sparse[:idx_split]

In [530]:
%%time
get_auc_lr_valid(X_train_sparce, y)

Wall time: 3.9 s


0.9195295772453744

In [490]:
def write_to_file(predict_labels, out_file, target='target', index_label='session_id'):
    predict_df = pd.DataFrame(predict_labels, 
                              index=np.arange(1, predict_labels.shape[0] + 1), 
                              columns=[target])
    predict_df.to_csv(out_file, index_label=index_label)

In [491]:
logit = LogisticRegression(n_jobs=-1, C=1.0, random_state=17)
logit.fit(X_train_sparce, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [492]:
X_test_sparse = full_sites_sparse[idx_split:]
predict = logit.predict_proba(X_test_sparse)[:, 1]
write_to_file(predict, 'c:\\Users\\Vladislav.Klochkov\\Desktop\\mlcourse.ai-master\\data\\sessions_out.csv')

### Улучшение модели, построение новых признаков

In [493]:
new_feat_train = pd.DataFrame(index=train_df.index)
new_feat_train['year_month'] = train_df['time1'].apply(lambda ts: 100 * ts.year + ts.month)
new_feat_test = pd.DataFrame(index=test_df.index)
new_feat_test['year_month'] = test_df['time1'].dt.year * 100 + test_df['time1'].dt.month

In [494]:
scaler = StandardScaler()
scaler.fit(new_feat_train['year_month'].values.reshape(-1, 1))
new_feat_train['year_month_scaler'] = scaler.transform(new_feat_train['year_month'].values.reshape(-1, 1))

scaler.fit(new_feat_test['year_month'].values.reshape(-1, 1))
new_feat_test['year_month_scaler'] = scaler.transform(new_feat_test['year_month'].values.reshape(-1, 1))

In [495]:
X_train_sparse_new = csr_matrix(hstack([X_train_sparce, 
                             new_feat_train['year_month_scaler'].values.reshape(-1, 1)]))

In [496]:
%%time
get_auc_lr_valid(X_train_sparse_new, y)

Wall time: 3.86 s


0.9196930599986233

In [497]:
logit.fit(X_train_sparse_new, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [498]:
X_test_sparse_new = csr_matrix(hstack([X_test_sparce, 
                             new_feat_test['year_month_scaler'].values.reshape(-1, 1)]))
predict = logit.predict_proba(X_test_sparse_new)[:, 1]
write_to_file(predict, 'c:\\Users\\Vladislav.Klochkov\\Desktop\\mlcourse.ai-master\\data\\sessions_out_logit2.csv')

#### Добавьте два новых признака: start_hour и morning.

In [499]:
new_feat_train['start_hour'] = train_df['time1'].apply(lambda ts: ts.hour)
new_feat_train['start_month'] = train_df['time1'].apply(lambda ts: ts.month)
new_feat_train['morning'] = new_feat_train['start_hour'].apply(lambda hour: 1 if hour <= 11 else 0)

new_feat_test['start_hour'] = test_df['time1'].dt.hour
new_feat_test['start_month'] = test_df['time1'].dt.month
new_feat_test['morning'] = new_feat_test['start_hour'].apply(lambda hour: 1 if hour <= 11 else 0)

In [500]:
scaler.fit(new_feat_train['start_hour'].values.reshape(-1, 1))
new_feat_train['start_hour_scaler'] = scaler.transform(new_feat_train['start_hour'].values.reshape(-1, 1))
scaler.fit(new_feat_train['start_month'].values.reshape(-1, 1))
new_feat_train['start_month_scaler'] = scaler.transform(new_feat_train['start_month'].values.reshape(-1, 1))
scaler.fit(new_feat_train['morning'].values.reshape(-1, 1))
new_feat_train['morning_scaler'] = scaler.transform(new_feat_train['morning'].values.reshape(-1, 1))

scaler.fit(new_feat_test['start_hour'].values.reshape(-1, 1))
new_feat_test['start_hour_scaler'] = scaler.transform(new_feat_test['start_hour'].values.reshape(-1, 1))
scaler.fit(new_feat_test['start_month'].values.reshape(-1, 1))
new_feat_test['start_month_scaler'] = scaler.transform(new_feat_test['start_month'].values.reshape(-1, 1))
scaler.fit(new_feat_test['morning'].values.reshape(-1, 1))
new_feat_test['morning_scaler'] = scaler.transform(new_feat_test['morning'].values.reshape(-1, 1))

In [501]:
X_train_sparse_new = csr_matrix(hstack([X_train_sparce, 
                             new_feat_train['start_month_scaler'].values.reshape(-1, 1)]))
X_train_sparse_new = csr_matrix(hstack([X_train_sparse_new, 
                             new_feat_train['start_hour_scaler'].values.reshape(-1, 1)]))

In [502]:
%%time
get_auc_lr_valid(X_train_sparse_new, y)

Wall time: 3.68 s


0.9571558197444899

In [503]:
X_train_sparse_new = csr_matrix(hstack([X_train_sparce, 
                             new_feat_train['start_month_scaler'].values.reshape(-1, 1)]))
X_train_sparse_new = csr_matrix(hstack([X_train_sparse_new, 
                             new_feat_train['morning_scaler'].values.reshape(-1, 1)]))

In [504]:
%%time
get_auc_lr_valid(X_train_sparse_new, y)

Wall time: 3.29 s


0.9482012519488111

In [505]:
X_train_sparse_new = csr_matrix(hstack([X_train_sparce, 
                             new_feat_train['start_month_scaler'].values.reshape(-1, 1)]))
X_train_sparse_new = csr_matrix(hstack([X_train_sparse_new, 
                             new_feat_train['morning_scaler'].values.reshape(-1, 1)]))
X_train_sparse_new = csr_matrix(hstack([X_train_sparse_new, 
                             new_feat_train['start_hour_scaler'].values.reshape(-1, 1)]))

In [528]:
%%time
get_auc_lr_valid(X_train_sparse_new, y)

Wall time: 2.38 s


0.9607264762960575

In [507]:
X_test_sparse_new = csr_matrix(hstack([X_test_sparce, 
                             new_feat_test['start_month_scaler'].values.reshape(-1, 1)]))
X_test_sparse_new = csr_matrix(hstack([X_test_sparse_new, 
                             new_feat_test['morning_scaler'].values.reshape(-1, 1)]))
X_test_sparse_new = csr_matrix(hstack([X_test_sparse_new, 
                             new_feat_test['start_hour_scaler'].values.reshape(-1, 1)]))

In [508]:
logit.fit(X_train_sparse_new, y)

predict = logit.predict_proba(X_test_sparse_new)[:, 1]
write_to_file(predict, 'c:\\Users\\Vladislav.Klochkov\\Desktop\\mlcourse.ai-master\\data\\sessions_out_logit3.csv')

In [509]:
X_test_sparse_new.shape, X_train_sparse_new.shape

((82797, 48374), (253561, 48374))

In [463]:
params_grid = {'C' : np.logspace(-3, 1, 10)}
grid = GridSearchCV(logit, params_grid, cv=5, n_jobs=-1)
grid.fit(X_train_sparse_new, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': array([1.00000e-03, 2.78256e-03, 7.74264e-03, 2.15443e-02, 5.99484e-02,
       1.66810e-01, 4.64159e-01, 1.29155e+00, 3.59381e+00, 1.00000e+01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [510]:
grid.best_params_

{'C': 0.001}

In [525]:
logit = LogisticRegression(C=0.001, n_jobs=-1, random_state=17)
logit.fit(X_train_sparse_new, y)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [512]:
predict = logit.predict_proba(X_test_sparse_new)[:, 1]
write_to_file(predict, 'c:\\Users\\Vladislav.Klochkov\\Desktop\\mlcourse.ai-master\\data\\sessions_out_logit3.csv')

In [521]:
params = []
result = []
for i in np.logspace(-3, 1, 10):

    train_len = int(0.9 * X_train_sparse_new.shape[0])
    
    X_train = X_train_sparse_new[:train_len, :]
    X_test = X_train_sparse_new[train_len:, :]
    y_train = y[:train_len]
    y_test = y[train_len:]
    
    logit = LogisticRegression(n_jobs=-1, C=i, random_state=17)
    logit.fit(X_train, y_train)

    train_pred = logit.predict_proba(X_test)[:, 1]
    
    res = roc_auc_score(y_test, train_pred)
    params.append(i)
    result.append(res)

In [524]:
print(params)
print(result)

[0.001, 0.0027825594022071257, 0.007742636826811269, 0.021544346900318832, 0.05994842503189409, 0.1668100537200059, 0.46415888336127775, 1.2915496650148828, 3.593813663804626, 10.0]
[0.8190159123213162, 0.8957411006812234, 0.9389828172531516, 0.9561723570137272, 0.9603342988325851, 0.9607264762960575, 0.9596231564036329, 0.9577774466862061, 0.954776941897354, 0.9502130106307567]


In [531]:
logit = LogisticRegression(C=0.96072, n_jobs=-1, random_state=17)
logit.fit(X_train_sparse_new, y)

LogisticRegression(C=0.96072, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=17,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [532]:
predict = logit.predict_proba(X_test_sparse_new)[:, 1]
write_to_file(predict, 'c:\\Users\\Vladislav.Klochkov\\Desktop\\mlcourse.ai-master\\data\\sessions_out_logit4.csv')