# Classification with time

In [3]:
import pickle

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm_notebook

%matplotlib inline
import seaborn as sns
from matplotlib import pyplot as plt

In [4]:
train_df = pd.read_csv("train_sessions.csv", index_col="session_id")
test_df = pd.read_csv("test_sessions.csv", index_col="session_id")

In [5]:
times = ["time%s" % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

In [6]:
train_df = train_df.sort_values(by="time1")

In [329]:
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55,2013-01-12 08:05:57,0,NaT,0,NaT,0,NaT,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0
54843,56,2013-01-12 08:37:23,55,2013-01-12 08:37:23,56,2013-01-12 09:07:07,55,2013-01-12 09:07:09,0,NaT,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0
77292,946,2013-01-12 08:50:13,946,2013-01-12 08:50:14,951,2013-01-12 08:50:15,946,2013-01-12 08:50:15,946,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948,2013-01-12 08:50:16,784,2013-01-12 08:50:16,949,2013-01-12 08:50:17,946,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948,2013-01-12 08:50:17,949,2013-01-12 08:50:18,948,2013-01-12 08:50:18,945,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947,2013-01-12 08:50:19,945,2013-01-12 08:50:19,946,2013-01-12 08:50:19,946,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950,2013-01-12 08:50:20,948,2013-01-12 08:50:20,947,2013-01-12 08:50:21,950,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946,2013-01-12 08:50:21,951,2013-01-12 08:50:22,946,2013-01-12 08:50:22,947,2013-01-12 08:50:22,0


In [8]:
sites = ["site%s" % i for i in range(1, 11)]
train_df[sites] = train_df[sites].fillna(0).astype("int")
test_df[sites] = test_df[sites].fillna(0).astype("int")

with open(r"site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)

sites_dict_df = pd.DataFrame(
    list(site_dict.keys()), index=list(site_dict.values()), columns=["site"]
)
print(u"всего сайтов:", sites_dict_df.shape[0])
sites_dict_df.head()

всего сайтов: 48371


Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


In [9]:
test_df.head(3)

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,site6,time6,site7,time7,site8,time8,site9,time9,site10,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,29,2014-10-04 11:19:53,35,2014-10-04 11:19:53,22,2014-10-04 11:19:54,321,2014-10-04 11:19:54,23,2014-10-04 11:19:54,2211,2014-10-04 11:19:54,6730,2014-10-04 11:19:54,21,2014-10-04 11:19:54,44582,2014-10-04 11:20:00,15336,2014-10-04 11:20:00
2,782,2014-07-03 11:00:28,782,2014-07-03 11:00:53,782,2014-07-03 11:00:58,782,2014-07-03 11:01:06,782,2014-07-03 11:01:09,782,2014-07-03 11:01:10,782,2014-07-03 11:01:23,782,2014-07-03 11:01:29,782,2014-07-03 11:01:30,782,2014-07-03 11:01:53
3,55,2014-12-05 15:55:12,55,2014-12-05 15:55:13,55,2014-12-05 15:55:14,55,2014-12-05 15:56:15,55,2014-12-05 15:56:16,55,2014-12-05 15:56:17,55,2014-12-05 15:56:18,55,2014-12-05 15:56:19,1445,2014-12-05 15:56:33,1445,2014-12-05 15:56:36


In [10]:
y_train = train_df["target"]

full_df = pd.concat([train_df.drop("target", axis=1), test_df])

idx_split = train_df.shape[0]

In [11]:
# табличка с индексами посещенных сайтов в сессии
full_sites = full_df[sites]
full_sites.head()

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,56,55,0,0,0,0,0,0,0,0
54843,56,55,56,55,0,0,0,0,0,0
77292,946,946,951,946,946,945,948,784,949,946
114021,945,948,949,948,945,946,947,945,946,946
146670,947,950,948,947,950,952,946,951,946,947


In [12]:
from scipy.sparse import csr_matrix

In [13]:
sites_flatten =  full_sites.values.flatten()

In [14]:
full_sites_sparse = csr_matrix(
    (
        [1] * sites_flatten.shape[0],
        sites_flatten,
        range(0, sites_flatten.shape[0] + 10, 10),
    )
)[:, 1:]

In [15]:
full_sites_sparse.shape

(336358, 48371)

In [16]:
X_train_sparse = full_sites_sparse[:idx_split]
X_test_sparse = full_sites_sparse[idx_split:]

In [17]:
X_train_sparse.shape

(253561, 48371)

In [18]:
X_test_sparse.shape

(82797, 48371)

In [28]:
X_train_sparse

<253561x48371 sparse matrix of type '<class 'numpy.intc'>'
	with 2412880 stored elements in Compressed Sparse Row format>

In [32]:
def get_auc_lr_valid(X, y, C=1.0, ratio=0.9, seed=17):
    train_len = int(ratio*X.shape[0])
    X_train = X[:train_len,:]
    X_valid = X[train_len:,:]
    y_train = y[:train_len]
    y_valid = y[train_len:]
    logit = LogisticRegression(C=C, n_jobs=-1, random_state = seed)
    logit.fit(X_train, y_train)
    valid_pred = logit.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, valid_pred)

In [33]:
get_auc_lr_valid(X_train_sparse, y_train)

0.9197951046350002

In [34]:
def write_to_submission_file(
    predicted_labels, out_file, target="target", index_label="session_id"
):
    predicted_df = pd.DataFrame(
        predicted_labels,
        index=np.arange(1, predicted_labels.shape[0] + 1),
        columns=[target],
    )
    predicted_df.to_csv(out_file, index_label=index_label)

In [80]:
X_train_sparse.shape

(253561, 48371)

In [35]:
logit = LogisticRegression(n_jobs=-1, random_state = 17)
logit.fit(X_train_sparse, y_train)
test_pred = logit.predict_proba(X_test_sparse)[:, 1]

In [37]:
test_pred.shape

(82797,)

In [43]:
pd.Series(test_pred, index=range(1,test_pred.shape[0]+1), name='target').to_csv('first_test.csv',header=True, index_label='session_id')

# Улучшаем модель с помощью даты и времни

In [44]:
time = ['time%d' % i for i in range(1,11)]
train_df[time].head()

Unnamed: 0_level_0,time1,time2,time3,time4,time5,time6,time7,time8,time9,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,2013-01-12 08:05:57,2013-01-12 08:05:57,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
54843,2013-01-12 08:37:23,2013-01-12 08:37:23,2013-01-12 09:07:07,2013-01-12 09:07:09,NaT,NaT,NaT,NaT,NaT,NaT
77292,2013-01-12 08:50:13,2013-01-12 08:50:14,2013-01-12 08:50:15,2013-01-12 08:50:15,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:17,2013-01-12 08:50:17
114021,2013-01-12 08:50:17,2013-01-12 08:50:17,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:19,2013-01-12 08:50:19,2013-01-12 08:50:19,2013-01-12 08:50:20
146670,2013-01-12 08:50:20,2013-01-12 08:50:20,2013-01-12 08:50:20,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:22,2013-01-12 08:50:22,2013-01-12 08:50:22


In [53]:
new_feat_train = pd.DataFrame(index=train_df.index)
new_train_test = pd.DataFrame(index=test_df.index)

In [92]:
new_feat_train['year_month'] = train_df['time1'].apply(lambda x: x.year*100 + x.month)
new_train_test['year_month'] = test_df['time1'].apply(lambda x: x.year*100 + x.month)

In [93]:
train_df['time1'].apply(lambda x: x.year*100 + x.month)

session_id
21669     201301
54843     201301
77292     201301
114021    201301
146670    201301
           ...  
12224     201404
164438    201404
12221     201404
156968    201404
204762    201404
Name: time1, Length: 253561, dtype: int64

In [95]:
new_train_test.head()

Unnamed: 0_level_0,year_month,year_month_scaled
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,201410,0.822948
2,201407,0.752287
3,201412,0.870055
4,201411,0.846501
5,201405,0.705179


Необходимо привести к схожему с остальными данными виду, иначе модель будет работать некорректно

In [65]:
scaler = StandardScaler()
scaler.fit(new_feat_train['year_month'].values.reshape(-1,1))
new_feat_train['year_month_scaled'] = scaler.transform(new_feat_train['year_month'].values.reshape(-1,1))
new_train_test['year_month_scaled'] = scaler.transform(new_train_test['year_month'].values.reshape(-1,1))

In [69]:
new_feat_train.year_month_scaled.values

array([-1.74440496, -1.74440496, -1.74440496, ...,  0.68162559,
        0.68162559,  0.68162559])

In [83]:
X_train_sparse_new = csr_matrix(hstack([X_train_sparse, new_feat_train.year_month_scaled.values.reshape(-1,1)]))
X_test_sparse_new = csr_matrix(hstack([X_test_sparse, new_train_test.year_month_scaled.values.reshape(-1,1)]))

In [84]:
X_train_sparse_new.shape

(253561, 48372)

In [85]:
get_auc_lr_valid(X_train_sparse_new, y_train)

0.9198903563591923

In [86]:
logit.fit(X_train_sparse_new, y_train)
test_pred = logit.predict_proba(X_test_sparse_new)[:, 1]

In [87]:
X_train_sparse_new.shape

(253561, 48372)

In [88]:
test_pred.shape

(82797,)

In [89]:
pd.Series(test_pred, index=range(1,test_pred.shape[0]+1), name='target').to_csv('second_test.csv',header=True, index_label='session_id')

# Мои добавки

In [99]:
new_feat_train_day = pd.DataFrame(index=train_df.index)
new_train_test_day = pd.DataFrame(index=test_df.index)

In [100]:
new_feat_train_day['year_month_day'] = train_df['time1'].apply(lambda x: x.year*10000 + x.month*100 + x.day)
new_train_test_day['year_month_day'] = test_df['time1'].apply(lambda x: x.year*10000 + x.month*100 + x.day)

In [102]:
new_train_test_day.head()

Unnamed: 0_level_0,year_month_day
session_id,Unnamed: 1_level_1
1,20141004
2,20140703
3,20141205
4,20141104
5,20140516


In [103]:
scaler = StandardScaler()
scaler.fit(new_feat_train_day['year_month_day'].values.reshape(-1,1))
new_feat_train_day['year_month_scaled_day'] = scaler.transform(new_feat_train_day['year_month_day'].values.reshape(-1,1))
new_train_test_day['year_month_scaled_day'] = scaler.transform(new_train_test_day['year_month_day'].values.reshape(-1,1))

In [105]:
X_train_sparse_new_day = csr_matrix(hstack([X_train_sparse, new_feat_train_day.year_month_scaled_day.values.reshape(-1,1)]))
X_test_sparse_new_day = csr_matrix(hstack([X_test_sparse, new_train_test_day.year_month_scaled_day.values.reshape(-1,1)]))

In [106]:
get_auc_lr_valid(X_train_sparse_new_day, y_train)

0.9196413583892163

In [111]:
new_feat_train = new_feat_train.drop('year_month_day', axis=1)

In [112]:
new_train_test = new_train_test.drop('year_month_day', axis=1)

# 3

In [129]:
train_df['time1'].apply(lambda x: np.sign(x.hour-12))

session_id
21669    -1
54843    -1
77292    -1
114021   -1
146670   -1
         ..
12224     1
164438    1
12221     1
156968    1
204762    1
Name: time1, Length: 253561, dtype: int64

In [130]:
new_feat_train['start_time'] = train_df['time1'].apply(lambda x: np.sign(x.hour-12))
new_train_test['start_time'] = test_df['time1'].apply(lambda x: np.sign(x.hour-12))

In [131]:
new_train_test.head()

Unnamed: 0_level_0,year_month,year_month_scaled,start_time
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,201410,0.822948,-1
2,201407,0.752287,-1
3,201412,0.870055,1
4,201411,0.846501,-1
5,201405,0.705179,1


In [132]:
X_train_sparse_new = csr_matrix(hstack([X_train_sparse_new, new_feat_train.start_time.values.reshape(-1,1)]))
X_test_sparse_new = csr_matrix(hstack([X_test_sparse_new, new_train_test.start_time.values.reshape(-1,1)]))

In [133]:
X_train_sparse_new.shape

(253561, 48373)

In [157]:
get_auc_lr_valid(X_train_sparse_new, y_train, C=0.13)

0.9519600721678991

In [135]:
logit.fit(X_train_sparse_new, y_train)
test_pred = logit.predict_proba(X_test_sparse_new)[:, 1]

In [136]:
test_pred.shape

(82797,)

In [137]:
pd.Series(test_pred, index=range(1,test_pred.shape[0]+1), name='target').to_csv('3_test.csv',header=True, index_label='session_id')

# 4

In [142]:
train_df['time1'].apply(lambda x: x.hour*3600 + x.minute*60 + x.second)

session_id
21669     29157
54843     31043
77292     31813
114021    31817
146670    31820
          ...  
12224     84828
164438    84855
12221     85088
156968    85116
204762    85193
Name: time1, Length: 253561, dtype: int64

In [144]:
train_df['time10'].apply(lambda x: x.hour*3600 + x.minute*60 + x.second) - train_df['time1'].apply(lambda x: x.hour*3600 + x.minute*60 + x.second)

session_id
21669       NaN
54843       NaN
77292       4.0
114021      3.0
146670      2.0
          ...  
12224      12.0
164438    178.0
12221      28.0
156968     77.0
204762      NaN
Length: 253561, dtype: float64

In [145]:
train_df['time10'].apply(lambda x: x.hour*3600 + x.minute*60 + x.second)

session_id
21669         NaN
54843         NaN
77292     31817.0
114021    31820.0
146670    31822.0
           ...   
12224     84840.0
164438    85033.0
12221     85116.0
156968    85193.0
204762        NaN
Name: time10, Length: 253561, dtype: float64

In [165]:
train_df.query('time3=="NaN" & target==1')

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
197753,775,2013-04-12 09:26:15,616,2013-04-12 09:26:16,0,NaT,0,NaT,0,NaT,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,1
161160,52,2013-09-12 18:20:54,1057,2013-09-12 18:20:54,0,NaT,0,NaT,0,NaT,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,1
118404,3000,2014-01-16 13:04:20,0,NaT,0,NaT,0,NaT,0,NaT,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,1
43722,1,2014-01-28 14:14:19,63,2014-01-28 14:17:03,0,NaT,0,NaT,0,NaT,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,1
184965,39,2014-02-24 18:14:19,21,2014-02-24 18:14:19,0,NaT,0,NaT,0,NaT,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,1
60640,3470,2014-02-27 15:16:40,0,NaT,0,NaT,0,NaT,0,NaT,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,1
116578,63,2014-03-02 11:55:52,0,NaT,0,NaT,0,NaT,0,NaT,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,1
195253,5580,2014-03-18 13:50:09,30,2014-03-18 13:51:08,0,NaT,0,NaT,0,NaT,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,1
131027,80,2014-03-24 18:19:24,77,2014-03-24 18:19:24,0,NaT,0,NaT,0,NaT,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,1
9516,5301,2014-03-31 16:48:56,5301,2014-03-31 16:49:00,0,NaT,0,NaT,0,NaT,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,1


In [180]:
train_df.site1.apply(lambda x: 0 if x==0 else 1) + train_df.site2.apply(lambda x: 0 if x==0 else 1)

session_id
21669     2
54843     2
77292     2
114021    2
146670    2
         ..
12224     2
164438    2
12221     2
156968    2
204762    2
Length: 253561, dtype: int64

In [228]:
s_sum = 0
for i in range(1,11):
    s_sum += train_df['site%s' %i].apply(lambda x: 0 if x==0 else 1)

In [229]:
s_sum

session_id
21669      2
54843      4
77292     10
114021    10
146670    10
          ..
12224     10
164438    10
12221     10
156968    10
204762     7
Name: site1, Length: 253561, dtype: int64

In [354]:
s_sum.to_frame().values.reshape(-1,1)

253561

In [232]:
X_train_sparse_new = csr_matrix(hstack([X_train_sparse_new, s_sum.to_frame().values.reshape(-1,1)]))

In [356]:
s_sum = 0
for i in range(1,11):
    s_sum += test_df['site%s' %i].apply(lambda x: 0 if x==0 else 1)
X_test_sparse_new = csr_matrix(hstack([X_test_sparse_new, s_sum.to_frame().values.reshape(-1,1)]))

In [245]:
X_train_sparse_new.shape

(253561, 48375)

In [246]:
X_test_sparse_new.shape

(82797, 48375)

In [250]:
get_auc_lr_valid(X_train_sparse_new, y_train, C=1)

0.9508667906901479

In [251]:
logit.fit(X_train_sparse_new, y_train)
test_pred = logit.predict_proba(X_test_sparse_new)[:, 1]

In [252]:
test_pred.shape

(82797,)

In [253]:
pd.Series(test_pred, index=range(1,test_pred.shape[0]+1), name='target').to_csv('4_test.csv',header=True, index_label='session_id')

# 5

In [365]:
s_sum1 = 0
for i in range(1,11):
    s_sum1 += train_df['site%s' %i].apply(lambda x: 0 if x==0 else 1)
s_sum1

session_id
21669      2
54843      4
77292     10
114021    10
146670    10
          ..
12224     10
164438    10
12221     10
156968    10
204762     7
Name: site1, Length: 253561, dtype: int64

In [370]:
len(s_sum1.tolist())

253561

In [371]:
ind = 0
arr_time_dif = []
for i in s_sum1.tolist():
    arr_time_dif.append(train_df['time%s' % i].iloc[ind].hour*3600 + train_df['time%s' % i].iloc[ind].minute*60 + train_df['time%s' % i].iloc[ind].second - \
    (train_df['time1'].iloc[ind].hour*3600 + train_df['time1'].iloc[ind].minute*60 + train_df['time1'].iloc[ind].second))
    ind += 1

In [374]:
len(arr_time_dif)

253561

In [375]:
arr_time_dif = pd.Series(arr_time_dif).to_frame().rename(columns={0:'dif_time'})

In [376]:
arr_time_dif

Unnamed: 0,dif_time
0,0
1,1786
2,4
3,3
4,2
...,...
253556,12
253557,178
253558,28
253559,77


In [339]:
s_sum2 = 0
for i in range(1,11):
    s_sum2 += test_df['site%s' %i].apply(lambda x: 0 if x==0 else 1)
ind = 0
arr_time_dif2 = []
for i in s_sum2.tolist():
    arr_time_dif2.append(test_df['time%s' % i].iloc[ind].hour*3600 + test_df['time%s' % i].iloc[ind].minute*60 + test_df['time%s' % i].iloc[ind].second - \
    (test_df['time1'].iloc[ind].hour*3600 + test_df['time1'].iloc[ind].minute*60 + test_df['time1'].iloc[ind].second))
    ind += 1
arr_time_dif2 = pd.Series(arr_time_dif2).to_frame().rename(columns={0:'dif_time'})

In [377]:
arr_time_dif2

Unnamed: 0,dif_time
0,-0.436056
1,-0.179700
2,-0.182986
3,-0.445915
4,-0.416336
...,...
82792,-0.459062
82793,-0.330884
82794,-0.432769
82795,-0.416336


In [378]:
len(arr_time_dif['dif_time'].values.reshape(-1,1))

253561

In [379]:
scaler.fit(arr_time_dif['dif_time'].values.reshape(-1,1))
arr_time_dif['dif_time'] = scaler.transform(arr_time_dif['dif_time'].values.reshape(-1,1))
arr_time_dif2['dif_time'] = scaler.transform(arr_time_dif2['dif_time'].values.reshape(-1,1))

In [381]:
arr_time_dif.shape

(253561, 1)

In [382]:
len(arr_time_dif.values.reshape(-1,1))

253561

In [383]:
len(arr_time_dif2.values.reshape(-1,1))

82797

In [384]:
X_train_sparse_new = csr_matrix(hstack([X_train_sparse_new, arr_time_dif.values.reshape(-1,1)]))
X_test_sparse_new = csr_matrix(hstack([X_test_sparse_new,arr_time_dif2.values.reshape(-1,1)]))

In [385]:
get_auc_lr_valid(X_train_sparse_new, y_train, C=1)

0.9493769540944051

In [386]:
pd.Series(test_pred, index=range(1,test_pred.shape[0]+1), name='target').to_csv('5_test.csv',header=True, index_label='session_id')