In [1]:
import pickle

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm_notebook

%matplotlib inline
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
# загрузил обучающую и тестовую выборки
train_df = pd.read_csv("data/train_sessions.csv", index_col="session_id")
test_df = pd.read_csv("data/test_sessions.csv", index_col="session_id")

times = ["time%s" % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)
train_df = train_df.sort_values(by="time1")
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [3]:
sites = ["site%s" % i for i in range(1, 11)]
train_df[sites] = train_df[sites].fillna(0).astype("int")
test_df[sites] = test_df[sites].fillna(0).astype("int")
#словарь сайтов
with open(r"data/site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)
# датафрейм словарика сайтов
sites_dict_df = pd.DataFrame(list(site_dict.keys()), index=list(site_dict.values()), columns=["site"])
print("всего сайтов:", sites_dict_df.shape[0])
sites_dict_df.head()

всего сайтов: 48371


Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


In [4]:
#целевая переменная
y_train = train_df["target"]
# объединенная таблица исходных данных
full_df = pd.concat([train_df.drop("target", axis=1), test_df])
# индекс, по которому будем отделять обучающую выборку от тестовой
idx_split = train_df.shape[0]

In [5]:
full_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,site6,time6,site7,time7,site8,time8,site9,time9,site10,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
21669,56,2013-01-12 08:05:57,55,2013-01-12 08:05:57,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT
54843,56,2013-01-12 08:37:23,55,2013-01-12 08:37:23,56,2013-01-12 09:07:07,55,2013-01-12 09:07:09,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT
77292,946,2013-01-12 08:50:13,946,2013-01-12 08:50:14,951,2013-01-12 08:50:15,946,2013-01-12 08:50:15,946,2013-01-12 08:50:16,945,2013-01-12 08:50:16,948,2013-01-12 08:50:16,784,2013-01-12 08:50:16,949,2013-01-12 08:50:17,946,2013-01-12 08:50:17
114021,945,2013-01-12 08:50:17,948,2013-01-12 08:50:17,949,2013-01-12 08:50:18,948,2013-01-12 08:50:18,945,2013-01-12 08:50:18,946,2013-01-12 08:50:18,947,2013-01-12 08:50:19,945,2013-01-12 08:50:19,946,2013-01-12 08:50:19,946,2013-01-12 08:50:20
146670,947,2013-01-12 08:50:20,950,2013-01-12 08:50:20,948,2013-01-12 08:50:20,947,2013-01-12 08:50:21,950,2013-01-12 08:50:21,952,2013-01-12 08:50:21,946,2013-01-12 08:50:21,951,2013-01-12 08:50:22,946,2013-01-12 08:50:22,947,2013-01-12 08:50:22


In [6]:
# таблица с индексами посещенных сайтов в сессии
full_sites = full_df[sites]
full_sites.head()

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,56,55,0,0,0,0,0,0,0,0
54843,56,55,56,55,0,0,0,0,0,0
77292,946,946,951,946,946,945,948,784,949,946
114021,945,948,949,948,945,946,947,945,946,946
146670,947,950,948,947,950,952,946,951,946,947


In [7]:
from scipy.sparse import csr_matrix

In [8]:
csr_matrix?

In [9]:
# последовательность с индексами
sites_flatten = full_sites.values.flatten()

#разреженная матрица
full_sites_sparse = csr_matrix(([1] * sites_flatten.shape[0], sites_flatten, range(0, sites_flatten.shape[0] + 10, 10),))[:, 1:]
X_train_sparse=full_sites_sparse[:idx_split]
X_test_sparse=full_sites_sparse[idx_split:]

In [10]:
sites_flatten.shape[0]

3363580

In [11]:
#построение первой модели (sites only)
def get_auc_lr_valid(X, y, C=1.0, ratio=0.9, seed=17)
    train_len=int(ratio*X.shape[0])
    X_train=X[:train_len,:]
    X_valid=X[train_len:,:]
    logit=LogisticRegression(n_jobs=-1, random_state=seed)
    y_train=y[:train_len]
    y_valid=y[train_len:]
    logit.fit(X_train, y_train)
    valid_preds=logit.predict_proba(X_valid)
    valid_pred=valid_preds[:,1]
    print(valid_pred)
    return roc_auc_score(y_valid, valid_pred)

In [12]:
%%time
get_auc_lr_valid(X_train_sparse, y_train)

[3.25266784e-06 1.63299065e-08 6.55966331e-07 ... 1.75678474e-05
 2.30341756e-05 9.72042358e-06]
Wall time: 5.18 s


0.919794802727792

In [13]:
#Функция для записи прогнозов в файл
def write_to_submission_file(
    predicted_labels, out_file, target="target", index_label="session_id"):
    predicted_df = pd.DataFrame(
        predicted_labels,
        index=np.arange(1, predicted_labels.shape[0] + 1),
        columns=[target],
    )
    predicted_df.to_csv(out_file, index_label=index_label)

In [14]:
#Обучение модели на всей выборке
%%time
logit=LogisticRegression(n_jobs=-1, random_state=17)
logit.fit(X_train_sparse, y_train)
test_pred=logit.predict_proba(X_test_sparse)[:,1]
pd.Series(test_pred,index=range(1,test_pred.shape[0]+1), name='target').to_csv('benchmark1.csv',header=True, index_label='session_id')

Wall time: 4.56 s


In [15]:
test_pred

array([2.21920500e-03, 2.51889508e-09, 6.15995124e-09, ...,
       8.42892042e-03, 3.87555278e-04, 1.29506948e-05])

In [16]:
#временные признаки типа ГГГГММ 
new_feat_train = pd.DataFrame(index=train_df.index)
new_feat_train['year_month'] = train_df['time1'].apply(lambda ts: 100 * ts.year + ts.month)
new_feat_test = pd.DataFrame(index=test_df.index)
new_feat_test['year_month'] = test_df['time1'].dt.year * 100 + test_df['time1'].dt.month

In [17]:
X_train_sparce = full_sites_sparse[:idx_split]
X_test_sparce = full_sites_sparse[idx_split:]

In [18]:
#Отмасштабировал новые признаки
scaler = StandardScaler()
scaler.fit(new_feat_train['year_month'].values.reshape(-1, 1))
new_feat_train['year_month_scaler'] = scaler.transform(new_feat_train['year_month'].values.reshape(-1, 1))

scaler.fit(new_feat_test['year_month'].values.reshape(-1, 1))
new_feat_test['year_month_scaler'] = scaler.transform(new_feat_test['year_month'].values.reshape(-1, 1))

X_train_sparse_new = csr_matrix(hstack([X_train_sparse, new_feat_train['year_month_scaler'].values.reshape(-1, 1)]))

In [19]:
#Построение второй модели (sites+time)
%%time
get_auc_lr_valid(X_train_sparse_new, y_train)

[3.28962996e-06 5.11925512e-08 7.10218021e-07 ... 7.05643474e-06
 1.05194182e-05 4.29727442e-06]
Wall time: 4.1 s


0.9198903563591923

In [20]:
logit.fit(X_train_sparse_new, y_train)

LogisticRegression(n_jobs=-1, random_state=17)

In [21]:
X_test_sparse_new = csr_matrix(hstack([X_test_sparce, new_feat_test['year_month_scaler'].values.reshape(-1, 1)]))
predict = logit.predict_proba(X_test_sparse_new)[:, 1]
write_to_submission_file(predict, 'benchmark2.csv')