In [2]:
import warnings
import pandas as pd
import pickle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV

warnings.filterwarnings('ignore')

In [3]:
# Both CSV files are read the same way: session_id becomes the index and
# the time1 column is parsed into datetimes
read_options = dict(index_col='session_id', parse_dates=['time1'])

# Training sessions are ordered by time1 right after loading
train_df = pd.read_csv('train_sessions.csv', **read_options).sort_values(by='time1')
test_df = pd.read_csv('test_sessions.csv', **read_options)

# Preview the first rows of the training set
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,,,,,,...,,,,,,,,,,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,,...,,,,,,,,,,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [4]:
# Names of the ten site columns: site1 ... site10
sites = ['site{}'.format(i) for i in range(1, 11)]

# Missing site values are replaced with 0 so the columns can be cast to integers
for sessions in (train_df, test_df):
    sessions[sites] = sessions[sites].fillna(0).astype('int')

# Load the websites dictionary
# NOTE(review): pickle.load can execute arbitrary code - only open a trusted site_dic.pkl
with open(r"site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)

# Dataframe view of the dictionary: its values form the index,
# its keys fill the 'site' column
sites_dict = pd.DataFrame(
    {'site': list(site_dict.keys())},
    index=list(site_dict.values())
)
print(u'Websites total:', sites_dict.shape[0])
sites_dict.head()

Websites total: 48371


Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


In [5]:
# Sanity check: (rows, columns) of the test set first, then of the training set
print(test_df.shape, train_df.shape)


(82797, 20) (253561, 21)


In [6]:
# Labels of the training sessions as a NumPy array
y_train = train_df['target'].values

# Number of training rows: in full_df, rows from this position onward
# come from test_df
idx_split = len(train_df)

# Training sessions (without the target column) stacked on top of the test sessions
full_df = pd.concat([train_df.drop(columns='target'), test_df])

In [7]:
# Write the site columns of each data set as space-separated text,
# one session per line, without index or header
for sessions, text_path in (
    (train_df, 'train_sessions_text.txt'),
    (test_df, 'test_sessions_text.txt'),
):
    sessions[sites].fillna(0).to_csv(text_path, sep=' ', index=None, header=None)

In [8]:
%%time
# Build bag-of-n-grams features from the session text files: each line
# (one session) is a document and each site ID is a "word". The vocabulary
# is learned on the training file only and then reused for the test file.
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of
# two or more characters, so single-digit IDs (including the 0 padding) are
# not counted - confirm this is intended.
cv = CountVectorizer(ngram_range=(1, 3), max_features=50000)
with open('train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)
print(X_train.shape, X_test.shape)

# CountVectorizer converts the input text into a matrix whose values are
# the number of occurrences of each key (word) in the text.

# one two three
# three four two two
# one one one four
# First, CountVectorizer collects the unique keys from all records;
# in this example they are [one, two, three, four].
# The length of the list of unique keys is the length of the encoded
# text (4 in this case), and the element at each position is the number
# of times the key with that position occurs in the line:
# one two three --> [1,1,1,0]
# three four two two --> [0,2,1,1]
# one one one four --> [3,0,0,1]


(253561, 50000) (82797, 50000)
CPU times: user 9.18 s, sys: 305 ms, total: 9.49 s
Wall time: 9.73 s


In [9]:
X_train.shape, y_train.shape

((253561, 50000), (253561,))

In [12]:

# Hyper-parameter grid explored by the search: every combination of
# regularisation strength C, iteration cap and solver is evaluated
grid = {
    'C': 10**np.linspace(-3,1,5),
    'max_iter': [100, 200, 500],
    'solver': ['newton-cg', 'lbfgs', 'saga', 'liblinear']
}

# Hold-out split: the first 80% of the rows are used for the search,
# the remaining 20% for validation
idx = int(round(X_train.shape[0] * 0.8))

# The estimator is passed unfitted: GridSearchCV clones it for every
# candidate, so fitting it beforehand only costs time and has no effect
log_reg = LogisticRegression(random_state=13)

# NOTE(review): warnings are silenced globally in the imports cell, so
# convergence warnings for the small max_iter values will not be shown.
# NOTE(review): train_df is sorted by time1 while cv=5 does not respect
# that ordering; a time-aware splitter (e.g. TimeSeriesSplit) may agree
# better with the hold-out score below - confirm before changing.
gs = GridSearchCV(log_reg, grid, scoring='roc_auc', cv=5, n_jobs=-1)
result = gs.fit(X_train[:idx, :], y_train[:idx])
print(gs.best_estimator_)

# Positive-class probabilities for the validation part, predicted by the
# best estimator refitted on the first 80% of the rows
y_pred = result.predict_proba(X_train[idx:, :])[:, 1]
# ROC AUC on the validation part
score = roc_auc_score(y_train[idx:], y_pred)
print(score)

# Log of earlier manual runs (validation ROC AUC, then the settings used):
# 0.9132519600597074 C=1, solver='lbfgs', max_iter=500, random_state=17, 0.9 - coef to split data train & test
# 0.9266373497217151 C=0.1, solver='newton-cg', max_iter=5000, random_state=13, 0.8 - coef to split data train & test
# 0.9266372884686439 C=0.1, solver='lbfgs', max_iter=5000, random_state=13, 0.8 - coef to split data train & test
# 0.9272478590826543 C=0.1, solver='saga', max_iter=5000, random_state=13, 0.8 - coef to split data train & test
# 0.927651700581267 C=0.1, solver='liblinear', max_iter=100, random_state=13, 0.8 - coef to split data train & test

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=13, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
0.9161529677848047


In [None]:
def write_to_submission_file(
    predicted_labels, out_file,
    target='target', index_label="session_id"
):
    """Write predictions to a CSV file with a 1-based row index.

    Parameters
    ----------
    predicted_labels : array-like
        One prediction per row, in output order. NumPy arrays, lists and
        pandas Series are accepted; an existing pandas index is ignored.
    out_file : str or file-like
        Destination passed to ``DataFrame.to_csv``.
    target : str, default 'target'
        Name of the column holding the predictions.
    index_label : str, default 'session_id'
        Header of the index column, which runs from 1 to the number of rows.
    """
    # Normalise to a plain array: lists have no .shape, and a pandas Series
    # would be re-aligned against the new 1..n index (shifting values and
    # introducing NaN) instead of being written positionally
    values = np.asarray(predicted_labels)
    predicted_df = pd.DataFrame(
        values,
        index=np.arange(1, values.shape[0] + 1),
        columns=[target]
    )
    predicted_df.to_csv(out_file, index_label=index_label)
    

In [None]:
# Final model: logistic regression fitted on the complete training set.
# The hyper-parameters are set explicitly (C=0.1, lbfgs solver, up to 5000
# iterations) and random_state=13 is fixed for reproducibility
lr = LogisticRegression(
    solver='lbfgs', C=0.1, max_iter=5000, random_state=13
)
lr.fit(X_train, y_train)

# Probability of the positive class for every test session
y_test = lr.predict_proba(X_test)[:, 1]

# Save the predictions in the submission format
write_to_submission_file(y_test, 'baseline.csv')