In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn

# Apply seaborn's default plot styling.
seaborn.set()

In [None]:
# Load the train and test session tables, using session_id as the row index.
train_csv = '../input/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2/train_sessions.csv'
test_csv = '../input/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2/test_sessions.csv'
data_train, data_test = (
    pd.read_csv(csv_path, index_col='session_id')
    for csv_path in (train_csv, test_csv)
)

In [None]:
# Report (rows, columns) for the train frame, then the test frame.
for frame in (data_train, data_test):
    print(frame.shape)

In [None]:
# Column names of the ten (siteN, timeN) pairs that describe one session.
# Both lists are spelled out explicitly, in order, instead of deriving the
# time columns from a set difference over data_train.columns: a set has no
# stable order, and a set difference would also pick up any column added to
# data_train later (e.g. 'open_v') if this cell were re-run.
site_id = ['site{}'.format(i) for i in range(1, 11)]
time_site = ['time{}'.format(i) for i in range(1, 11)]
print(time_site)

In [None]:
# Parse every timestamp column of both frames into pandas datetimes.
for frame in (data_test, data_train):
    for column in time_site:
        frame[column] = pd.to_datetime(frame[column])

In [None]:
# open_v = number of site slots (out of the ten site columns) that are filled in.
data_train['open_v'] = data_train[site_id].notnull().sum(axis=1)
data_test['open_v'] = data_test[site_id].notnull().sum(axis=1)

In [None]:
# Put training sessions in order of their first timestamp; every structure
# derived from data_train after this point inherits this row order.
data_train = data_train.sort_values('time1')

In [None]:
# Dump each session as one space-separated line of site ids (missing slots
# written as 0), without index or header, for the text vectorizer.
for frame, text_path in ((data_train, 'train_sessions_text.txt'),
                         (data_test, 'test_sessions_text.txt')):
    frame[site_id].fillna(0).to_csv(text_path, sep=' ', index=None, header=None)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag of site-id n-grams (1 to 3 consecutive sites), capped at 50000 features.
# The vocabulary is learned from the training sessions only and then reused
# unchanged for the test sessions.
# NOTE(review): tokenisation uses CountVectorizer's default token pattern;
# confirm it treats the written site ids the way you expect.
vectorizer = CountVectorizer(max_features=50000, ngram_range=(1, 3))
with open('train_sessions_text.txt') as train_text:
    X_train_transform = vectorizer.fit_transform(train_text)
with open('test_sessions_text.txt') as test_text:
    X_test_transform = vectorizer.transform(test_text)

In [None]:
# Report (rows, features) for the train matrix, then the test matrix.
for matrix in (X_train_transform, X_test_transform):
    print(matrix.shape)

In [None]:
from scipy.sparse import hstack
def add_time_features(df, X_sparse):
    """Append four 0/1 time-of-day columns to a sparse feature matrix.

    The hour of each row's ``time1`` value is bucketed into morning (7-11),
    day (12-18), evening (19-23) or night (0-6). The four indicator columns
    are stacked to the right of ``X_sparse`` in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``time1`` column whose values expose an ``.hour``
        attribute; must have one row per row of ``X_sparse``.
    X_sparse : scipy sparse matrix
        Existing feature matrix.

    Returns
    -------
    scipy sparse matrix
        ``X_sparse`` followed by the morning, day, evening and night columns.
    """
    # Hour is taken element by element, so the column may hold timestamp
    # objects regardless of the Series dtype.
    hour = df['time1'].map(lambda moment: moment.hour)
    buckets = (
        hour.between(7, 11),    # morning
        hour.between(12, 18),   # day
        hour.between(19, 23),   # evening
        hour.between(0, 6),     # night
    )
    indicator_columns = [
        flag.astype('int').values.reshape(-1, 1) for flag in buckets
    ]
    return hstack([X_sparse] + indicator_columns)

In [None]:
# Final design matrices: n-gram counts plus the time-of-day indicators.
# NOTE(review): add_time_features only reads df['time1'], so the full-frame
# fillna(0) looks unnecessary here — confirm before removing.
X_train = add_time_features(df=data_train.fillna(0), X_sparse=X_train_transform)
X_test = add_time_features(df=data_test.fillna(0), X_sparse=X_test_transform)

In [None]:
# Training labels, taken from data_train in its current row order.
y = data_train.loc[:, 'target']

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit  # splitter for time-ordered data
from sklearn.model_selection import cross_val_score

# Baseline: liblinear logistic regression scored by ROC AUC on ten
# time-ordered splits; the cell displays the mean score.
cv = TimeSeriesSplit(n_splits=10)
model = LogisticRegression(solver='liblinear', C=1, random_state=17)
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y.astype('int'),
    scoring='roc_auc',
    cv=cv,
    n_jobs=1,
)
cv_scores.mean()

In [None]:
# Refit the baseline model on all training rows, then score the test set.
# fit() returns the estimator itself, so the two calls can be chained.
y_prob = model.fit(X_train, y).predict_proba(X_test)

In [None]:
# Second predict_proba column — presumably the probability of target == 1;
# verify against model.classes_.
y_prob[:, 1]

In [None]:
def write_to_submission_file(predicted_labels, out_file, target='target', index_label="session_id"):
    """Write predictions to a CSV submission file.

    Rows are numbered 1..n in the order the predictions are given.

    Parameters
    ----------
    predicted_labels : array-like
        One prediction per row. Lists, NumPy arrays and pandas Series are
        accepted; a Series is used positionally and its own index is ignored.
    out_file : str or file-like
        Destination handed to ``DataFrame.to_csv``.
    target : str
        Header of the prediction column.
    index_label : str
        Header of the row-number column.
    """
    # Coerce first: a plain list has no .shape, and a Series would otherwise be
    # aligned on its own index against the 1..n index built below.
    values = np.asarray(predicted_labels)
    row_ids = np.arange(1, values.shape[0] + 1)
    predicted_df = pd.DataFrame(values, index=row_ids, columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [None]:
# Save the baseline model's second probability column as the submission.
write_to_submission_file(
    predicted_labels=y_prob[:, 1],
    out_file='out_file.csv',
    target='target',
    index_label="session_id",
)

In [None]:
# Read the submission back and show its first five rows as a sanity check.
out = pd.read_csv(filepath_or_buffer='out_file.csv')
out.head(n=5)

In [None]:
from sklearn.model_selection import GridSearchCV

# Search regularisation strength and penalty type for the liblinear logistic
# regression, scored by ROC AUC on the splitter `cv` defined in an earlier cell.
param_grid = {
    'C': [0.01, 0.1, 1, 5],
    'penalty': ['l1', 'l2'],
}
model = LogisticRegression(solver='liblinear', random_state=17)
grid_cv = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=1,
    verbose=True,
)
grid_cv.fit(X_train, y)

In [None]:
print(grid_cv.best_estimator_)  # best estimator found by the search
print(grid_cv.best_params_)  # best parameter combination

In [None]:
# Test-set class probabilities from the fitted grid search.
y_prob_gs = grid_cv.predict_proba(X=X_test)

In [None]:
# Save the grid-search model's second probability column to out_file.csv.
write_to_submission_file(
    predicted_labels=y_prob_gs[:, 1],
    out_file='out_file.csv',
    target='target',
    index_label="session_id",
)