In [1]:
import pandas as pd
import numpy as np
%matplotlib inline 

import datetime
from matplotlib import pyplot as plt
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix, hstack
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [2]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Write predictions to a CSV submission file.

    The rows are labelled 1..n, in the order the predictions are given.

    Parameters
    ----------
    predicted_labels : array-like
        Predictions, one per row. Lists and tuples are accepted as well as
        NumPy arrays.
    out_file : str or file-like
        Destination handed to ``DataFrame.to_csv``.
    target : str, default 'target'
        Name of the prediction column.
    index_label : str, default 'session_id'
        Header of the index column.
    """
    # np.asarray is a no-op for ndarrays and gives plain sequences the
    # ``.shape`` attribute that is needed to build the 1-based index.
    predicted_labels = np.asarray(predicted_labels)
    predicted_df = pd.DataFrame(predicted_labels,
                                index=np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)


In [3]:
# Read the training and test data sets.
# The data directory is defined once so that it only has to be changed in one
# place when the notebook is run on another machine.
DATA_DIR = '/Users/vladimir/Desktop/kaggle/catch me if you can'

test_df = pd.read_csv(DATA_DIR + '/test_sessions.csv',
                      index_col='session_id')
train_df = pd.read_csv(DATA_DIR + '/train_sessions.csv',
                       index_col='session_id')

In [4]:
# NOTE: a former `train_df.sample(frac=1).reset_index(drop=True)` statement was
# removed here: its result was never assigned, so it did not shuffle anything.

# Target vector, taken in file order so that it stays aligned with the
# training rows that are stacked at the top of X_Feat below.
y = train_df['target']
ratio = 0.9
idx = int(round(train_df.shape[0] * ratio))
skf = StratifiedKFold(n_splits=7, shuffle=True, random_state=17)
# Number of training rows; used to cut stacked matrices back into train / test.
train_split = train_df.shape[0]


# Train rows (without the target) followed by the test rows, so that the
# features can be engineered once for both sets.
X_Feat = pd.concat([train_df.drop(columns='target'), test_df], axis=0)


# Names of the ten timestamp columns and the ten site-id columns
times = ['time%s' % i for i in range(1, 11)]
sites = ['site%s' % i for i in range(1, 11)]
# Sort the data by time. Only train_df is reordered here; X_Feat and y were
# built above and keep the original file order.
train_df = train_df.sort_values(by='time1')
train_df.head()


Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,,,,,,...,,,,,,,,,,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,,...,,,,,,,,,,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [5]:
# Dimensions (rows, columns) of the training and the test frame
tuple(frame.shape for frame in (train_df, test_df))

((253561, 21), (82797, 20))

In [6]:
#feature engineering

def get_part_day(x):
    """Map the hour of ``x`` to a part-of-day code.

    Returns 0 for hours 0-6, 1 for 7-11, 2 for 12-18 and 3 for 19-23.
    ``None`` is returned when ``x.hour`` falls in none of these ranges.
    """
    hour_ranges = ((0, 6), (7, 11), (12, 18), (19, 23))
    current_hour = x.hour
    for code, (first, last) in enumerate(hour_ranges):
        if first <= current_hour <= last:
            return code
    return None

    
# Parse the ten timestamp columns; visits that are missing become NaT.
X_Feat[times] = X_Feat[times].apply(pd.to_datetime)
# Missing site ids are replaced by 0 so the columns can be stored as integers.
X_Feat[sites] = X_Feat[sites].fillna(0).astype('int')

# One space-separated string of site ids per session (the TF-IDF input).
X_Feat['sites'] = [" ".join(row) for row in X_Feat[sites].astype('str').values]

# All calendar features are derived from the first timestamp of the session.
# The vectorised .dt accessor replaces the former row-by-row .apply(lambda ...).
session_start = X_Feat['time1']
X_Feat['hour'] = session_start.dt.hour
X_Feat['month'] = session_start.dt.month
X_Feat['year'] = session_start.dt.year
# Running month number, where 1 is January 2013.
X_Feat['yearmonth'] = 12*(X_Feat['year'] - 2013) + X_Feat['month']

X_Feat['most_active_hours1618'] = X_Feat['hour'].between(16, 18).astype('int')
X_Feat['most_active_hours1213'] = X_Feat['hour'].between(12, 13).astype('int')
X_Feat['most_active_hours915'] = X_Feat['hour'].isin([9, 15]).astype('int')

X_Feat['minute'] = session_start.dt.minute
# log1p of the session duration in seconds. The duration is measured up to the
# last recorded timestamp; measuring up to time10 only (as before) yields NaT,
# and therefore a length of 0, for every session with fewer than ten visits.
last_visit = X_Feat[times].max(axis=1)
X_Feat['len'] = np.log1p((last_visit - session_start).dt.total_seconds()).fillna(0)
X_Feat['weekday'] = session_start.dt.weekday
X_Feat['partday'] = session_start.apply(get_part_day)
# ISO week number (second element of the isocalendar tuple).
X_Feat['week'] = session_start.apply(lambda x: x.isocalendar()[1])
X_Feat['saturday'] = (X_Feat['weekday'] == 5).astype('int')
X_Feat['sunday'] = (X_Feat['weekday'] == 6).astype('int')



 





In [13]:
full_sites = X_Feat[sites]
sites_flatten = full_sites.values.flatten()
# CSR matrix built from (data, indices, indptr): every visit contributes a 1 in
# the column of its site id, and each row (session) owns ten consecutive entries.
# NumPy arrays are used instead of a Python list / range of several million items.
# The trailing [:, 1:] drops column 0, the id that stands for "no site".
full_sites_sparse = csr_matrix((np.ones(sites_flatten.shape[0], dtype=int),
                                sites_flatten,
                                np.arange(0, sites_flatten.shape[0] + 10, 10)))[:, 1:]


# Categorical calendar features, one-hot encoded on train and test together.
features_ohe = ['hour', 'weekday', 'partday', 'month', 'yearmonth',"week"]
ohe = OneHotEncoder().fit_transform(X_Feat[features_ohe])

# Positional split: the first train_split rows of X_Feat are the training rows.
train_df = X_Feat.iloc[:train_split]
test_df = X_Feat.iloc[train_split:]


# The vocabulary is learned on the training sessions only and then applied to all rows.
# NOTE(review): with the default token_pattern, TfidfVectorizer ignores tokens
# shorter than two characters, so the padding id "0" and site ids 1-9 never
# reach the vocabulary -- confirm that this is intended.
tf = TfidfVectorizer(ngram_range=(1,7), max_features = 200000)
tf.fit(train_df['sites'].values)
tf_idf = tf.transform(X_Feat['sites'])

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [16]:
features = ['saturday', 'sunday', 'len','most_active_hours915', 'most_active_hours1618', 'most_active_hours1213' ]
# The dense hand-made features are converted to a sparse matrix explicitly
# before being stacked next to the TF-IDF and one-hot blocks.
full_df = hstack([tf_idf, csr_matrix(X_Feat[features].values), ohe], format='csr')
X_train = full_df[:train_split]
X_test = full_df[train_split:]
# Guard against a silent mismatch between the feature rows and the target.
assert X_train.shape[0] == y.shape[0], "training rows and target are misaligned"

#Fit and test predict
# random_state pins the data shuffling done by the liblinear solver.
linear = LogisticRegression(C=5, solver='liblinear', random_state=17)
linear.fit(X_train, y)
# Probability of the positive class (second column of predict_proba).
test_pred = linear.predict_proba(X_test)[:, 1]
#Create submission file
write_to_submission_file(test_pred, "alice_submission_file.csv")
