In [1]:
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from matplotlib import pyplot as plt
plt.style.use('ggplot')

import seaborn as sns
import os
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
import pickle as pkl
from collections import defaultdict

from scipy.sparse import csr_matrix, hstack, vstack
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, train_test_split, TimeSeriesSplit
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
import lightgbm as lgbm
import catboost as cb

In [2]:
# Folder containing the input CSVs; both frames use their session_id column as the index.
PATH_TO_DATA = './data'
train_df, test_df = (
    pd.read_csv(os.path.join(PATH_TO_DATA, csv_name), index_col='session_id')
    for csv_name in ('train_sessions.csv', 'test_sessions.csv')
)

In [3]:
# Load the pickled site dictionary. Its keys are host strings and its values are
# the integer codes stored in the site columns of the session frames.
# NOTE(review): pickle.load can execute arbitrary code -- only load this file
# from a trusted source.
with open('./data/site_dic.pkl', 'rb') as site_dict_file:  # closes the handle deterministically
    site_dict = pkl.load(site_dict_file)

# Reverse lookup (code -> host). defaultdict(str) returns '' for codes that are
# not in site_dict; note that indexing with [] also inserts the missing key.
site_dict_inv = defaultdict(str, ((code, host) for host, code in site_dict.items()))

In [7]:
# Column labels 'site1' .. 'site10'.
sites = ['site{}'.format(idx) for idx in range(1, 11)]

# NOTE(review): fillna(0) is applied to every column of train_df, not only the
# site columns -- confirm that zero-filling the remaining columns is intended.
train_df.fillna(0, inplace=True)
# With the NaNs gone the site columns can be stored as integers.
train_df[sites] = train_df[sites].astype(int)
train_df[sites]

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,718,0,0,0,0,0,0,0,0,0
2,890,941,3847,941,942,3846,3847,3846,1516,1518
3,14769,39,14768,14769,37,39,14768,14768,14768,14768
4,782,782,782,782,782,782,782,782,782,782
5,22,177,175,178,177,178,175,177,177,178
6,570,21,570,21,21,0,0,0,0,0
7,803,23,5956,17513,37,21,803,17514,17514,17514
8,22,21,29,5041,14422,23,21,5041,14421,14421
9,668,940,942,941,941,942,940,23,21,22
10,3700,229,570,21,229,21,21,21,2336,2044


In [34]:
# Labels of the derived name columns: one '<site column>_name' per entry of `sites`.
site_name = ['{}_name'.format(col) for col in sites]

def is_ip(ip):
    """Return True when `ip` is exactly four dot-separated, all-digit groups.

    Only the shape is checked: the numeric range of each group is not validated.
    """
    octets = ip.split('.')
    if len(octets) != 4:
        return False
    return all(octet.isdigit() for octet in octets)

def convert(x):
    """Reduce a host string to a short label.

    Parameters
    ----------
    x : str
        Host name or dotted-digit address; '' stands for "no site".

    Returns
    -------
    str
        * '-' for the empty string,
        * `x` unchanged when `is_ip(x)` is true,
        * the second-to-last dot-separated label otherwise
          (e.g. 'www.google.com' -> 'google'),
        * `x` unchanged when it contains no dot at all.
    """
    if x == '':
        return '-'
    if is_ip(x):
        return x
    labels = x.split('.')
    # A single-label host (e.g. 'localhost') has no second-to-last label;
    # return it as is instead of raising IndexError on labels[-2].
    if len(labels) < 2:
        return x
    return labels[-2]

def _short_site_name(site_id):
    """Map one site code to its short label via site_dict_inv and convert.

    Uses .get(..., '') instead of [] so that unknown codes (e.g. 0 or NaN) still
    resolve to '' but are not inserted into the defaultdict as a side effect.
    """
    return convert(site_dict_inv.get(site_id, ''))


# Add a '<site>_name' column next to every site column in both frames.
# convert is applied here so the columns hold the shortened labels (and '-' for
# empty slots) that the saved output of the following cell displays; previously
# convert was defined in this cell but never called.
for site in tqdm_notebook(sites):
    name_col = site + '_name'
    train_df[name_col] = train_df[site].map(_short_site_name)
    test_df[name_col] = test_df[site].map(_short_site_name)




Exception in thread Thread-15:
Traceback (most recent call last):
  File "/home/voudy/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/voudy/anaconda3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/home/voudy/anaconda3/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration






KeyboardInterrupt: 

In [31]:
train_df[site_name]

Unnamed: 0_level_0,site1_name,site2_name,site3_name,site4_name,site5_name,site6_name,site7_name,site8_name,site9_name,site10_name
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,microsoft,-,-,-,-,-,-,-,-,-
2,google,google,google,google,google,google,google,google,193.164.197.30,193.164.196.60
3,googleapis,google,googleapis,googleapis,twitter,google,googleapis,googleapis,googleapis,googleapis
4,annotathon,annotathon,annotathon,annotathon,annotathon,annotathon,annotathon,annotathon,annotathon,annotathon
5,google,wikipedia,wikimedia,wikimedia,wikipedia,wikimedia,wikimedia,wikipedia,wikipedia,wikimedia
6,google,google,google,google,google,-,-,-,-,-
7,google,google,twitter,html5doctor,twitter,google,google,designvegetal,designvegetal,designvegetal
8,google,google,facebook,ztat,zalando,google,google,ztat,ztat,ztat
9,google,google,google,google,google,google,google,google,google,google
10,gc,google,google,google,google,google,google,google,ad6media,ad6media


In [26]:
# NOTE(review): this duplicates the is_ip defined in an earlier cell of this
# notebook; consider keeping a single definition.
def is_ip(ip):
    """Tell whether `ip` consists of exactly four dot-separated, all-digit groups."""
    chunks = ip.split('.')
    if len(chunks) == 4:
        return all(chunk.isdigit() for chunk in chunks)
    return False

# Show every key of site_dict that has the four-digit-group shape accepted by is_ip.
list(filter(is_ip, site_dict.keys()))

['193.50.234.35',
 '83.206.13.2',
 '91.121.134.14',
 '37.59.49.196',
 '83.169.83.154',
 '162.38.181.25',
 '69.195.124.152',
 '210.156.129.164',
 '202.183.63.213',
 '176.58.127.102',
 '83.206.219.162',
 '198.251.79.228',
 '173.236.101.58',
 '128.101.105.82',
 '178.255.153.13',
 '115.30.191.21',
 '37.252.230.18',
 '87.98.128.200',
 '85.17.77.13',
 '5.9.139.3',
 '79.98.96.110',
 '193.164.196.40',
 '74.50.147.105',
 '217.146.14.4',
 '130.209.15.9',
 '203.112.63.12',
 '103.1.187.206',
 '202.183.54.12',
 '199.167.151.66',
 '46.28.49.180',
 '87.230.83.56',
 '37.59.38.219',
 '218.47.39.195',
 '78.24.130.6',
 '5.9.138.194',
 '5.135.140.211',
 '85.31.208.126',
 '178.79.162.68',
 '202.183.63.208',
 '50.18.191.15',
 '195.158.240.48',
 '66.70.56.43',
 '62.75.236.44',
 '193.164.196.60',
 '85.214.84.91',
 '193.164.197.30',
 '176.58.127.107',
 '206.128.127.131',
 '91.121.97.162',
 '94.23.236.222',
 '193.164.196.50',
 '37.252.230.26',
 '59.146.77.13',
 '50.18.122.28',
 '65.54.113.26',
 '88.159.162.66',