<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Functions" data-toc-modified-id="Functions-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Functions</a></span></li><li><span><a href="#Read-data" data-toc-modified-id="Read-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Read data</a></span></li><li><span><a href="#Preprocess" data-toc-modified-id="Preprocess-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Preprocess</a></span><ul class="toc-item"><li><span><a href="#Top-domains" data-toc-modified-id="Top-domains-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Top domains</a></span></li></ul></li><li><span><a href="#Try-Models" data-toc-modified-id="Try-Models-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Try Models</a></span><ul class="toc-item"><li><span><a href="#Cosine" data-toc-modified-id="Cosine-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Cosine</a></span></li><li><span><a href="#CatBoost" data-toc-modified-id="CatBoost-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>CatBoost</a></span></li></ul></li></ul></div>

## Imports

In [None]:
%pylab inline

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from catboost import CatBoostClassifier, Pool, cv

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn
from datetime import datetime

import re
import os, sys
import json
import pickle

from urllib.parse import urlparse
from urllib.request import urlretrieve, unquote

from sklearn.feature_extraction import FeatureHasher
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.feature_selection import chi2

from collections import Counter
from itertools import chain, product
from nltk.stem.snowball import SnowballStemmer
import unidecode
from time import ctime

from sklearn.model_selection import StratifiedKFold

## Functions

In [None]:
# Snowball stemmers for English and Russian. In the visible part of the notebook
# they are only referenced from commented-out experiments.
stemmer2 = SnowballStemmer(language="english")
stemmer = SnowballStemmer(language="russian")

# Tokenizer taken from a default-configured TfidfVectorizer; used by
# get_path_params_query to split url paths into word tokens.
tfidf_tokenizer = TfidfVectorizer().build_tokenizer()

# Return the list of visit dicts (with "url" and "timestamp") from the "url_data" json,
# ordered by ascending timestamp. The list stored in url_data is left untouched.
def get_timestamp_and_url(url_data):
    visits = list(url_data["visits"])
    visits.sort(key=lambda visit: visit["timestamp"])
    return visits

# Return the spread between the latest and the earliest visit timestamp
# (in milliseconds) of the "url_data" json. With no visits the result is -inf.
def get_timestamp_range_from_url_data(url_data):
    stamps = [visit["timestamp"] for visit in url_data["visits"]]
    earliest = min(stamps, default=np.inf)
    latest = max(stamps, default=-np.inf)
    return latest - earliest

# Convert a url string into its domain (with the extra b and ' characters,
# e.g. "b'domain.com'").
def url2domain(url):
    """Extract the host of an http(s) url as the repr of its UTF-8 bytes.

    Parameters
    ----------
    url : str
        Raw url; repeated "http://" / "https://" prefixes are collapsed first.

    Returns
    -------
    str or None
        ``str(host.encode('utf8'))`` (e.g. ``"b'domain.com'"``) with a leading
        "www." removed, or None when the scheme is not http/https, the host is
        empty, or the host contains any of the characters ``:&@();= ``.
        Callers strip the ``b'...'`` wrapper with ``[2:-1]``.
    """
    url = re.sub('(http(s)*://)+', 'http://', url)
    parsed_url = urlparse(unquote(url.strip()))
    if parsed_url.scheme not in ['http', 'https']:
        return None
    # Raw string: "\." in a plain string literal is an invalid escape sequence.
    netloc = re.search(r"(?:www\.)?(.*)", parsed_url.netloc).group(1)
    # group(1) is never None, so an empty host has to be rejected explicitly;
    # otherwise "b''" would be returned and counted as a domain.
    if not netloc:
        return None
    # Hosts carrying ports, credentials or query-like characters are discarded.
    if any(ch in netloc for ch in ":&@();= "):
        return None
    return str(netloc.encode('utf8')).strip()

def is_hex(s):
    """Return True when s is a non-empty string made only of hexadecimal digits."""
    if not s:
        # None and "" are never hexadecimal.
        return False
    return bool(re.fullmatch(r"[0-9a-fA-F]+", s))

def url2path_params_query(url):
    """Return the path component of an http(s) url, or None for other schemes.

    The url is first passed through unidecode, repeated scheme prefixes are
    collapsed and percent-escapes are decoded. Despite the function name, only
    ``path`` is returned (params and query are not included).
    """
    ascii_url = unidecode.unidecode(url)
    collapsed = re.sub('(http(s)*://)+', 'http://', ascii_url)
    parts = urlparse(unquote(collapsed.strip()))
    return parts.path if parts.scheme in ['http','https'] else None

def get_path_params_query(url_data, split_sessions=None, session_break=3600, mindf=0):
    """Build a bag of word tokens from the url paths visited by one user.

    Parameters
    ----------
    url_data : dict
        Parsed user json with a "visits" list of {"url", "timestamp"} records.
    split_sessions : bool or None
        When exactly True, a list of strings (one per session) is returned.
    session_break : int
        Gap in seconds (timestamps are compared as milliseconds) that starts a
        new session when split_sessions is True.
    mindf : int
        A token is kept only if its count in the module-level ``wcnt`` counter
        (filled by get_domains) is greater than mindf.

    Returns
    -------
    str or list of str
        Space-joined unique tokens (alphabetic, 4..14 characters long), in order
        of first appearance. With split_sessions=True: a list whose last element
        is that token string.
    """
    paths = []
    sessions = [] if split_sessions is True else None
    prev_ts = 0
    for record in sorted(url_data["visits"], key=lambda x: x["timestamp"]):
        if split_sessions is True and prev_ts != 0:
            if record["timestamp"] > prev_ts + 1000*session_break:
                # NOTE(review): closed sessions are stored as raw joined paths while the
                # last session (below) is stored as filtered tokens - confirm which is intended.
                sessions.append(" ".join(paths))
                paths = []

        prev_ts = record["timestamp"]
        path = url2path_params_query(record["url"])
        if path is None:
            # Non-http(s) urls have no path; joining a None would raise TypeError.
            continue
        paths.append(path)

    splits = tfidf_tokenizer(" ".join(paths))
    # dict.fromkeys de-duplicates like a set but keeps a deterministic order.
    words = list(dict.fromkeys(
        token for token in splits
        if (wcnt[token] > mindf and 3 < len(token) < 15 and token.isalpha())
    ))

    if split_sessions is True:
        sessions.append(" ".join(words))
        return sessions
    return " ".join(words)

def _share_vector(counts, size):
    """Turn a Counter keyed by bucket index into a length-`size` vector of shares (all zeros if empty)."""
    vec = np.zeros(size, dtype=float)
    for bucket, n in counts.items():
        vec[bucket] = n
    total = vec.sum()
    return vec / total if total > 0 else vec


# Return the user's domains, separated by spaces, from the "url_data" json
# (without the extra b and ' characters).
def get_domains(url_data, split_sessions=None, no_sequenses=False, session_break=3600, tf=False):
    """Extract the sequence of visited domains (and optional time features) for one user.

    Side effects: updates the module-level counters ``dcnt`` (every domain
    occurrence), ``ducnt`` (each domain once per call) and ``wcnt``
    (dot-separated parts of every domain occurrence).

    Parameters
    ----------
    url_data : dict
        Parsed user json with a "visits" list of {"url", "timestamp"} records.
    split_sessions : bool or None
        True  -> return a list with one domain string per session.
        False -> a session gap only resets the "previous domain" used by no_sequenses.
        None  -> sessions are ignored.
    no_sequenses : bool
        Drop a domain that equals the previously kept one.
    session_break : int
        Session gap in seconds (timestamps are compared as milliseconds).
    tf : bool
        Also return the shares of visits per hour (24), weekday (7),
        weekend flag (2) and week of month (5). All visits are counted, including
        those whose url yields no domain.

    Returns
    -------
    list of str
        When split_sessions is True (time features are not returned in this mode).
    pandas.Series
        Otherwise: element 0 is the space-joined domain string; with tf=True it is
        followed by the 38 time shares as numbers.
    """
    hours = Counter()
    weekdays = Counter()
    weekends = Counter()
    week_in_months = Counter()

    domains = []
    domain_set = set()
    sessions = [] if split_sessions is True else None
    prev_ts = 0
    prev_domain = None
    for record in sorted(url_data["visits"], key=lambda x: x["timestamp"]):
        if prev_ts != 0 and record["timestamp"] > prev_ts + 1000*session_break:
            if split_sessions is False:
                prev_domain = None
            elif split_sessions is True:
                sessions.append(" ".join(domains))
                domains = []
                prev_domain = None

        prev_ts = record["timestamp"]
        if tf:
            # Only pay for the datetime conversion when time features are requested.
            tf_ = get_tf(prev_ts)
            hours[tf_["hour"]] += 1
            weekdays[tf_["weekday"]] += 1
            weekends[tf_["weekend"]] += 1
            week_in_months[tf_["week_in_month"]] += 1

        domain = url2domain(record["url"])
        if domain is None:
            continue
        # url2domain returns "b'...'": strip the b' prefix and the closing quote.
        domain = domain[2:-1]
        dcnt[domain] += 1
        if domain not in domain_set:
            ducnt[domain] += 1
            domain_set.add(domain)
        wcnt.update(domain.split("."))

        if no_sequenses and domain == prev_domain:
            continue
        prev_domain = domain
        domains.append(domain)

    if split_sessions is True:
        sessions.append(" ".join(domains))
        return sessions
    if tf:
        # Build the row from a list, not np.hstack: stacking a string with float
        # arrays casts every time feature to a string. Empty histograms stay at
        # zero instead of becoming 0/0 = NaN.
        return pd.Series([" ".join(domains)]
                         + list(_share_vector(hours, 24))
                         + list(_share_vector(weekdays, 7))
                         + list(_share_vector(weekends, 2))
                         + list(_share_vector(week_in_months, 5)))
    return pd.Series(" ".join(domains))

def get_tf(ts):
    """Split a millisecond timestamp into calendar features.

    The timestamp is interpreted with datetime.fromtimestamp, i.e. in the local
    timezone of the machine. Returns a dict with "day", "week_in_month"
    ((day - 1) // 7), "weekday" (Monday is 0), "weekend" (1 when weekday > 4,
    else 0) and "hour".
    """
    moment = datetime.fromtimestamp(ts/1000)
    day_of_week = moment.weekday()
    return {
        "day": moment.day,
        "week_in_month": (moment.day - 1) // 7,
        "weekday": day_of_week,
        "weekend": 1 if day_of_week > 4 else 0,
        "hour": moment.hour,
    }

# Return a space-separated domain string obtained by filtering the given
# space-separated string: only domains contained in top_domains are kept and,
# when no_repeatition is set, only the first occurrence of each domain.
def get_filtered_domains(url_data, top_domains, no_repeatition=False):
    seen = set()
    kept = []
    for token in url_data.split():
        if token not in top_domains:
            continue
        if no_repeatition and token in seen:
            continue
        seen.add(token)
        kept.append(token)
    return " ".join(kept)

# Return the full urls from the "url_data" json as one space-separated string,
# in the order in which the visits are stored.
def get_urls(url_data):
    return " ".join(visit["url"] for visit in url_data["visits"])

# Return the domain column with rarely seen domains removed.
# domain_counts - DataFrame indexed by domain with a "counts" column (see Preprocess)
# min_df - only domains whose count is strictly greater than min_df are kept
# no_repeatition - keep only the first occurrence of each domain per row
def get_top_domain_feature(df, domain_counts, min_df=0, no_repeatition=False):
    frequent = domain_counts.loc[domain_counts["counts"] > min_df]
    top_domains = set(frequent.index.values)
    return df["domain"].apply(
        lambda visited: get_filtered_domains(
            visited, top_domains=top_domains, no_repeatition=no_repeatition
        )
    )

# Integer codes for the gender labels and for the age-bucket labels.
gm = dict(F=0, M=1)
am = {bucket: code for code, bucket in enumerate(["18-24", "25-34", "35-44", "45-54", ">=55"])}
def ga2cat(g, a):
    """Combine a gender label and an age label into one class id: gender code * 5 + age code."""
    gender_code, age_code = gm[g], am[a]
    return 5 * gender_code + age_code

def cat2ga(c):
    """Inverse of ga2cat: map a class id (0..9) back to its (gender, age) labels."""
    genders = ["F", "M"]
    ages = ["18-24", "25-34", "35-44", "45-54", ">=55"]
    gender_label = genders[c//5]
    # Ids 5..9 belong to the second gender; shift them back onto the age buckets.
    age_index = c - 5 if c > 4 else c
    return gender_label, ages[age_index]

def containsAny(s, ch):
    """Return True when at least one item of ch is contained in s, else False."""
    found = False
    # Every item is tested (no early exit), as in the list-based formulation.
    for item in ch:
        if item in s:
            found = True
    return found

def get_cosine_centr(vectorize_data, labels, features):
    """Compute one unit-length mean vector ("cosine centre") per label value.

    Parameters
    ----------
    vectorize_data : pandas.DataFrame
        Frame holding the label columns and the feature columns.
    labels : list of str
        Names of the label columns to group by.
    features : list of str
        Names of the feature columns that are averaged.

    Returns
    -------
    dict
        Maps ``str(label value)`` to the L2-normalised column-wise mean of
        `features` over the rows with that value. Values from different label
        columns share one dict, so equal string keys overwrite each other.
    """
    cosine_centr = {}

    for label_col in labels:
        for value in vectorize_data[label_col].unique():
            rows = vectorize_data[vectorize_data[label_col] == value]
            # Explicit column-wise mean: np.mean(DataFrame) depends on how the
            # installed pandas treats axis=None.
            centroid = rows[features].mean(axis=0).values
            # np.sqrt / np.sum are spelled out so the function does not rely on the
            # names injected by "%pylab inline".
            centroid = centroid / np.sqrt(np.sum(np.multiply(centroid, centroid)))
            cosine_centr[str(value)] = centroid
    return cosine_centr

def get_common_and_target(c1, c2):
    """Split two centre vectors into a shared direction and two class-specific ones.

    Returns a tuple ``(common, c1_, c2_)`` where ``common`` is the L2-normalised
    mean of c1 and c2, and ``c1_`` / ``c2_`` are the L2-normalised differences
    ``c1 - common`` / ``c2 - common``. The inputs are not modified.
    """
    def _unit(vec):
        # Explicit numpy calls: no dependency on the sqrt/sum names from "%pylab inline".
        return vec / np.sqrt(np.sum(np.multiply(vec, vec)))

    common = _unit((c1 + c2) / 2)
    return common, _unit(c1 - common), _unit(c2 - common)

def get_common_and_target2(centers):
    """Generalisation of get_common_and_target to any number of centre vectors.

    Returns ``(common, targets)``: ``common`` is the L2-normalised element-wise
    mean of `centers`; ``targets`` is a list with the L2-normalised difference
    ``c - common`` for every c in `centers`, in the same order.
    """
    def _unit(vec):
        # Explicit numpy calls: no dependency on the sqrt/sum names from "%pylab inline".
        return vec / np.sqrt(np.sum(np.multiply(vec, vec)))

    common = _unit(np.mean(centers, axis=0))
    return common, [_unit(c - common) for c in centers]

def _time_feature_names(tf):
    """Column names of the time-share features returned by get_domains(tf=True); [] when tf is off."""
    if not tf:
        return []
    return (["h_"+str(i) for i in range(24)] +
            ["wd_"+str(i) for i in range(7)] +
            ["we_"+str(i) for i in range(2)] +
            ["wm_"+str(i) for i in range(5)])


def _build_domain_frame(df_mini, domain_counts, label_cols, time_features, min_df,
                        no_repeatition, split_sessions, no_sequenses, session_break, tf):
    """Build the frame [label columns, time features, filtered 'domain' string] for every user."""
    d_tf = df_mini["user_json"].apply(json.loads).\
        apply(get_domains, split_sessions=split_sessions, no_sequenses=no_sequenses,
              session_break=session_break, tf=tf)
    d_tf.columns = ["domain"] + time_features

    d_tf["top_domain"] = get_top_domain_feature(d_tf, domain_counts, min_df, no_repeatition=no_repeatition)

    # From here on "domain" holds the filtered (top-domain) string.
    d_tf = df_mini[label_cols].join(d_tf[time_features + ["top_domain"]])
    d_tf.columns = label_cols + time_features + ['domain']
    return d_tf


def _join_tfidf(d_tf, features_g, features_a):
    """Append the dense gender / age TF-IDF matrices as tfidf_g_* / tfidf_a_* columns."""
    tfidf_feats_g = ["tfidf_g_"+str(i) for i in range(features_g.shape[1])]
    tfidf_feats_a = ["tfidf_a_"+str(i) for i in range(features_a.shape[1])]
    d_tf = d_tf.join(
        pd.DataFrame(data=features_g.toarray(),
                     columns=tfidf_feats_g,
                     index=d_tf.index)
    ).join(
        pd.DataFrame(data=features_a.toarray(),
                     columns=tfidf_feats_a,
                     index=d_tf.index)
    )
    return d_tf, tfidf_feats_g, tfidf_feats_a


def _centered_similarities(block, common, targets):
    """Cosine similarity of every row of `block` (after centring and column scaling) to each target."""
    centered = block - common
    # NOTE(review): the scaling is per column (axis=0), not per row; cosine_similarity
    # normalises the rows itself. Kept as in the original - confirm it is intended.
    col_norms = np.sqrt(np.multiply(centered, centered).sum(axis=0))
    # A column that is all zeros after centring would give 0/0 = NaN and make
    # cosine_similarity raise; leave such a column at zero instead.
    col_norms = col_norms.where(col_norms != 0, 1.0)
    centered = centered / col_norms
    return [cosine_similarity(target.reshape(1, -1), centered).ravel() for target in targets]


def _fold_similarity_columns(d_tf, fold, tfidf_feats_g, tfidf_feats_a, gender_ctr, age_ctr, cat_ctr):
    """Similarity columns of one fold, in order: fgs_f, fgs_m, fas_0.., fcs_0.. (all suffixed _<fold>)."""
    common_gender, f_ct, m_ct = gender_ctr
    common_age, a_ct = age_ctr
    common_cat, c_ct = cat_ctr

    gender_block = d_tf[tfidf_feats_g]
    age_block = d_tf[tfidf_feats_a]
    fgs_f, fgs_m = _centered_similarities(gender_block, common_gender[fold], [f_ct[fold], m_ct[fold]])
    fas = _centered_similarities(age_block, common_age[fold], a_ct[fold])
    # The combined gender x age classes also use the age TF-IDF features.
    fcs = _centered_similarities(age_block, common_cat[fold], c_ct[fold])

    suffix = "_"+str(fold)
    columns = {"fgs_f"+suffix: fgs_f, "fgs_m"+suffix: fgs_m}
    for i, sims in enumerate(fas):
        columns["fas_"+str(i)+suffix] = sims
    for i, sims in enumerate(fcs):
        columns["fcs_"+str(i)+suffix] = sims
    return columns


def fit_transform(df_mini, stop_words, domain_counts, min_df=10, no_repeatition=False,
              split_sessions=False, no_sequenses=False, session_break=60*60*1, tf=True, centers=0):
    """Fit the TF-IDF vectorizers and per-fold class centres, and build the feature frame.

    Parameters
    ----------
    df_mini : pandas.DataFrame
        Needs the columns "user_json", "gender" and "age".
    stop_words : list of str
        Domains ignored by the gender vectorizer only.
    domain_counts : pandas.DataFrame
        Per-domain counts, see get_top_domain_feature.
    min_df, no_repeatition : passed to get_top_domain_feature.
    split_sessions, no_sequenses, session_break, tf : passed to get_domains.
    centers : int
        0 -> class centres are computed on the training part of every fold,
        anything else -> on the held-out part.

    Returns
    -------
    tuple of 13 items
        d_tf, gender_sim_cols, age_sim_cols, cat_sim_cols, time_features,
        tfidf_feats_g, tfidf_feats_a, tfidf_g, tfidf_a,
        [common_gender, f_ct, m_ct], [common_age, a_ct], [common_cat, c_ct],
        [label_cols, gender_label, age_label, cat_label].
        The centre lists hold one entry per fold (5 stratified folds).
    """
    tfidf_g = TfidfVectorizer(tokenizer=str.split, stop_words=stop_words)
    tfidf_a = TfidfVectorizer(tokenizer=str.split)

    time_features = _time_feature_names(tf)
    folds = 5
    gender_sim_cols = ["fgs_f_"+str(j) for j in range(folds)] +\
                      ["fgs_m_"+str(j) for j in range(folds)]
    age_sim_cols = ["fas_"+str(i)+"_"+str(j) for i in range(5) for j in range(folds)]
    cat_sim_cols = ["fcs_"+str(i)+"_"+str(j) for i in range(10) for j in range(folds)]
    label_cols = ["gender", "age"]
    gender_label = "g"
    age_label = "a"
    cat_label = "c"

    d_tf = _build_domain_frame(df_mini, domain_counts, label_cols, time_features, min_df,
                               no_repeatition, split_sessions, no_sequenses, session_break, tf)
    d_tf[gender_label] = (d_tf.gender=="M").astype(int)
    d_tf[age_label] = d_tf.age.map(am)
    d_tf[cat_label] = d_tf.apply(lambda x: ga2cat(x['gender'], x['age']), axis=1)

    features_g = tfidf_g.fit_transform(d_tf.domain)
    features_a = tfidf_a.fit_transform(d_tf.domain)
    d_tf, tfidf_feats_g, tfidf_feats_a = _join_tfidf(d_tf, features_g, features_a)

    common_gender = [None]*folds
    f_ct = [None]*folds
    m_ct = [None]*folds
    common_age = [None]*folds
    a_ct = [None]*folds
    common_cat = [None]*folds
    c_ct = [None]*folds

    # Collected and attached in one concat: inserting 85 columns one by one into a
    # wide frame is slow and fragments it.
    sim_columns = {}
    kf = StratifiedKFold(n_splits=folds, random_state=0, shuffle=True)
    for j, (train, test) in enumerate(kf.split(d_tf, d_tf[cat_label])):
        subdf = d_tf.iloc[train] if centers == 0 else d_tf.iloc[test]

        centers_g = get_cosine_centr(subdf, ["gender"], tfidf_feats_g)
        centers_a = get_cosine_centr(subdf, ["age"], tfidf_feats_a)
        centers_c = get_cosine_centr(subdf, [cat_label], tfidf_feats_a)

        common_gender[j], f_ct[j], m_ct[j] = get_common_and_target(centers_g['F'], centers_g['M'])
        common_age[j], a_ct[j] = get_common_and_target2([centers_a['18-24'],
                                                         centers_a['25-34'],
                                                         centers_a['35-44'],
                                                         centers_a['45-54'],
                                                         centers_a['>=55']])
        common_cat[j], c_ct[j] = get_common_and_target2([centers_c[str(i)] for i in range(10)])

        sim_columns.update(_fold_similarity_columns(
            d_tf, j, tfidf_feats_g, tfidf_feats_a,
            [common_gender, f_ct, m_ct], [common_age, a_ct], [common_cat, c_ct]))

    d_tf = pd.concat([d_tf, pd.DataFrame(sim_columns, index=d_tf.index)], axis=1)

    return d_tf, gender_sim_cols, age_sim_cols, cat_sim_cols, time_features,\
                 tfidf_feats_g, tfidf_feats_a, tfidf_g, tfidf_a,\
                [common_gender, f_ct, m_ct], [common_age, a_ct], [common_cat, c_ct],\
                [label_cols, gender_label, age_label, cat_label]


def transform(df_mini, stop_words, domain_counts, tfidf_g, tfidf_a, gender_ctr, age_ctr, cat_ctr,
              min_df=10, no_repeatition=False,
              split_sessions=False, no_sequenses=False, session_break=60*60*1, tf=True,
              label_cols=("gender", "age")):
    """Apply the vectorizers and centres fitted by fit_transform to another frame.

    Parameters
    ----------
    df_mini : pandas.DataFrame
        Needs "user_json" and the columns named in label_cols.
    stop_words : unused here (the fitted tfidf_g already carries them); kept for
        interface compatibility.
    domain_counts, min_df, no_repeatition : see get_top_domain_feature.
    tfidf_g, tfidf_a : fitted vectorizers returned by fit_transform.
    gender_ctr, age_ctr, cat_ctr : the centre lists returned by fit_transform
        ([common, f, m], [common, targets], [common, targets]; one entry per fold).
    split_sessions, no_sequenses, session_break, tf : passed to get_domains.
    label_cols : sequence of str
        Label columns carried over from df_mini. Passed explicitly (with the
        value fit_transform uses as default) so the function no longer depends on
        a notebook-level global.

    Returns
    -------
    pandas.DataFrame
        Label columns, time features, "domain", the TF-IDF columns and the
        per-fold similarity columns (fgs_*, fas_*, fcs_*).
    """
    label_cols = list(label_cols)
    time_features = _time_feature_names(tf)

    d_tf = _build_domain_frame(df_mini, domain_counts, label_cols, time_features, min_df,
                               no_repeatition, split_sessions, no_sequenses, session_break, tf)

    features_g = tfidf_g.transform(d_tf.domain)
    features_a = tfidf_a.transform(d_tf.domain)
    d_tf, tfidf_feats_g, tfidf_feats_a = _join_tfidf(d_tf, features_g, features_a)

    # One set of similarity columns per fitted fold, not a hard-coded 5.
    sim_columns = {}
    for j in range(len(gender_ctr[0])):
        sim_columns.update(_fold_similarity_columns(
            d_tf, j, tfidf_feats_g, tfidf_feats_a, gender_ctr, age_ctr, cat_ctr))

    return pd.concat([d_tf, pd.DataFrame(sim_columns, index=d_tf.index)], axis=1)

In [None]:
def get_time(url_data):
    """Count how often a user visited urls per hour, weekday, weekend flag and week of month.

    Parameters
    ----------
    url_data : dict
        Parsed user json with a "visits" list; every record has a millisecond
        "timestamp" (interpreted in the local timezone).

    Returns
    -------
    pandas.Series
        38 integer counts: 24 hours, 7 weekdays (Monday first), 2 weekend flags
        (index 1 when weekday > 4) and 5 weeks of the month ((day - 1) // 7).
        All zeros when there are no visits.
    """
    hour_features = np.zeros(24, dtype=int)
    weekday_features = np.zeros(7, dtype=int)
    weekend_features = np.zeros(2, dtype=int)
    week_in_month_features = np.zeros(5, dtype=int)

    # One pass over the records, with no intermediate DataFrame or row-wise
    # apply; this also works for an empty visit list.
    for record in url_data["visits"]:
        moment = datetime.fromtimestamp(record["timestamp"] / 1000)
        weekday = moment.weekday()
        hour_features[moment.hour] += 1
        weekday_features[weekday] += 1
        weekend_features[1 if weekday > 4 else 0] += 1
        week_in_month_features[(moment.day - 1) // 7] += 1

    return pd.Series(np.hstack((hour_features, weekday_features, weekend_features, week_in_month_features)))

# test_dd =  pd.DataFrame(df[ : 2]["user_json"].apply(json.loads)).copy()
# test_dd["user_json"].apply(get_time)

In [None]:
# example = {"visits": [{'url': 'http://zebra-zoya.ru/200028-chehol-organayzer-dlja-macbook-11-grid-it.html?utm_campaign=397720794&utm_content=397729344&utm_medium=cpc&utm_source=begun', 'timestamp': 1419688144068}, {'url': 'http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-one-m9-delay-86327.html&lr=213&rpt=story', 'timestamp': 1426666298001}, {'url': 'http://www.sotovik.ru/news/240283-htc-one-m9-zaderzhivaetsja.html', 'timestamp': 1426666298000}, {'url': 'http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-one-m9-delay-86327.html&lr=213&rpt=story', 'timestamp': 1426661722001}, {'url': 'http://www.sotovik.ru/news/240283-htc-one-m9-zaderzhivaetsja.html', 'timestamp': 1426661722000}]}
# print('"'+get_domains(example)+'"')
# print('"'+get_urls(example)+'"')

## Read data

In [None]:
# Load the raw tab-separated dataset.
# NOTE(review): absolute path to a shared location - adjust when running elsewhere.
file_path = '/data/share/project01/gender_age_dataset.txt'
df = pd.read_csv(file_path, sep='\t')
# Set to True by the Preprocess cell once the domain column and counters are built.
is_processed = False
# Last expression of the cell, so that the shape is actually displayed.
df.shape

## Preprocess

In [None]:
# Counts domain occurrences; a domain is counted every time it occurs, also within one user
dcnt = Counter()
# Counts domain occurrences; a domain is counted once per user even if it occurs several times
ducnt = Counter()
# Counts the occurring "words" (dot-separated parts of the domains)
wcnt = Counter()

# Extract the domains from the json string. As a side effect get_domains fills the counters
df["domain"] = (
    df["user_json"]
    .apply(json.loads)
    .apply(get_domains)  # vw.txt
)
#         apply(get_domains, split_sessions=False, no_sequenses=True) # vw2.txt

# Top domains: one row per domain with the number of users that visited it
domain_counts = pd.DataFrame(ducnt.most_common()).set_index(0).rename_axis("domain")
domain_counts.columns = ["counts"]
# domain_counts.head()

is_processed = True

In [None]:
# Per-user bag of url-path tokens (see get_path_params_query).
df["words"] = (
    df["user_json"]
    .apply(json.loads)
    .apply(get_path_params_query)
)

In [None]:
# Users for which gender and age are not both unknown ('-').
# .copy() gives an independent frame, so the column assignments in the following
# cells do not write into a slice of df (SettingWithCopyWarning).
known = df[~((df.gender == '-') & (df.age == '-'))].copy()

In [None]:
# Integer age-bucket code (see the `am` mapping).
known["age_"] = known["age"].map(am)

In [None]:
# Append the url-path tokens to the domain string of every user.
known["domain"] = [" ".join(pair) for pair in zip(known["domain"], known["words"])]

In [None]:
# # save
# if is_processed:
# to_pickle does not create missing parent directories, so make sure "data/" exists.
os.makedirs("data", exist_ok=True)
df.to_pickle("data/df_new.pkl")
#     domain_counts.to_pickle("data/dc.pkl")
# with open("dcnt.pkl", "wb") as dcnt_pkl:
#     pickle.dump([dcnt, ducnt, wcnt], dcnt_pkl)

In [None]:
# # save
# if is_processed:
#     df.to_pickle("data/df.pkl")
#     domain_counts.to_pickle("data/dc.pkl")
# with open("dcnt.pkl", "wb") as dcnt_pkl:
#     pickle.dump([dcnt, ducnt, wcnt], dcnt_pkl)

In [None]:
# # load
# df = pd.read_pickle("data/df.pkl")
# domain_counts = pd.read_pickle("data/dc.pkl")
# with open("dcnt.pkl", "rb") as dcnt_pkl:
#     dcnt, ducnt, wcnt = pickle.load(dcnt_pkl)

### Top domains

In [None]:
# Column with the rare domains thrown out (and repeated visits collapsed)
df["top_domain"] = get_top_domain_feature(df, domain_counts, min_df=10, no_repeatition=True)

In [None]:
# Preview of the filtered per-user domain strings.
df.top_domain.head()

In [None]:
# Preview of the unfiltered per-user domain strings, for comparison.
df.domain.head()

In [None]:
# Per-user spread between the first and the last visit, in milliseconds.
time_range = df["user_json"].apply(
    lambda raw_json: get_timestamp_range_from_url_data(json.loads(raw_json))
)

In [None]:
# Values in days (ms -> s -> min -> h -> days)
time_range.div(1000).div(60).div(60).div(24).describe()

In [None]:
# Number of visits (all of them)
visits = df["domain"].apply(lambda domains: len(domains.split()))
visits.describe()

In [None]:
# Number of visits (rare domains with count <= 10 removed, repeated visits removed)
top_visits = df["top_domain"].apply(lambda domains: len(domains.split()))
top_visits.describe()

## Try Models

### Cosine

In [None]:
# load
# NOTE(review): unpickling executes code stored in the file - only load trusted
# artifacts. This cell also replaces the df / counters built by the cells above.
with open("dcnt.pkl", "rb") as dcnt_pkl:
    dcnt, ducnt, wcnt = pickle.load(dcnt_pkl)
domain_counts = pd.read_pickle("data/dc.pkl")
df = pd.read_pickle("data/df.pkl")

In [None]:
# Domains handed to fit_transform as `stop_words`; there they are ignored by the
# gender TF-IDF vectorizer only. The list contains duplicates (e.g. 'adme.ru').
# The last two entries are escaped UTF-8 byte sequences, the form in which
# get_domains emits non-ASCII hosts.
stop_words = ['cache.betweendigital.com', 'google.am', 'google.az', 'google.by', 'google.de', 'google.fi', 'google.ge', 'google.it', 'google.kz', 'google.lv', 'google.md', 'google.ru', 'google.com', 'google.co.il', 'google.com.ua', 'googleadservices.com', 'tpc.googlesyndication.com', 'pagead2.googlesyndication.com', 'webcache.googleusercontent.com', 'translate.googleusercontent.com', 'rambler.ru', 'id.rambler.ru', 'help.rambler.ru', 'mail.rambler.ru', 'vk.com', 'm.vk.com', 'api.vk.com', 'playvk.com', 'kinogo.net', 'kinogo-net.ru', 'ozon.ru', 'adme.ru', 'ololo.fm', 'hhcdn.ru', 'gismeteo.by', 'gismeteo.md', 'gismeteo.ru', 'gismeteo.ua', 'beta.gismeteo.ru', 'weather.com', 'accuweather.com', 'world-weather.ru', 'm.accuweather.com', 'weather.rambler.ru', 'weatherandtime.net', 'dic.academic.ru', 'films.imhonet.ru', 'rzd.ru', 'doc.rzd.ru', 'kbsh.rzd.ru', 'pass.rzd.ru', 'cargo.rzd.ru', 'press.rzd.ru', 'young.rzd.ru', 'social.rzd.ru', 'contacts.rzd.ru', 'fb.ru', 'm.fb.ru', 'facebook.com', 'l.facebook.com', 'm.facebook.com', 'lm.facebook.com', 'app.facebook.com', 'apps.facebook.com', 'litres.ru', 'muzofon.com', 'ok.ru', 'lk.ssl.mts.ru', 'pay.mts.ru', 'spb.mts.ru', 'shop.mts.ru', 'bonus.mts.ru', 'login.mts.ru', 'oauth.mts.ru', 'goodok.mts.ru', 'legacy.mts.ru', 'lk.ssl.mts.ru', 'ihelper.mts.ru', 'kabinet.mts.ru', 'ihelper.nw.mts.ru', 'ihelper.sib.mts.ru', 'ihelper.nnov.mts.ru', 'irr.ru', 'm.irr.ru', 'ufa.irr.ru', 'omsk.irr.ru', 'orel.irr.ru', 'perm.irr.ru', 'tula.irr.ru', 'tver.irr.ru', 'kazan.irr.ru', 'kirov.irr.ru', 'sochi.irr.ru', 'tomsk.irr.ru', 'russia.irr.ru', 'ryazan.irr.ru', 'samara.irr.ru', 'tambov.irr.ru', 'tyumen.irr.ru', 'barnaul.irr.ru', 'bryansk.irr.ru', 'irkutsk.irr.ru', 'ivanovo.irr.ru', 'izhevsk.irr.ru', 'lipetsk.irr.ru', 'obninsk.irr.ru', 'saratov.irr.ru', 'belgorod.irr.ru', 'kostroma.irr.ru', 'orenburg.irr.ru', 'tolyatti.irr.ru', 'vladimir.irr.ru', 'voronezh.irr.ru', 'krasnodar.irr.ru', 'nabchelny.irr.ru', 'ulyanovsk.irr.ru', 'volgograd.irr.ru', 
'yaroslavl.irr.ru', 'khabarovsk.irr.ru', 'chelyabinsk.irr.ru', 'kaliningrad.irr.ru', 'krasnoyarsk.irr.ru', 'novosibirsk.irr.ru', 'velnovgorod.irr.ru', 'ekaterinburg.irr.ru', 'magnitogorsk.irr.ru', 'novokuznetsk.irr.ru', 'petrozavodsk.irr.ru', 'rostovnadonu.irr.ru', 'tulskaya-obl.irr.ru', 'nizhniynovgorod.irr.ru', 'saint-petersburg.irr.ru', 'vladimirskaya-obl.irr.ru', 'krasnodarskiy-kray.irr.ru', 'nizhegorodskaya-obl.irr.ru', '24open.ru', 'adme.ru', 'bigcinema.tv', 'bolshoyvopros.ru', 'cache.betweendigital.com', 'dic.academic.ru', 'enter.ru', 'fast-torrent.ru', 'films.imhonet.ru', 'gismeteo.ru', 'beta.gismeteo.ru', 'go.mail.ru', 'google.com.ua', 'google.ru', 'hhcdn.ru', 'kinogo.net', 'kinogo-net.ru', 'mail.rambler.ru', 'muzofon.com', 'ololo.fm', 'ozon.ru', 'pass.rzd.ru', 'rambler.ru', 'id.rambler.ru', 'help.rambler.ru', 'mail.rambler.ru', 'seasonvar.ru', 'vk.com', 'm.vk.com', 'api.vk.com', 'playvk.com', 'wow-impulse.ru', 'sp.wow-impulse.ru', 'yandex.ru', 'an.yandex.ru', 'tv.yandex.ru', 'yandex.ua', 'docs.google.com', 'news.google.com.ua', 'nova.rambler.ru', 'weather.rambler.ru', 'reebok.ru', '24medok.ru', 'clck.yandex.ru', 'mail.yandex.ru', 'maps.yandex.ru', 'music.yandex.ru', 'pogoda.yandex.ru', 'rabota.yandex.ru','news.google.com.ua', 'nova.rambler.ru', 'maps.yandex.ru', '\\xd1\\x81\\xd0\\xbe\\xd1\\x84\\xd1\\x82-\\xd0\\xb2\\xd0\\xb0\\xd1\\x80\\xd0\\xb5\\xd0\\xb7.\\xd1\\x80\\xd1\\x84', '\\xd0\\xb0\\xd0\\xbd\\xd0\\xb0\\xd0\\xbb\\xd0\\xbe\\xd0\\xb3\\xd0\\xb8-\\xd0\\xb4\\xd0\\xbe\\xd1\\x80\\xd0\\xbe\\xd0\\xb3\\xd0\\xb8\\xd1\\x85-\\xd0\\xbb\\xd0\\xb5\\xd0\\xba\\xd0\\xb0\\xd1\\x80\\xd1\\x81\\xd1\\x82\\xd0\\xb2.\\xd1\\x80\\xd1\\x84']

In [None]:
# Keep the users for which gender and age are not both unknown ('-').
df_mini = df[~((df.gender == '-') & (df.age == '-'))].copy()

# Fit the vectorizers / centres; centers=1 takes the class centres from the
# held-out part of every fold.
(d_tf, gender_sim_cols, age_sim_cols, cat_sim_cols, time_features,
 tfidf_feats_g, tfidf_feats_a, tfidf_g, tfidf_a,
 (common_gender, f_ct, m_ct), (common_age, a_ct), (common_cat, c_ct),
 (label_cols, gender_label, age_label, cat_label)) = fit_transform(
    df_mini, stop_words, domain_counts,
    no_repeatition=True, no_sequenses=True, tf=True, centers=1,
)

In [None]:
pickle.dump((d_tf, gender_sim_cols, age_sim_cols, cat_sim_cols, time_features,\
                 tfidf_feats_g, tfidf_feats_a, tfidf_g, tfidf_a,\
                [common_gender, f_ct, m_ct], [common_age, a_ct], [common_cat, c_ct],\
                [label_cols, gender_label, age_label, cat_label]), open("new_tr8.pkl", 'wb'), protocol=4)
!rm new_tr8.zip
!zip new_tr8.zip new_tr8.pkl

In [None]:
# Persist everything except the (large) feature frame d_tf; the context manager
# makes sure the file handle is closed.
with open("proj8.pkl", 'wb') as out_file:
    pickle.dump((gender_sim_cols, age_sim_cols, cat_sim_cols, time_features,\
                 tfidf_feats_g, tfidf_feats_a, tfidf_g, tfidf_a,\
                [common_gender, f_ct, m_ct], [common_age, a_ct], [common_cat, c_ct],\
                [label_cols, gender_label, age_label, cat_label]), out_file)

In [None]:
# with open("proj.pkl", "rb") as f:
#     gender_sim_cols, age_sim_cols, cat_sim_cols, time_features, \
#     tfidf_feats_g, tfidf_feats_a, tfidf_g, tfidf_a, \
#     gender_ctr, age_ctr, cat_ctr, \
#     [label_cols, gender_label, age_label, cat_label] = pickle.load(f)

In [None]:
# len(tfidf_feats_g), len(tfidf_feats_a)

In [None]:
# Bundle the fitted centres in the layout expected by transform().
gender_ctr, age_ctr, cat_ctr = (
    [common_gender, f_ct, m_ct],
    [common_age, a_ct],
    [common_cat, c_ct],
)

In [None]:
# Re-apply the fitted vectorizers and centres to (a copy of) the same users.
ddd = transform(df_mini.copy(), stop_words, domain_counts,
                tfidf_g, tfidf_a, gender_ctr, age_ctr, cat_ctr)

In [None]:
# Step plot of the two gender target vectors of fold j (gender_ctr[1] and gender_ctr[2]).
j = 0
first_target, second_target = gender_ctr[1][j], gender_ctr[2][j]
pylab.step(range(len(first_target)), first_target)
pylab.step(range(len(second_target)), second_target)

In [None]:
# Density of the fold-0 similarities to the female / male target vectors, all users.
# kdeplot replaces the deprecated distplot(hist=False).
sn.kdeplot(ddd["fgs_f_0"], label="f")
sn.kdeplot(ddd["fgs_m_0"], label="m")
pylab.legend();

In [None]:
# Density of the fold-0 similarity to the female target, split by the true gender
# (gender_label is 0 for "F" and 1 for "M"). kdeplot replaces the deprecated
# distplot(hist=False).
sn.kdeplot(ddd.loc[d_tf[gender_label] == 0, "fgs_f_0"], label="f")
sn.kdeplot(ddd.loc[d_tf[gender_label] == 1, "fgs_f_0"], label="m")
pylab.legend();

In [None]:
# Density of the fold-0 similarity to the male target, split by the true gender
# (gender_label is 0 for "F" and 1 for "M"). kdeplot replaces the deprecated
# distplot(hist=False).
sn.kdeplot(ddd.loc[d_tf[gender_label] == 0, "fgs_m_0"], label="f")
sn.kdeplot(ddd.loc[d_tf[gender_label] == 1, "fgs_m_0"], label="m")
pylab.legend();

### CatBoost

In [None]:
# Hyper-parameters of the multi-class CatBoost model.
catboost_params = dict(
    loss_function='MultiClass',
#     loss_function='Logloss',
    eval_metric='Accuracy',
    random_seed=42,
    logging_level='Silent',
    learning_rate=0.05,
    rsm=0.6,
    subsample=0.6,
    max_depth=7,
    n_estimators=100,
    bootstrap_type='Bernoulli',
    # overfitting-detector settings
    od_pval=1,
    od_wait=100,
)
model = CatBoostClassifier(**catboost_params)

In [None]:
# NOTE(review): X_train_, X_test_, y_cat_train, y_cat_test and cols are not defined
# in the visible cells - presumably they come from a cell that is not shown; confirm
# before relying on Restart & Run All.
validation_set = (X_test_[cols], y_cat_test)
model.fit(
    X_train_[cols],
    y_cat_train,
    eval_set=validation_set,
    logging_level='Verbose',  # text output of the training progress
#     plot=True
);

In [None]:
# Test set: the most probable class and its probability for every row.
preds = model.predict_proba(X_test_[cols])
test_preds_proba, test_preds = preds.max(axis=1), preds.argmax(axis=1)

In [None]:
# Train set: the most probable class and its probability for every row.
preds = model.predict_proba(X_train_[cols])
train_preds_proba, train_preds = preds.max(axis=1), preds.argmax(axis=1)

In [None]:
# Positions of the most confident predictions: the top half (n // 2 + 1 rows),
# ordered by decreasing predicted probability.
train_good = np.argsort(train_preds_proba)[::-1][: len(train_preds_proba) // 2 + 1]
test_good = np.argsort(test_preds_proba)[::-1][: len(test_preds_proba) // 2 + 1]

In [None]:
# (accuracy on the confident test rows, share of the test set they cover)
(test_preds == y_cat_test).iloc[test_good].sum() / len(test_good), len(test_good) / len(y_cat_test)

In [None]:
# (accuracy on the confident train rows, share of the train set they cover)
(train_preds == y_cat_train).iloc[train_good].sum() / len(train_good), len(train_good) / len(y_cat_train)

In [None]:
# Feature importances of the fitted CatBoost model.
model.get_feature_importance(prettified=True) 