In [5]:
%load_ext autoreload

import os
import sys
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

root_dir = os.path.abspath('..')
sys.path.append(os.path.join(root_dir, 'src/'))

import feature_preprocess

In [6]:
# Seed passed as `random_state` to the random splits further down.
RANDOM_STATE = 27

# Input files; every one of them is read from <root_dir>/data/.
DATA_FOLDER = os.path.join(root_dir, 'data/')
SEGMENTS_FILE = 'Segments.xlsx'
TRAIN_FILE = 'train.csv'
TEST_FILE = 'test.csv'

# Загрузка данных

In [7]:
# Read the segment description spreadsheet and show it.
segments_path = os.path.join(DATA_FOLDER, SEGMENTS_FILE)
segments = pd.read_excel(segments_path)
display(segments)

Unnamed: 0,Номер сегмента,Возраст,Пол,Интересы
0,1,"25-34,35-41",Ж,-
1,2,"25-34,35-42",М,Пиво
2,3,"25-34,35-43",Ж,Дети
3,4,"18-24,25-34,35-44","М, Ж",Животные
4,5,"18-24,25-34,35-45","М, Ж",-


In [8]:
# Read the labelled training data (contains the `Segment` target column) and show it.
train_path = os.path.join(DATA_FOLDER, TRAIN_FILE)
train_raw = pd.read_csv(train_path)
display(train_raw)

Unnamed: 0,Segment,gamecategory,subgamecategory,bundle,created,shift,oblast,city,os,osv
0,4,Games,Racing,com.MadOut.BIG,2021-07-05 18:07:40,MSK+6,Забайкальский Край,Чита,android,10.0
1,4,,,com.easybrain.solitaire.klondike.free,2021-07-10 10:38:42,MSK+2,Оренбургская область,Оренбург,Android,10.0.0
2,5,Games,Arcade,com.orbitalknight.ridiculousfreekick,2021-08-04 13:34:29,MSK,Санкт-Петербург,Санкт-Петербург,android,9.0
3,5,,,tcouchgind.scooterextreme.scooter,2021-08-06 07:35:27,MSK+2,Свердловская область,Екатеринбург,android,9
4,4,,,com.FidgetTrading3D.game,2021-08-02 20:43:59,MSK,Московская область,Звенигород,android,6.0.1
...,...,...,...,...,...,...,...,...,...,...
44854511,3,Games,Simulation,1068204657,2021-08-07 17:19:23,MSK,Краснодарский край,Краснодар,ios,14.4.2
44854512,3,Games,Puzzle,com.easybrain.nonogram.color,2021-08-02 09:17:16,MSK,Владимирская область,Владимир,android,11.0
44854513,5,Games,Arcade,com.nordcurrent.canteenhd,2021-09-16 09:26:38,MSK,Брянская область,Брянск,android,5.1
44854514,4,,,com.fugo.wow,2021-07-09 18:02:33,MSK,Татарстан,Казань,Android,7.1.2


In [9]:
# Read the test CSV and show it.
test_path = os.path.join(DATA_FOLDER, TEST_FILE)
test_raw = pd.read_csv(test_path)
display(test_raw)

Unnamed: 0,gamecategory,subgamecategory,bundle,created,shift,oblast,city,os,osv
0,Applications,Shopping,com.allgoritm.youla,2021-09-19 17:31:33,MSK+2,Свердловская область,Екатеринбург,android,10.0
1,,,com.ChocochocoStd.RapBattle,2021-08-03 17:13:17,MSK+2,Ямало-Ненецкий АО,Новый Уренгой,android,10.0
2,,,1387897651,2021-09-17 15:54:00,MSK,Москва,Москва,ios,14.4.0
3,,,com.cooking.family.diary.fever.food.city.craze...,2021-07-05 23:34:59,MSK,Краснодарский край,Краснодар,android,9.0.0
4,Applications,Health & Fitness,com.pedometer.stepcounter.tracker,2021-07-08 15:15:21,MSK,Татарстан,Набережные Челны,android,10.0
...,...,...,...,...,...,...,...,...,...
11213624,,,com.phonemaster.jewelhunter,2021-08-04 05:55:26,MSK,Санкт-Петербург,Санкт-Петербург,android,8.1.0
11213625,,,com.linkdesks.jewellegend,2021-09-17 21:45:02,MSK,Ставрополье,Пятигорск,android,10
11213626,Applications,Health & Fitness,com.pedometer.stepcounter.tracker,2021-07-08 10:24:31,MSK+6,Забайкальский Край,Чита,android,11.0
11213627,,,com.onebutton.mrsuper,2021-07-05 16:02:47,MSK+2,Башкортостан,Белорецк,android,6.0.1


In [10]:
# Disabled exploratory check, kept for reference: for each categorical feature
# it compares the sets of unique values in train and test and prints the share
# of test rows whose value never occurs in train.
# NOTE(review): `absent` is computed as test minus train, i.e. values present in
# test but missing from train, while the printed label 'отсутствует в тесте'
# reads "absent in test" - fix the label before re-enabling this cell.
# set_train = {}
# set_test = {}
# absent = {}
# for ftr in ['gamecategory', 'subgamecategory', 'bundle', 'oblast', 'city', 'os', 'osv']:
#     set_train[ftr] = set(train_raw[ftr])
#     set_test[ftr] = set(test_raw[ftr])
#     print(ftr)
#     print('уникальных в трейне', len(set_train[ftr]))
#     print('уникальных в тесте', len(set_test[ftr]))
#     absent[ftr] = set_test[ftr] - set_train[ftr]
#     print('отсутствует в тесте', len(absent[ftr]))
#     print('% строк с отсутствующей фичей от всех строк в тесте', 100 * test_raw[test_raw[ftr].isin(absent[ftr])].shape[0] / test_raw.shape[0])

In [11]:
def prepare_X_y(data, target, prefix='Segment', drop_duplicates=False):
    """Split a frame into a feature frame and one-vs-rest indicator targets.

    Parameters
    ----------
    data : pandas.DataFrame
        Source rows. The caller's frame is never modified.
    target : str
        Name of the column holding the class label.
    prefix : str, default 'Segment'
        Prefix of the indicator column names produced by ``pd.get_dummies``.
    drop_duplicates : bool, default False
        When True, fully identical rows are removed before splitting.

    Returns
    -------
    X : pandas.DataFrame
        ``data`` (optionally de-duplicated) without the ``target`` column.
    y_list : list of pandas.Series
        One indicator Series per distinct label, in ``pd.get_dummies`` column
        order, each named ``f"{prefix}_{label}"``.
    """
    # `drop_duplicates()` and `drop(columns=...)` both return new frames, so no
    # defensive full copy of the (potentially very large) input is needed.
    rows = data.drop_duplicates() if drop_duplicates else data

    indicators = pd.get_dummies(rows[target], prefix=prefix)
    X = rows.drop(columns=target)
    y_list = [indicators[column] for column in indicators.columns]

    return X, y_list

In [12]:
def baseline_fit(X_train, y_train):
    """Fit the per-bundle frequency baseline.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; only the ``'bundle'`` column is used.
    y_train : iterable of pandas.Series
        Named target Series sharing ``X_train``'s index.

    Returns
    -------
    baseline : dict
        Maps each target name to a DataFrame with columns ``['bundle', <name>]``
        holding the mean of that target within every bundle.
    mean_prob : dict
        Maps each target name to the overall mean of that target.
    """
    bundle = X_train['bundle']

    baseline = {}
    mean_prob = {}
    for y in y_train:
        # Group the target directly by the bundle column (pandas aligns the two
        # on index) instead of joining the whole feature frame to every target.
        baseline[y.name] = y.groupby(bundle).mean().reset_index()
        mean_prob[y.name] = y.mean()
    return baseline, mean_prob

In [25]:
def baseline_predict_proba(X_test, target_names_list, baseline, mean_prob):
    """Score rows with the per-bundle frequency baseline.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Rows to score; only the ``'bundle'`` column is used.
    target_names_list : iterable of str
        Target names to score, in the desired output order.
    baseline : dict
        Target name -> DataFrame with columns ``['bundle', <name>]``, as
        returned by ``baseline_fit`` (one row per bundle).
    mean_prob : dict
        Target name -> fallback value used for bundles missing from ``baseline``.

    Returns
    -------
    list of pandas.Series
        One Series per target name, named after the target. Each Series keeps
        ``X_test``'s index, so predictions stay aligned with the input rows
        (a merge-based lookup would reset it to 0..n-1).
    """
    bundle = X_test['bundle']

    y_test_prob = []
    for name in target_names_list:
        # Bundle -> mean target value lookup table for this target.
        lookup = baseline[name].set_index('bundle')[name]
        # Bundles unseen at fit time map to NaN and fall back to the global mean.
        y_prob = bundle.map(lookup).fillna(mean_prob[name]).rename(name)
        y_test_prob.append(y_prob)
    return y_test_prob

In [14]:
def check_roc_auc(y_true, y_pred_proba):
    """Print the ROC AUC of each target.

    ``y_true`` and ``y_pred_proba`` are walked in parallel; for every pair the
    name of the true-label Series is printed, followed by the score returned by
    ``metrics.roc_auc_score`` on the next line. Nothing is returned.
    """
    for target, scores in zip(y_true, y_pred_proba):
        print(target.name)
        auc = metrics.roc_auc_score(target, scores)
        print(auc)

# Подготовка датасета для бейзлайна

In [15]:
# Hold out 30% of the labelled data, then halve the hold-out into validation
# and test parts; both splits are stratified on `Segment`.
train, valid_test = train_test_split(
    train_raw,
    test_size=0.3,
    random_state=RANDOM_STATE,
    stratify=train_raw['Segment'],
)
valid, test = train_test_split(
    valid_test,
    test_size=0.5,
    random_state=RANDOM_STATE,
    stratify=valid_test['Segment'],
)

In [16]:
# Duplicate rows are dropped from the training part only; validation and test
# keep every row.
X_train, y_train = prepare_X_y(train, target='Segment', prefix='Segment', drop_duplicates=True)
X_valid, y_valid = prepare_X_y(valid, target='Segment', prefix='Segment', drop_duplicates=False)
X_test, y_test = prepare_X_y(test, target='Segment', prefix='Segment', drop_duplicates=False)

In [17]:
# Per-bundle target means plus the overall means used as a fallback.
baseline, mean_prob = baseline_fit(X_train=X_train, y_train=y_train)

In [21]:
# Names of the indicator targets, in the order they appear in y_valid.
target_names_list = [target.name for target in y_valid]

In [30]:
# Baseline scores for the validation part.
y_valid_prob = baseline_predict_proba(
    X_test=X_valid,
    target_names_list=target_names_list,
    baseline=baseline,
    mean_prob=mean_prob,
)

In [31]:
# ROC AUC per target on the validation part.
check_roc_auc(y_true=y_valid, y_pred_proba=y_valid_prob)

Segment_1
0.7131636279639575
Segment_2
0.7034592747880286
Segment_3
0.6541302581239069
Segment_4
0.7214437675553351
Segment_5
0.6462924904332414


In [33]:
# Baseline scores for the held-out test part.
y_test_prob = baseline_predict_proba(
    X_test=X_test,
    target_names_list=target_names_list,
    baseline=baseline,
    mean_prob=mean_prob,
)

In [34]:
# ROC AUC per target on the held-out test part.
check_roc_auc(y_true=y_test, y_pred_proba=y_test_prob)

Segment_1
0.713740497423714
Segment_2
0.7027878141352953
Segment_3
0.654253621954188
Segment_4
0.7214531187616308
Segment_5
0.6466655562708424
