In [1]:
%load_ext autoreload

import os
import sys
import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype, is_object_dtype

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

root_dir = os.path.abspath('..')
sys.path.append(os.path.join(root_dir, 'src/'))

import feature_preprocess

In [2]:
# Names of the input files expected inside the data folder.
SEGMENTS_FILE = 'Segments.xlsx'
TRAIN_FILE = 'train.csv'
TEST_FILE = 'test.csv'
# Data folder, located directly under the project root (`root_dir`).
DATA_FOLDER = os.path.join(root_dir, 'data/')

In [3]:
def calc_nan_percent(df):
    """Return the percentage of missing values in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    pandas.Series
        Indexed by column name; the share of missing cells per column,
        expressed in percent and rounded to two decimals.
    """
    n_rows = len(df)
    missing_per_column = df.isna().sum()
    return (100 * missing_per_column / n_rows).round(2)

# Загрузка данных

In [4]:
# Segment reference table, read from the Excel workbook in the data folder.
segments = pd.read_excel(
    os.path.join(DATA_FOLDER, SEGMENTS_FILE)
)
display(segments)

Unnamed: 0,Номер сегмента,Возраст,Пол,Интересы
0,1,"25-34,35-41",Ж,-
1,2,"25-34,35-42",М,Пиво
2,3,"25-34,35-43",Ж,Дети
3,4,"18-24,25-34,35-44","М, Ж",Животные
4,5,"18-24,25-34,35-45","М, Ж",-


In [5]:
# Training split; it carries the `Segment` column that the test split lacks.
train = pd.read_csv(
    os.path.join(DATA_FOLDER, TRAIN_FILE)
)
display(train)

Unnamed: 0,Segment,gamecategory,subgamecategory,bundle,created,shift,oblast,city,os,osv
0,4,Games,Racing,com.MadOut.BIG,2021-07-05 18:07:40,MSK+6,Забайкальский Край,Чита,android,10.0
1,4,,,com.easybrain.solitaire.klondike.free,2021-07-10 10:38:42,MSK+2,Оренбургская область,Оренбург,Android,10.0.0
2,5,Games,Arcade,com.orbitalknight.ridiculousfreekick,2021-08-04 13:34:29,MSK,Санкт-Петербург,Санкт-Петербург,android,9.0
3,5,,,tcouchgind.scooterextreme.scooter,2021-08-06 07:35:27,MSK+2,Свердловская область,Екатеринбург,android,9
4,4,,,com.FidgetTrading3D.game,2021-08-02 20:43:59,MSK,Московская область,Звенигород,android,6.0.1
...,...,...,...,...,...,...,...,...,...,...
44854511,3,Games,Simulation,1068204657,2021-08-07 17:19:23,MSK,Краснодарский край,Краснодар,ios,14.4.2
44854512,3,Games,Puzzle,com.easybrain.nonogram.color,2021-08-02 09:17:16,MSK,Владимирская область,Владимир,android,11.0
44854513,5,Games,Arcade,com.nordcurrent.canteenhd,2021-09-16 09:26:38,MSK,Брянская область,Брянск,android,5.1
44854514,4,,,com.fugo.wow,2021-07-09 18:02:33,MSK,Татарстан,Казань,Android,7.1.2


In [6]:
# Test split, read from the CSV file in the data folder.
test = pd.read_csv(
    os.path.join(DATA_FOLDER, TEST_FILE)
)
display(test)

Unnamed: 0,gamecategory,subgamecategory,bundle,created,shift,oblast,city,os,osv
0,Applications,Shopping,com.allgoritm.youla,2021-09-19 17:31:33,MSK+2,Свердловская область,Екатеринбург,android,10.0
1,,,com.ChocochocoStd.RapBattle,2021-08-03 17:13:17,MSK+2,Ямало-Ненецкий АО,Новый Уренгой,android,10.0
2,,,1387897651,2021-09-17 15:54:00,MSK,Москва,Москва,ios,14.4.0
3,,,com.cooking.family.diary.fever.food.city.craze...,2021-07-05 23:34:59,MSK,Краснодарский край,Краснодар,android,9.0.0
4,Applications,Health & Fitness,com.pedometer.stepcounter.tracker,2021-07-08 15:15:21,MSK,Татарстан,Набережные Челны,android,10.0
...,...,...,...,...,...,...,...,...,...
11213624,,,com.phonemaster.jewelhunter,2021-08-04 05:55:26,MSK,Санкт-Петербург,Санкт-Петербург,android,8.1.0
11213625,,,com.linkdesks.jewellegend,2021-09-17 21:45:02,MSK,Ставрополье,Пятигорск,android,10
11213626,Applications,Health & Fitness,com.pedometer.stepcounter.tracker,2021-07-08 10:24:31,MSK+6,Забайкальский Край,Чита,android,11.0
11213627,,,com.onebutton.mrsuper,2021-07-05 16:02:47,MSK+2,Башкортостан,Белорецк,android,6.0.1


# Осмотр данных

In [7]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only added a stray "None" to the output.
train.info()
# Per-column share of missing values, in percent.
display(calc_nan_percent(train))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44854516 entries, 0 to 44854515
Data columns (total 10 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   Segment          int64 
 1   gamecategory     object
 2   subgamecategory  object
 3   bundle           object
 4   created          object
 5   shift            object
 6   oblast           object
 7   city             object
 8   os               object
 9   osv              object
dtypes: int64(1), object(9)
memory usage: 3.3+ GB


None

Segment             0.00
gamecategory       37.81
subgamecategory    37.83
bundle              0.04
created             0.00
shift               8.01
oblast              7.70
city               10.70
os                  0.00
osv                 0.00
dtype: float64

In [8]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only added a stray "None" to the output.
test.info()
# Per-column share of missing values, in percent.
display(calc_nan_percent(test))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11213629 entries, 0 to 11213628
Data columns (total 9 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   gamecategory     object
 1   subgamecategory  object
 2   bundle           object
 3   created          object
 4   shift            object
 5   oblast           object
 6   city             object
 7   os               object
 8   osv              object
dtypes: object(9)
memory usage: 770.0+ MB


None

gamecategory       37.82
subgamecategory    37.83
bundle              0.04
created             0.00
shift               8.00
oblast              7.70
city               10.71
os                  0.00
osv                 0.00
dtype: float64

In [9]:
# Distinct `shift` values in each split (train first, then test).
display(
    train['shift'].unique(),
    test['shift'].unique(),
)

array(['MSK+6', 'MSK+2', 'MSK', 'MSK+3', nan, 'MSK+1', 'MSK+4', 'MSK-1',
       'MSK+5', 'MSK+7', 'MSK+9', 'MSK+8'], dtype=object)

array(['MSK+2', 'MSK', 'MSK+1', 'MSK+6', 'MSK+3', 'MSK+4', 'MSK-1', nan,
       'MSK+5', 'MSK+7', 'MSK+9', 'MSK+8'], dtype=object)

In [10]:
# Distinct upper-cased city names per split. `unique()` keeps NaN, so a split
# with missing cities counts the missing value as one extra entry.
cities_train = train['city'].str.upper().unique()
cities_test = test['city'].str.upper().unique()
print(len(cities_train), len(cities_test), sep='\n')

2499
2466


In [11]:
# Of the rows with a missing `shift`, the percentage that still have a `city`.
print(
    100
    * (train['shift'].isnull() & train['city'].notna()).sum()
    / train['shift'].isnull().sum()
)

3.783467691408045


In [12]:
# Of the rows with a missing `shift`, the percentage that still have an `oblast`.
print(
    100
    * (train['shift'].isnull() & train['oblast'].notna()).sum()
    / train['shift'].isnull().sum()
)

3.7835233838742464


In [13]:
# Of the rows with a missing `city`, the percentage that still have an `oblast`.
print(
    100
    * (train['city'].isnull() & train['oblast'].notna()).sum()
    / train['city'].isnull().sum()
)

28.014921691536152


In [14]:
# Of the rows with a missing `oblast`, the percentage that still have a `city`.
print(
    100
    * (train['oblast'].isnull() & train['city'].notna()).sum()
    / train['oblast'].isnull().sum()
)

0.0


In [15]:
# Percentage share of each upper-cased city among the train rows that have a
# city (value_counts() skips missing values by default).
# The counts are built once and reused through `pipe`, instead of upper-casing
# and counting the whole column twice.
(
    train['city']
    .str.upper()
    .value_counts()
    .pipe(lambda counts: 100 * counts / counts.sum())
)

МОСКВА             19.085050
САНКТ-ПЕТЕРБУРГ     7.593280
ЕКАТЕРИНБУРГ        6.722314
КРАСНОДАР           5.728177
КАЗАНЬ              4.721577
                     ...    
ОСОВЦЫ              0.000002
ДУБОВИКИ            0.000002
ЛОГОЙСК             0.000002
НОВОГРУДОК          0.000002
УРДОМА              0.000002
Name: city, Length: 2498, dtype: float64

In [16]:
# Percentage share of each upper-cased city among the test rows that have a
# city (value_counts() skips missing values by default).
# The counts are built once and reused through `pipe`, instead of upper-casing
# and counting the whole column twice.
(
    test['city']
    .str.upper()
    .value_counts()
    .pipe(lambda counts: 100 * counts / counts.sum())
)

МОСКВА             19.090384
САНКТ-ПЕТЕРБУРГ     7.590310
ЕКАТЕРИНБУРГ        6.730807
КРАСНОДАР           5.721603
КАЗАНЬ              4.720040
                     ...    
ИЗБОРСК             0.000010
ЛОКТИОНОВО          0.000010
АГАПОВКА            0.000010
ПЕТРЯЕВКА           0.000010
ОНОХИНО             0.000010
Name: city, Length: 2465, dtype: float64

# Проверка функций временных признаков на тестовом примере

In [17]:
# Small hand-made frame with the `created` / `shift` columns of the real data,
# used to try out the time-feature helpers on known inputs.
df = pd.DataFrame(
    [
        # Different September dates at the same hour.
        ('2021-09-01 04:54:00', 'MSK'),
        ('2021-09-26 04:54:00', 'MSK'),
        # One date, hours spread across the day (one hour is not zero-padded).
        ('2021-09-17 04:54:00', 'MSK'),
        ('2021-09-17 05:54:00', 'MSK'),
        ('2021-09-17 08:54:00', 'MSK'),
        ('2021-09-17 11:54:00', 'MSK'),
        ('2021-09-17 18:54:00', 'MSK'),
        ('2021-09-17 21:54:00', 'MSK'),
        ('2021-09-17 23:54:00', 'MSK'),
        ('2021-09-17 0:54:00', 'MSK'),
        ('2021-09-17 15:54:00', 'MSK'),
        # Same or neighbouring timestamps under different shifts.
        ('2021-07-04 18:07:40', 'MSK'),
        ('2021-07-04 18:07:40', 'MSK+6'),
        ('2021-07-05 18:07:40', 'MSK+6'),
        ('1970-01-01 00:00:00', 'MSK+6'),
        # Zero, negative and missing shift for one timestamp.
        ('2021-08-04 13:34:29', 'MSK'),
        ('2021-08-04 13:34:29', 'MSK-1'),
        ('2021-08-04 13:34:29', np.nan),
    ],
    columns=['created', 'shift'],
)
# Render the hand-made frame so its rows can be compared with the features below.
display(df)

Unnamed: 0,created,shift
0,2021-09-01 04:54:00,MSK
1,2021-09-26 04:54:00,MSK
2,2021-09-17 04:54:00,MSK
3,2021-09-17 05:54:00,MSK
4,2021-09-17 08:54:00,MSK
5,2021-09-17 11:54:00,MSK
6,2021-09-17 18:54:00,MSK
7,2021-09-17 21:54:00,MSK
8,2021-09-17 23:54:00,MSK
9,2021-09-17 0:54:00,MSK


In [18]:
%autoreload 2
df_time_features_loc = feature_preprocess.make_features_from_time(df, dt_target='loc', fill_shift_na=False)
tags = feature_preprocess.get_tags_from_time_features(df_time_features_loc, tags_cols=None, tags_dict=None)
display(df_time_features_loc)
display(tags)

Unnamed: 0,loc_minute,loc_hour,loc_day,loc_month,loc_weekday,loc_is_weekend,loc_days_to_weekend,loc_is_academic_year,loc_is_first_september,loc_is_week_before_first_september,loc_is_early_morning,loc_is_morning,loc_is_day,loc_is_evening,loc_is_late_evening,loc_is_night
0,54.0,4.0,0.0,9.0,2.0,0.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,54.0,4.0,0.0,9.0,6.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,54.0,4.0,0.0,9.0,4.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,54.0,5.0,0.0,9.0,4.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,54.0,8.0,0.0,9.0,4.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,54.0,11.0,1.0,9.0,4.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,54.0,18.0,0.0,9.0,4.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,54.0,21.0,0.0,9.0,4.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8,54.0,23.0,0.0,9.0,4.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9,54.0,0.0,0.0,9.0,4.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


Unnamed: 0,loc_month_tag,loc_weekday_tag,loc_is_weekend_tag,loc_is_academic_year_tag,loc_is_first_september_tag,loc_is_week_before_first_september_tag,loc_time_of_day_tag
0,сентябрь,среда,рабочий_день,учебный_год,первое_сентября,,ночь
1,сентябрь,воскресенье,выходной,учебный_год,,,ночь
2,сентябрь,пятница,рабочий_день,учебный_год,,,ночь
3,сентябрь,пятница,рабочий_день,учебный_год,,,раннее_утро
4,сентябрь,пятница,рабочий_день,учебный_год,,,утро
5,сентябрь,пятница,рабочий_день,учебный_год,,,день
6,сентябрь,пятница,рабочий_день,учебный_год,,,вечер
7,сентябрь,пятница,рабочий_день,учебный_год,,,поздний_вечер
8,сентябрь,пятница,рабочий_день,учебный_год,,,поздний_вечер
9,сентябрь,пятница,рабочий_день,учебный_год,,,ночь


# Создание фичей для трейна и теста и анализ месяцев

In [20]:
# Time features and their tags for the full test split.
test_dt_loc_features = feature_preprocess.make_features_from_time(
    test,
    dt_target='loc',
)
test_dt_loc_tags = feature_preprocess.get_tags_from_time_features(
    test_dt_loc_features,
    tags_cols=None,
    tags_dict=None,
)

In [22]:
# Time features and their tags for the full train split.
train_dt_loc_features = feature_preprocess.make_features_from_time(
    train,
    dt_target='loc',
)
train_dt_loc_tags = feature_preprocess.get_tags_from_time_features(
    train_dt_loc_features,
    tags_cols=None,
    tags_dict=None,
)

In [None]:
# Row counts per `loc_month` value: train first, then test.
display(
    train_dt_loc_features['loc_month'].value_counts(),
    test_dt_loc_features['loc_month'].value_counts(),
)