In [1]:
import pandas as pd
import numpy as np
import typing
import torch

from sklearn.model_selection import train_test_split

from lightautoml.automl.presets.tabular_presets import TabularUtilizedAutoML
from lightautoml.tasks import Task

import phik
from phik.report import plot_correlation_matrix
from phik import report

In [48]:
train_data = pd.read_csv("../all_prepared_for_learn_prep2.csv", low_memory=False)

## EDA analisys

Анализ отчет pandas_profiling (! big size file) https://drive.google.com/file/d/1xQl3LvpX9J0G6gJoaBjzRcBFKZi6QZXz/view?usp=sharing

In [22]:
for col in train_data.select_dtypes(include=np.number).columns:
    train_data[col] = pd.to_numeric(train_data[col], downcast = 'unsigned')

In [3]:
train_data.describe()

Unnamed: 0,Segment,shift,created_year,created_month,created_day,created_hour,created_minute,created_second
count,24687390.0,24687390.0,24687389.0,24687390.0,24687390.0,24687390.0,24687390.0,24687390.0
mean,3.979878,1.053655,2021.0,8.201301,10.13165,12.32463,29.2872,29.52309
std,1.008711,1.81568,0.0,0.7329456,6.38657,5.392736,17.27915,17.31136
min,1.0,-1.0,2021.0,7.0,2.0,0.0,0.0,0.0
25%,3.0,0.0,2021.0,8.0,4.0,8.0,14.0,15.0
50%,4.0,0.0,2021.0,8.0,8.0,12.0,29.0,30.0
75%,5.0,2.0,2021.0,9.0,17.0,16.0,44.0,45.0
max,5.0,9.0,2021.0,9.0,20.0,23.0,59.0,59.0


In [4]:
def check_features(df):
    return pd.DataFrame({'unique_values': df.nunique(),'type': df.dtypes,'pct_missing': df.isna().sum()/len(df) * 100}).sort_values(by = 'pct_missing', ascending=False)

In [5]:
check_features(train_data).T

Unnamed: 0,Segment,gamecategory,subgamecategory,shift,oblast,city,os,osv,created_year,created_month,created_day,created_hour,created_minute,created_second
unique_values,5,25,53,11,82,2413,2,125,1,3,16,24,60,60
type,int64,object,object,int64,object,object,object,object,int64,int64,int64,int64,int64,int64
pct_missing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Находим признаки, имеющие максимальное значение коэффициента корреляции Пирсона с целевой переменной. 
# В качестве отсечки выбрано значение 0.4 по модулю
pearson = train_data.corr().round(2)
pearson_max_corr = (
    pearson.reset_index()
    .rename(columns={'Segment':'pearson', 'index':'feature'})
    .sort_values(by='pearson', ascending=False)
    .query('pearson > 0.4 or pearson < -0.4')
    )
pearson_max_corr

Unnamed: 0,feature,pearson,shift,created_year,created_month,created_day,created_hour,created_minute,created_second
0,Segment,1.0,0.03,,0.24,-0.07,-0.16,0.0,-0.0


Высокая взаимная корреляция (> 0.8) у следующих переменных:
- amenity и catering, healthcare, office, shop
- catering и shop
- office и shop, catering
- healthcare и catering, office, shop,
- transport и crossing

## Modelling

In [10]:
train_data.head()

Unnamed: 0,Segment,gamecategory,subgamecategory,shift,oblast,city,os,osv,created_year,created_month,created_day,created_hour,created_minute,created_second
0,4,Games,Racing,6,Забайкальский Край,Чита,android,10.0,2021,7,5,18,7,40
1,5,Games,Arcade,0,Санкт-Петербург,Санкт-Петербург,android,9.0,2021,8,4,13,34,29
2,5,Games,Puzzle,0,Татарстан,Альметьевск,android,7.1,2021,9,18,9,44,24
3,4,Games,Arcade,0,Москва,Москва,android,11.0,2021,8,2,8,52,24
4,5,Games,Card,0,Санкт-Петербург,Санкт-Петербург,android,8.1.0,2021,9,18,2,36,11


In [15]:
N_THREADS = 4 # threads cnt for lgbm and linear models
N_FOLDS = 5 # folds cnt for AutoML
RANDOM_STATE = 42 # fixed random state for various reasons
#TEST_SIZE = 0.1 # Test size for metric check
TIMEOUT = 100 # Time in seconds for automl run USE TIMEOUT = 1700 for perfect score

np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [12]:
from lightautoml.automl.presets.tabular_presets import TabularUtilizedAutoML
from lightautoml.tasks import Task

In [13]:
task = Task('multiclass', metric = 'crossentropy' )


In [50]:
target = train_data['Segment']
train_data, test_data, train_labels, test_labels = train_test_split(train_data, target, test_size = 0.3)

In [51]:
train_data.head()

Unnamed: 0,Segment,gamecategory,subgamecategory,bundle,shift,oblast,city,os,osv,created_year,created_month,created_day,created_hour,created_minute,created_second
20485632,5,7,1,19376,2,6,1024,0,112,2021,8,4,8,45,27
7483887,3,7,10,9199,0,39,1246,1,79,2021,8,5,22,23,34
14935772,4,7,0,27921,0,11,364,0,0,2021,8,5,19,12,13
7730,5,7,0,20686,0,56,1777,0,112,2021,9,19,9,25,47
16536196,5,7,38,2510,0,81,2402,1,54,2021,9,16,22,40,28


In [52]:
automl = TabularUtilizedAutoML(task = task,
                               timeout = TIMEOUT,
                               cpu_limit = N_THREADS,
                               reader_params = {'n_jobs': N_THREADS,
                                                'cv': N_FOLDS,
                                                'random_state': RANDOM_STATE})

oof_pred = automl.fit_predict(train_data, roles={'target':'Segment'})

In [30]:
from sklearn.metrics import roc_auc_score, log_loss

In [25]:
TARGET_NAME="Segment"

In [32]:
test_pred = automl.predict(test_data)

In [53]:
oof_pred.data

array([[nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan],
       ...,
       [nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan]], dtype=float32)

In [46]:
print('OOF score: {}'.format(log_loss(train_data[TARGET_NAME].values, oof_pred.data)))
print('TEST score: {}'.format(log_loss(test_data[TARGET_NAME].values, test_pred.data)))

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [36]:
for dat, df, name in zip([oof_pred, test_pred], [train_data, test_data], ['train', 'test']):
    # logging.debug('Check aucs {0}...'.format(name))
    for cl in range(5):
        sc = roc_auc_score((df[TARGET_NAME].values == cl).astype(np.float32), dat.data[:, cl])
        print('Class {0} {1} auc score: {2}'.format(cl, name, sc))

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
output = pd.DataFrame({'id': test_data['id'],
                       'per_square_meter_price': automl.predict(test_data).data[:, 0]})
output['per_square_meter_price'] = math.exp(output['per_square_meter_price']) * 0.9

# output.loc[output['per_square_meter_price'] >= 200000, 'per_square_meter_price'] \
#     = output.loc[output['per_square_meter_price'] >= 200000, 'per_square_meter_price'] * 0.9

# output.to_csv('raifHack_ki7.csv', index=False)

In [51]:
output.shape

(2974, 2)