In [7]:
#root_dir = 'drive/MyDrive/Colab Notebooks/hachathon_hyper_ad_tech'

In [8]:
%load_ext autoreload

import os
import sys
import joblib
import pickle
import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype, is_object_dtype

import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics
from sklearn.model_selection import StratifiedKFold

# `display`/`HTML` are part of the public `IPython.display` module; importing
# `display` from `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import display, HTML
# Widen the classic-notebook container so wide frames fit on screen.
display(HTML("<style>.container { width:90% !important; }</style>"))

# The project root is the parent of the current working directory.
root_dir = os.path.abspath('..')
# Make the project-local modules under src/ importable. The membership check
# keeps the cell idempotent: re-running it does not pile up duplicate entries.
src_dir = os.path.join(root_dir, 'src/')
if src_dir not in sys.path:
    sys.path.append(src_dir)

import feature_preprocess
import eda_functions

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# ---- Input file names -------------------------------------------------------
TRAIN_FILE = 'train.csv'
TEST_FILE = 'test.csv'
SEGMENTS_FILE = 'Segments.xlsx'
CITIES_FILE = 'data_cities.csv'

# ---- Folders, all resolved against `root_dir` -------------------------------
DATA_FOLDER = os.path.join(root_dir, 'hackathon/')
OUTPUT_FOLDER = os.path.join(root_dir, 'output/')
LGBM_MODELS_FOLDER = os.path.join(root_dir, 'models/lgbm/')
MODES_FOLDER = os.path.join(root_dir, 'models/imputer_modes/')

# Full path to the cities reference table.
PATH_CITIES = os.path.join(root_dir, f'data/{CITIES_FILE}')

# ---- Run parameters ---------------------------------------------------------
# Neither value is referenced in the cells visible here.
# NOTE(review): presumably a random seed and a sampling fraction - confirm.
RANDOM_STATE = 27
FRAC = 0.05

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Time-derived columns passed to feature_preprocess.get_tags_from_time_features.
TIME_TAGS_COLS = [
    'weekday', 'is_weekend', 'is_academic_year',
    'is_early_morning', 'is_morning', 'is_day',
    'is_evening', 'is_late_evening', 'is_night',
]

In [10]:
def preprocess_text_features(df, text_features=None, os_version='osv'):
    """Return a copy of ``df`` with normalised text and OS-version columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified.
    text_features : list of str, optional
        String columns to upper-case. Defaults to ``gamecategory``,
        ``subgamecategory``, ``bundle``, ``oblast``, ``city`` and ``os``.
    os_version : str, default 'osv'
        String column holding the OS version. Every non-digit character is
        removed from it and trailing ``'0'`` characters are then stripped.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.

    Notes
    -----
    NOTE(review): because separators are removed before the trailing zeros
    are stripped, a value such as ``'10.0'`` becomes ``'1'``. Confirm this is
    intended; it is kept unchanged because saved models presumably rely on it.
    """
    if text_features is None:
        columns_to_upper = ['gamecategory', 'subgamecategory', 'bundle', 'oblast', 'city', 'os']
    else:
        columns_to_upper = text_features

    result = df.copy()
    for column in columns_to_upper:
        result[column] = result[column].str.upper()

    digits_only = result[os_version].str.replace(r'[^0-9]', '', regex=True)
    result[os_version] = digits_only.str.rstrip('0')
    return result

In [5]:
# Colab-only override of PATH_CITIES. It is applied only when the Drive file is
# actually present, so a local "Restart & Run All" keeps the PATH_CITIES value
# built from `root_dir` in the configuration cell instead of a dangling path.
COLAB_PATH_CITIES = 'drive/MyDrive/Colab Notebooks/hachathon_hyper_ad_tech/data_cities.csv'
if os.path.exists(COLAB_PATH_CITIES):
    PATH_CITIES = COLAB_PATH_CITIES

In [6]:
def predict_on_test(version='v2.0', postfix='_lgbm_Marina_v2'):
    """Score the test set with the saved per-segment models and save the result.

    The function reads ``TEST_FILE`` from ``DATA_FOLDER``, normalises the text
    columns, imputes ``gamecategory`` / ``subgamecategory`` with the pickled
    modes from ``MODES_FOLDER``, adds the city, time and phone features built
    by ``feature_preprocess``, casts dtypes, and runs one saved model per
    segment. Predictions are written to
    ``OUTPUT_FOLDER/test_predictions{postfix}.csv`` and displayed.

    Parameters
    ----------
    version : str, default 'v2.0'
        Version suffix of the model files
        ``lgbm_segment_{target}_{version}.pkl`` in ``LGBM_MODELS_FOLDER``.
    postfix : str, default '_lgbm_Marina_v2'
        Suffix appended to the output file name.

    Returns
    -------
    None
    """
    print('reading test...')
    test_raw = pd.read_csv(os.path.join(DATA_FOLDER, TEST_FILE))
    print(test_raw.shape)
    print('preprocessing test...')
    test_raw_preprocessed = preprocess_text_features(test_raw)
    del test_raw  # the raw frame is no longer needed; release it early

    # SECURITY: pickle.load can execute arbitrary code - only load files that
    # were produced by this project.
    with open(os.path.join(MODES_FOLDER, 'mode_subgamecategory.pkl'), mode='rb') as file:
        mode_sub = pickle.load(file)
    with open(os.path.join(MODES_FOLDER, 'mode_gamecategory.pkl'), mode='rb') as file:
        mode_game = pickle.load(file)
    test_raw_preprocessed['subgamecategory'] = feature_preprocess.impute_column_test(
        test_raw_preprocessed, 'subgamecategory', mode_sub)
    test_raw_preprocessed['gamecategory'] = feature_preprocess.impute_column_test(
        test_raw_preprocessed, 'gamecategory', mode_game)
    test_raw_preprocessed = feature_preprocess.make_features_from_cities(
        test_raw_preprocessed, PATH_CITIES)

    test_raw_time_ftrs = feature_preprocess.make_features_from_time(
        test_raw_preprocessed,
        dt_target='loc',
        datetime_col_msk='created',
        shift_col='shift',
        fill_shift_na=False,
        shift_filler='MSK',
        dt_format='%Y-%m-%d %H:%M:%S',
    )
    print('make_features_from_time done')

    test_raw_time_tags = feature_preprocess.get_tags_from_time_features(
        test_raw_time_ftrs,
        tags_cols=TIME_TAGS_COLS,
        tags_dict=None)

    print('get_tags_from_time_features done')

    # All three frames are aligned by position. The original reset the index of
    # only the last two, which would misalign rows if the first frame ever
    # carried a non-default index.
    test_raw_all_ftrs = pd.concat(
        [
            test_raw_preprocessed.reset_index(drop=True),
            test_raw_time_ftrs.reset_index(drop=True),
            test_raw_time_tags.reset_index(drop=True),
        ],
        axis=1,
    )
    test_raw_all_ftrs = test_raw_all_ftrs.drop(columns=['created', 'shift'])

    print('dataset prepared successfully')

    os_tags = feature_preprocess.phone_tags(test_raw_all_ftrs)
    osv_new = test_raw_all_ftrs['osv'].astype(str).apply(feature_preprocess.get_version_float)
    # Series.apply yields a Series when the callable returns scalars. Setting
    # `.columns` on a Series does not rename it, so it would be concatenated
    # under its old name 'osv' and duplicate the existing column.
    # NOTE(review): assumes the models expect this feature as 'osv_fixed' - confirm.
    if isinstance(osv_new, pd.Series):
        osv_new = osv_new.rename('osv_fixed')
    else:
        osv_new.columns = ['osv_fixed']
    test_raw_all_ftrs = pd.concat([test_raw_all_ftrs, os_tags, osv_new], axis=1)

    # Empty version strings become 0 so the column can be cast to float below.
    test_raw_all_ftrs['osv'] = test_raw_all_ftrs['osv'].replace('', 0)
    target_names = ['Segment_1', 'Segment_2', 'Segment_3', 'Segment_4', 'Segment_5']
    category_cols = ['os', 'bundle', 'loc_is_weekend_tag', 'loc_weekday_tag', 'city', 'gamecategory', 'loc_is_academic_year_tag', 'subgamecategory', 'type', 'oblast', 'loc_time_of_day_tag', 'new_phone', 'os_']
    for col in category_cols:
        test_raw_all_ftrs[col] = test_raw_all_ftrs[col].astype('category')

    for col in ['osv', 'timezone']:
        test_raw_all_ftrs[col] = test_raw_all_ftrs[col].astype('float')

    # Load every model before predicting so that a missing file fails fast.
    loaded_models = {
        targ: joblib.load(os.path.join(LGBM_MODELS_FOLDER, f'lgbm_segment_{targ}_{version}.pkl'))
        for targ in target_names
    }

    # NOTE(review): the columns are named '*_proba' but `.predict` is called.
    # If the saved objects are sklearn-style classifiers this returns class
    # labels, not probabilities - confirm the model type.
    test_predictions = pd.DataFrame({
        f'{targ}_proba': loaded_models[targ].predict(test_raw_all_ftrs)
        for targ in target_names
    })

    # Make sure the output folder exists so the run cannot fail at the very
    # end, after all the expensive work has been done.
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    test_predictions.to_csv(os.path.join(OUTPUT_FOLDER, f'test_predictions{postfix}.csv'), index=False)
    display(test_predictions)

In [None]:
# Run the whole inference pipeline: reads the test CSV from DATA_FOLDER and
# writes the predictions CSV under OUTPUT_FOLDER.
predict_on_test()

reading test...
(11213629, 9)
preprocessing test...
