<a href="https://colab.research.google.com/github/SorokinMaksimArtemovich/MTS-ML-CUP/blob/main/data%20preprocessing/0.6_complete_datasets_for_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys
import os
import warnings
# Limit OpenBLAS to a single thread. This is set here, before the cell that imports
# numpy/scipy/implicit, so the variable is already in the environment when they load.
# NOTE(review): presumably done to avoid thread oversubscription — confirm.
os.environ['OPENBLAS_NUM_THREADS'] = '1'
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import time
import gc
import pyarrow as pa
import pyarrow.parquet as pq
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.calibration import calibration_curve, CalibratedClassifierCV

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# Render matplotlib figures inline and switch seaborn to its 'darkgrid' style.
# NOTE(review): none of the cells visible in this notebook draws a figure.
%matplotlib inline
sns.set_style('darkgrid')

In [None]:
!pip install feather-format >> none
!pip install faiss-cpu --no-cache

[0mCollecting faiss-cpu
  Downloading faiss_cpu-1.7.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.0/17.0 MB[0m [31m127.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.3
[0m

In [None]:
def age_bucket(x):
    """Return the index (0-6) of the age bucket that ``x`` falls into.

    The buckets are delimited by the inclusive upper bounds 18, 25, 35, 45, 55
    and 65: values up to 18 map to 0, 19-25 to 1, and so on; anything above 65
    maps to 6.
    """
    upper_bounds = [18, 25, 35, 45, 55, 65]
    # bisect_left counts the bounds strictly below x, so a value equal to a
    # bound stays in the lower bucket.
    return bisect.bisect_left(upper_bounds, x)

In [None]:
# Load the submission table from its feather file.
id_to_submit = pd.read_feather(path='/data/submission.feather')

In [None]:
# Load the training targets from their feather file.
targets = pd.read_feather(path='/data/target_train.feather')

In [None]:
# Base per-user table; every dataset assembled below starts from it and joins on user_id.
df_bace = pd.read_csv(filepath_or_buffer='/data/bace_preprocessing/df_bace.csv')

In [None]:
# Feature tables produced by the base preprocessing step.
# The directory is named once so that relocating the data means editing a single line.
BACE_DIR = '/data/bace_preprocessing'

city_agg = pd.read_csv(f'{BACE_DIR}/city_agg.csv')
city_emb = pd.read_csv(f'{BACE_DIR}/city_emb.csv')
cpe_agg = pd.read_csv(f'{BACE_DIR}/cpe_agg.csv')
date_agg = pd.read_csv(f'{BACE_DIR}/date_agg.csv')
date_emb = pd.read_csv(f'{BACE_DIR}/date_emb.csv')
manuf_agg = pd.read_csv(f'{BACE_DIR}/manuf_agg.csv')
model_agg = pd.read_csv(f'{BACE_DIR}/model_agg.csv')
model_emb = pd.read_csv(f'{BACE_DIR}/model_emb.csv')
os_agg = pd.read_csv(f'{BACE_DIR}/os_agg.csv')
part_of_day = pd.read_csv(f'{BACE_DIR}/part_of_day.csv')
region_agg = pd.read_csv(f'{BACE_DIR}/region_agg.csv')
region_emb = pd.read_csv(f'{BACE_DIR}/region_emb.csv')
url_agg = pd.read_csv(f'{BACE_DIR}/url_agg.csv')

In [None]:
# Price and calendar feature tables (the `_q` variables).
# The directory is named once so that relocating the data means editing a single line.
PRICE_DATE_DIR = '/data/price_and_date_preprocessing'

dayofmonth_agg_q = pd.read_csv(f'{PRICE_DATE_DIR}/dayofmonth_agg.csv')
dayofmonth_count_q = pd.read_csv(f'{PRICE_DATE_DIR}/dayofmonth_count.csv')
dayofweek_agg_q = pd.read_csv(f'{PRICE_DATE_DIR}/dayofweek_agg.csv')
dayofweek_sum_q = pd.read_csv(f'{PRICE_DATE_DIR}/dayofweek_sum.csv')
dayofweek_with_part_of_day_agg_q = pd.read_csv(f'{PRICE_DATE_DIR}/dayofweek_with_part_of_day_agg.csv')
month_agg_q = pd.read_csv(f'{PRICE_DATE_DIR}/month_agg.csv')
month_count_q = pd.read_csv(f'{PRICE_DATE_DIR}/month_count.csv')
price_q = pd.read_csv(f'{PRICE_DATE_DIR}/price.csv')
price_agg_q = pd.read_csv(f'{PRICE_DATE_DIR}/price_agg.csv')

In [None]:
# Load the url_factor feature table.
url_factor = pd.read_csv(filepath_or_buffer='/data/url_factor/url_factor.csv')

In [None]:
# Tables from the cat_encoded directory (the `_0` variables).
# The directory is named once so that relocating the data means editing a single line.
CAT_ENCODED_DIR = '/data/cat_encoded'

city_agg_0 = pd.read_csv(f'{CAT_ENCODED_DIR}/city_agg_0.csv')
cpe_agg_0 = pd.read_csv(f'{CAT_ENCODED_DIR}/cpe_agg_0.csv')
# NOTE(review): variable and file names do not line up for the two "day" tables —
# day_agg_0 is read from part_of_day_agg_0.csv and weekday_agg_0 (below) from
# day_agg_0.csv. Kept exactly as in the original; confirm the mapping is intended.
day_agg_0 = pd.read_csv(f'{CAT_ENCODED_DIR}/part_of_day_agg_0.csv')
manuf_agg_0 = pd.read_csv(f'{CAT_ENCODED_DIR}/manuf_agg_0.csv')
model_agg_0 = pd.read_csv(f'{CAT_ENCODED_DIR}/model_agg_0.csv')
os_agg_0 = pd.read_csv(f'{CAT_ENCODED_DIR}/os_agg_0.csv')
region_agg_0 = pd.read_csv(f'{CAT_ENCODED_DIR}/region_agg_0.csv')
weekday_agg_0 = pd.read_csv(f'{CAT_ENCODED_DIR}/day_agg_0.csv')
day_name_part_agg_0 = pd.read_csv(f'{CAT_ENCODED_DIR}/day_name_part_agg_0.csv')

In [None]:
# Tables from the target_distribution_by_urls directory.
# The directory is named once so that relocating the data means editing a single line.
URL_TARGET_DIR = '/data/target_distribution_by_urls'

# NOTE(review): the first two files carry a `.scv` extension; the names are kept
# unchanged — presumably that is how the upstream step wrote them; confirm.
mean_age_and_sex_ranged = pd.read_csv(f'{URL_TARGET_DIR}/mean_age_and_sex_ranged.scv')
mean_age_and_sex_weighted = pd.read_csv(f'{URL_TARGET_DIR}/mean_age_and_sex_weighted.scv')
mean_age_emb = pd.read_csv(f'{URL_TARGET_DIR}/mean_age_emb.csv')

In [None]:
# Tables from the target_distribution_by_features directory (the `_t` variables).
# The directory is named once so that relocating the data means editing a single line.
FEATURE_TARGET_DIR = '/data/target_distribution_by_features'

activity_per_day_t = pd.read_csv(f'{FEATURE_TARGET_DIR}/activity_per_day.csv')
city_agg_t = pd.read_csv(f'{FEATURE_TARGET_DIR}/city_agg.csv')
city_emb_t = pd.read_csv(f'{FEATURE_TARGET_DIR}/city_emb.csv')
# The cpe_model_name column is dropped from this table right after loading.
model_agg_t = pd.read_csv(f'{FEATURE_TARGET_DIR}/model_agg.csv').drop(columns='cpe_model_name')
price_agg_t = pd.read_csv(f'{FEATURE_TARGET_DIR}/price_agg.csv')

In [None]:
# Build df_danet: inner-join each feature table below onto df_bace by user_id, in
# the listed order. An inner join keeps only user_ids present on both sides.
# The string in each pair is the suffix pandas appends to a right-hand column whose
# name collides with an existing one (left-hand names stay unchanged).
df_danet = df_bace
for _table, _suffix in [
    (city_agg, 'city_agg'),
    (cpe_agg, 'cpe_agg'),
    (date_agg, 'date_agg'),
    (manuf_agg, 'manuf_agg'),
    (model_agg, 'model_agg'),
    (os_agg, 'os_agg'),
    (part_of_day, 'part_of_day'),
    (region_agg, 'region_agg'),
    (url_agg, 'url_agg'),
    (mean_age_emb, 'mean_age_emb'),
    (city_agg_t, 'city_agg_t'),
    (model_agg_t, 'model_agg_t'),
    (url_factor, 'url_factor'),
    (dayofweek_agg_q, 'dayofweek_agg_q'),
    (dayofweek_with_part_of_day_agg_q, 'dayofweek_with_part_of_day_agg_q'),
    (price_q, 'price_q'),
    (city_agg_0, 'city_agg_0'),
    (cpe_agg_0, 'cpe_agg_0'),
    (day_agg_0, 'day_agg_0'),
    (manuf_agg_0, 'manuf_agg_0'),
    (model_agg_0, 'model_agg_0'),
    (os_agg_0, 'os_agg_0'),
    (region_agg_0, 'region_agg_0'),
    (weekday_agg_0, 'weekday_agg_0'),
    (day_name_part_agg_0, 'day_name_part_agg_0'),
]:
    df_danet = df_danet.merge(_table, how='inner', on=['user_id'], suffixes=('', _suffix))
del _table, _suffix  # do not leave loop variables holding on to a feature table
df_danet.shape

(415317, 564)

In [None]:
# Persist the assembled table as CSV, without the row index.
df_danet.to_csv(path_or_buf='/data/df_danet.csv', index=False)

In [None]:
# Drop the saved frame and run a garbage-collection pass before building the next one.
del df_danet
gc.collect()

85

In [None]:
# Build df_age: inner-join each feature table below onto df_bace by user_id, in
# the listed order. An inner join keeps only user_ids present on both sides.
# The string in each pair is the suffix pandas appends to a right-hand column whose
# name collides with an existing one (left-hand names stay unchanged).
df_age = df_bace
for _table, _suffix in [
    (city_agg, '_city_agg'),
    (date_agg, '_date_agg'),
    (part_of_day, '_part_of_day'),
    (url_agg, '_url_agg'),
    (date_emb, '_date_emb'),
    (region_emb, '_region_emb'),
    (mean_age_and_sex_ranged, '_ranged'),
    (mean_age_and_sex_weighted, '_weighted'),
    (mean_age_emb, '_mean_age_emb'),
    (city_agg_t, '_city_agg_t'),
    (model_agg_t, '_model_agg_t'),
    (price_agg_t, '_price_agg_t'),
]:
    df_age = df_age.merge(_table, how='inner', on=['user_id'], suffixes=('', _suffix))
del _table, _suffix  # do not leave loop variables holding on to a feature table
df_age.shape

(415317, 251)

In [None]:
# Persist the assembled table as CSV, without the row index.
df_age.to_csv(path_or_buf='/data/df_age.csv', index=False)

In [None]:
# Drop the saved frame and run a garbage-collection pass before building the next one.
del df_age
gc.collect()

21

In [None]:
# Build df_sex: inner-join each feature table below onto df_bace by user_id, in
# the listed order. An inner join keeps only user_ids present on both sides.
# The string in each pair is the suffix pandas appends to a right-hand column whose
# name collides with an existing one (left-hand names stay unchanged).
df_sex = df_bace
for _table, _suffix in [
    (city_agg, '_city_agg'),
    (date_agg, '_date_agg'),
    (part_of_day, '_part_of_day'),
    (url_agg, '_url_agg'),
    (price_q, '_price'),
    (city_emb, '_city_emb'),
    (date_emb, '_date_emb'),
    (region_emb, '_region_emb'),
    (mean_age_and_sex_ranged, '_ranged'),
    (mean_age_emb, '_mean_age_emb'),
    (city_agg_t, '_city_agg_t'),
    (model_agg_t, '_model_agg_t'),
    (url_factor, '_url_factor'),
    (day_agg_0, '_day_agg_0'),
    (region_agg_0, '_region_agg_0'),
    (day_name_part_agg_0, '_day_name_part_agg_0'),
]:
    df_sex = df_sex.merge(_table, how='inner', on=['user_id'], suffixes=('', _suffix))
del _table, _suffix  # do not leave loop variables holding on to a feature table
df_sex.shape

(415317, 616)

In [None]:
# Persist the assembled table as CSV, without the row index.
df_sex.to_csv(path_or_buf='/data/df_sex.csv', index=False)

In [None]:
# Drop the saved frame and run a garbage-collection pass before building the next one.
del df_sex
gc.collect()

21

In [None]:
# Build df, the widest table: inner-join every loaded feature table onto df_bace by
# user_id, in the listed order. An inner join keeps only user_ids present on both
# sides. The string in each pair is the suffix pandas appends to a right-hand column
# whose name collides with an existing one (left-hand names stay unchanged).
# NOTE(review): the recorded output of this cell has 415316 rows, one fewer than the
# 415317 recorded for the smaller joins above, so a user_id appears to be dropped
# by one of these inner joins — confirm this is acceptable.
df = df_bace
for _table, _suffix in [
    (activity_per_day_t, 'activity_per_day_t'),
    (city_agg_t, 'city_agg_t'),
    (city_emb_t, 'city_emb_t'),
    (model_agg_t, 'model_agg_t'),
    (price_agg_t, 'price_agg_t'),
    (mean_age_and_sex_ranged, 'mean_age_and_sex_ranged'),
    (mean_age_and_sex_weighted, 'mean_age_and_sex_weighted'),
    (mean_age_emb, 'mean_age_emb'),
    (dayofmonth_agg_q, 'dayofmonth_agg_q'),
    (dayofmonth_count_q, 'dayofmonth_count_q'),
    (dayofweek_agg_q, 'dayofweek_agg_q'),
    (dayofweek_sum_q, 'dayofweek_sum_q'),
    (dayofweek_with_part_of_day_agg_q, 'dayofweek_with_part_of_day_agg_q'),
    (month_agg_q, 'month_agg_q'),
    (month_count_q, 'month_count_q'),
    (price_q, 'price_q'),
    (price_agg_q, 'price_agg_q'),
    (city_agg, 'city_agg'),
    (city_emb, 'city_emb'),
    (cpe_agg, 'cpe_agg'),
    (date_agg, 'date_agg'),
    (date_emb, 'date_emb'),
    (manuf_agg, 'manuf_agg'),
    (model_agg, 'model_agg'),
    (model_emb, 'model_emb'),
    (os_agg, 'os_agg'),
    (part_of_day, 'part_of_day'),
    (region_agg, 'region_agg'),
    (region_emb, 'region_emb'),
    (url_agg, 'url_agg'),
    (url_factor, 'url_factor'),
    (city_agg_0, 'city_agg_0'),
    (cpe_agg_0, 'cpe_agg_0'),
    (day_agg_0, 'day_agg_0'),
    (manuf_agg_0, 'manuf_agg_0'),
    (model_agg_0, 'model_agg_0'),
    (os_agg_0, 'os_agg_0'),
    (region_agg_0, 'region_agg_0'),
    (weekday_agg_0, 'weekday_agg_0'),
    (day_name_part_agg_0, 'day_name_part_agg_0'),
]:
    df = df.merge(_table, how='inner', on=['user_id'], suffixes=('', _suffix))
del _table, _suffix  # do not leave loop variables holding on to a feature table
df.shape

(415316, 905)

In [None]:
# Every source table has been merged into `df`; unbind them all (in the same order
# as before) and run a garbage-collection pass.
del (
    df_bace,
    activity_per_day_t,
    city_agg_t,
    city_emb_t,
    model_agg_t,
    price_agg_t,
    mean_age_and_sex_ranged,
    mean_age_and_sex_weighted,
    mean_age_emb,
    dayofmonth_agg_q,
    dayofmonth_count_q,
    dayofweek_agg_q,
    dayofweek_sum_q,
    dayofweek_with_part_of_day_agg_q,
    month_agg_q,
    month_count_q,
    price_q,
    price_agg_q,
    city_agg,
    city_emb,
    cpe_agg,
    date_agg,
    date_emb,
    manuf_agg,
    model_agg,
    model_emb,
    os_agg,
    part_of_day,
    region_agg,
    region_emb,
    url_agg,
    url_factor,
    city_agg_0,
    cpe_agg_0,
    day_agg_0,
    manuf_agg_0,
    model_agg_0,
    os_agg_0,
    region_agg_0,
    weekday_agg_0,
    day_name_part_agg_0,
)
gc.collect()

21

In [None]:
# Feature-importance tables; their 'feat' column is used below to pick columns of `df`.
importance_feature_age = pd.read_csv(filepath_or_buffer='/data/utils/importance_feature_age.csv')
importance_feature_sex = pd.read_csv(filepath_or_buffer='/data/utils/importance_feature_sex.csv')

In [None]:
# Keep only the columns named in importance_feature_sex['feat'], followed by the
# user_id key. The column list is built directly because Series.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
df_sex_1 = df[[*importance_feature_sex['feat'], 'user_id']]

In [None]:
# Persist the selected columns as CSV, without the row index.
df_sex_1.to_csv(path_or_buf='/data/df_sex_1.csv', index=False)

In [None]:
# Drop the saved selection and run a garbage-collection pass.
del df_sex_1
gc.collect()

21

In [None]:
# Keep only the columns named in importance_feature_age['feat'], followed by the
# user_id key. The column list is built directly because Series.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
df_age_1 = df[[*importance_feature_age['feat'], 'user_id']]

In [None]:
# Persist the selected columns as CSV, without the row index.
df_age_1.to_csv(path_or_buf='/data/df_age_1.csv', index=False)

In [None]:
# Drop the saved selection and run a garbage-collection pass.
del df_age_1
gc.collect()

21