In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder
from pathlib import Path
from tqdm.auto import tqdm

# Register tqdm's progress-bar variants (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

In [None]:
# Base feature export (2016-2020, v3); the event-level columns are selected from it further down.
df = pd.read_parquet(
    path='C:/Users/sgopalakrish/Downloads/intellizenz-model-training/data/export_features_2016_2020_v3.parquet.gzip',
)

In [None]:
# Show every column when a DataFrame is rendered (no truncation of wide frames).
pd.options.display.max_columns = None

In [None]:
# Give the suffixed / verbose columns the short names used by the rest of the notebook.
df = df.rename(
    columns={
        'band_x': 'band',
        'vg_state_x': 'vg_state',
        'vg_raum_wo_stopwords': 'venue',
    }
)

## 1. Merge the extracted features into training set and testing set

In [None]:
# Locations of the per-venue / per-band / per-promoter descriptive statistics of the training split.
venue_train_stat_path, band_train_stat_path, promoter_train_stat_path = map(Path, (
    'C:/Users/sgopalakrish/Downloads/intellizenz-model-training/data/export_train_venue_descr_stats_2016_2020_v2.parquet.gzip',
    'C:/Users/sgopalakrish/Downloads/intellizenz-model-training/data/export_train_band_descr_stats_2016_2020_v2.parquet.gzip',
    'C:/Users/sgopalakrish/Downloads/intellizenz-model-training/data/export_train_promoter_descr_stats_2016_2020_v2.parquet.gzip',
))

# Column mapping `promoter_transform_<stat>` -> `promoter_<stat>` for the
# percentile-labelled columns 5%..95% (step 5) followed by count/max/min/mean/std.
promoter_rename = {
    f'promoter_transform_{stat}': f'promoter_{stat}'
    for stat in [f'{pct}%' for pct in range(5, 100, 5)] + ['count', 'max', 'min', 'mean', 'std']
}

# Fail fast and name the missing file(s). Previously the load was skipped silently when a
# file was absent and the `display` below either raised a NameError or showed a stale
# frame left over in the kernel.
missing_train_stats = [
    str(stat_path)
    for stat_path in (venue_train_stat_path, band_train_stat_path, promoter_train_stat_path)
    if not stat_path.exists()
]
if missing_train_stats:
    raise FileNotFoundError(
        f'Missing training descriptive-statistics file(s): {missing_train_stats}'
    )

df_train_venue = pd.read_parquet(venue_train_stat_path)
df_train_band = pd.read_parquet(band_train_stat_path)
df_train_promoter = pd.read_parquet(promoter_train_stat_path)

# Bring the promoter columns in line with the `promoter_<stat>` naming.
df_train_promoter = df_train_promoter.rename(columns=promoter_rename)

# Venue statistics are the left side of both joins, so every venue row is kept;
# band and promoter statistics are attached by 'ID'.
df_train_merged_descr_stat = (
    df_train_venue
    .merge(df_train_band, how='left', on='ID')
    .merge(df_train_promoter, how='left', on='ID')
)

display(df_train_merged_descr_stat.head(5))
print(df_train_merged_descr_stat.shape)

In [None]:
# Locations of the per-venue / per-band / per-promoter descriptive statistics of the testing split.
venue_test_stat_path, band_test_stat_path, promoter_test_stat_path = map(Path, (
    'C:/Users/sgopalakrish/Downloads/intellizenz-model-training/data/export_test_venue_descr_stats_2016_2020_v2.parquet.gzip',
    'C:/Users/sgopalakrish/Downloads/intellizenz-model-training/data/export_test_band_descr_stats_2016_2020_v2.parquet.gzip',
    'C:/Users/sgopalakrish/Downloads/intellizenz-model-training/data/export_test_promoter_descr_stats_2016_2020_v2.parquet.gzip',
))


# Fail fast and name the missing file(s). Previously the load was skipped silently when a
# file was absent and the `display` below either raised a NameError or showed a stale
# frame left over in the kernel.
missing_test_stats = [
    str(stat_path)
    for stat_path in (venue_test_stat_path, band_test_stat_path, promoter_test_stat_path)
    if not stat_path.exists()
]
if missing_test_stats:
    raise FileNotFoundError(
        f'Missing testing descriptive-statistics file(s): {missing_test_stats}'
    )

df_test_venue = pd.read_parquet(venue_test_stat_path)
df_test_band = pd.read_parquet(band_test_stat_path)
df_test_promoter = pd.read_parquet(promoter_test_stat_path)

# Bring the promoter columns in line with the `promoter_<stat>` naming.
df_test_promoter = df_test_promoter.rename(columns=promoter_rename)

# Venue statistics are the left side of both joins, so every venue row is kept;
# band and promoter statistics are attached by 'ID'.
df_test_merged_descr_stat = (
    df_test_venue
    .merge(df_test_band, how='left', on='ID')
    .merge(df_test_promoter, how='left', on='ID')
)

display(df_test_merged_descr_stat.head(5))
print(df_test_merged_descr_stat.shape)

In [None]:
# Event-level columns taken from the base export and joined onto the statistics frames.
features = [
    'vg_inkasso',
    'veranst_segment',
    'vg_state',
    'vg_datum_year',
    'vg_datum_month',
    'vg_datum_day_of_week',
    'vg_datum_season',
    'tarif_bez',
]

# Independent copy so later edits to the selection never touch `df`.
df_features = df.loc[:, features].copy()

In [None]:
# Encoder used in the following cells to turn the `vg_state` labels into integer codes.
le = LabelEncoder()

In [None]:
# Left-join the event-level features onto the training statistics by 'ID'.
# NOTE(review): `df_features` is selected from a column list that does not include 'ID';
# presumably 'ID' is the index name of `df` - confirm, otherwise this merge raises.
df_train = pd.merge(df_train_merged_descr_stat, df_features, how='left', on='ID')

# Fit the encoder on the training states and store the integer codes in place of the labels.
df_train = df_train.assign(vg_state=le.fit_transform(df_train['vg_state']))

print(df_train.shape)
display(df_train.head(n=5))

In [None]:
# Left-join the event-level features onto the testing statistics by 'ID'.
df_test = df_test_merged_descr_stat.merge(df_features, how='left', on='ID')

# Reuse the encoder fitted on the training split so that a given state maps to the same
# integer in both splits. Refitting on the test split (the previous `fit_transform`)
# silently assigns different codes whenever the two splits do not contain exactly the
# same set of states. `transform` raises a ValueError for a state unseen during fitting,
# which makes such a mismatch visible instead of corrupting the feature.
df_test['vg_state'] = le.transform(df_test['vg_state'])

print(df_test.shape)
display(df_test.head(5))

In [None]:
# List every column of the testing frame.
print(list(df_test.columns))

## Replace NaN values with 0

In [None]:
# Replace every missing value by 0 in both splits.
df_train, df_test = (split.fillna(0) for split in (df_train, df_test))

In [None]:
# Per-column flag: True where the training frame still contains at least one NaN.
df_train.isnull().any(axis=0)

In [None]:
# Per-column flag: True where the testing frame still contains at least one NaN.
df_test.isnull().any(axis=0)

## Export the training and testing dataframes, each containing 80 features

In [None]:
# Persist the final training features as a gzip-compressed parquet file.
df_train.to_parquet(
    path='C:/Users/sgopalakrish/Downloads/intellizenz-model-training/data/export_training_features_2016_2020_v2.parquet.gzip',
    compression='gzip',
)

In [None]:
# Persist the final testing features as a gzip-compressed parquet file.
df_test.to_parquet(
    path='C:/Users/sgopalakrish/Downloads/intellizenz-model-training/data/export_testing_features_2016_2020_v2.parquet.gzip',
    compression='gzip',
)

# =======================================================

# 2. Merge all leave-one-out-target-encoding features

In [None]:
# Locations of the per-venue / per-band / per-promoter descriptive statistics (v3, unsplit).
venue_stat_path, band_stat_path, promoter_stat_path = map(Path, (
    'C:/Users/sgopalakrish/Downloads/intellizenz-model-training/data/export_venue_descr_stats_2016_2020_v3.parquet.gzip',
    'C:/Users/sgopalakrish/Downloads/intellizenz-model-training/data/export_band_descr_stats_2016_2020_v3.parquet.gzip',
    'C:/Users/sgopalakrish/Downloads/intellizenz-model-training/data/export_promoter_descr_stats_2016_2020_v3.parquet.gzip',
))


# Fail fast and name the missing file(s). Previously the load was skipped silently when a
# file was absent and the `display` below either raised a NameError or showed a stale
# frame left over in the kernel.
missing_stats = [
    str(stat_path)
    for stat_path in (venue_stat_path, band_stat_path, promoter_stat_path)
    if not stat_path.exists()
]
if missing_stats:
    raise FileNotFoundError(
        f'Missing descriptive-statistics file(s): {missing_stats}'
    )

df_venue = pd.read_parquet(venue_stat_path)
df_band = pd.read_parquet(band_stat_path)
df_promoter = pd.read_parquet(promoter_stat_path)

# NOTE(review): unlike the train/test cells, the promoter columns are not renamed here;
# presumably the v3 promoter export already uses the `promoter_<stat>` names - confirm.

# Venue statistics are the left side of both joins, so every venue row is kept;
# band and promoter statistics are attached by 'ID'.
df_merged_descr_stat = (
    df_venue
    .merge(df_band, how='left', on='ID')
    .merge(df_promoter, how='left', on='ID')
)

display(df_merged_descr_stat.head(5))
print(df_merged_descr_stat.shape)

In [None]:
# Event-level columns taken from the base export and joined onto the merged statistics.
features = [
    'vg_inkasso',
    'veranst_segment',
    'vg_state',
    'vg_datum_year',
    'vg_datum_month',
    'vg_datum_day_of_week',
    'vg_datum_season',
    'tarif_bez',
]

# Independent copy so later edits to the selection never touch `df`.
df_features = df.loc[:, features].copy()

In [None]:
# Fresh encoder used in the next cell to turn the `vg_state` labels into integer codes.
le = LabelEncoder()

In [None]:
# Left-join the event-level features onto the merged statistics by 'ID'.
# NOTE(review): `df_features` is selected from a column list that does not include 'ID';
# presumably 'ID' is the index name of `df` - confirm, otherwise this merge raises.
df_merged = pd.merge(df_merged_descr_stat, df_features, how='left', on='ID')

# Replace the state labels by the integer codes of a freshly fitted encoder.
df_merged = df_merged.assign(vg_state=le.fit_transform(df_merged['vg_state']))

# `tarif_bez` is intentionally left un-encoded; the earlier attempt is kept disabled:
# df_merged['tarif_bez'] = le.fit_transform(df_merged['tarif_bez'])

print(df_merged.shape)
display(df_merged.head(n=5))

In [None]:
# List every column of the merged frame, followed by the number of columns.
print(list(df_merged.columns))
print(df_merged.shape[1])

## Replace NaN values with 0

In [None]:
# Replace every missing value in the merged frame by 0.
df_merged = df_merged.fillna(value=0)

## Check for NaN values in all columns

In [None]:
# Per-column flag: True where the merged frame still contains at least one NaN.
df_merged.isnull().any(axis=0)

## Export the merged dataframe features, each containing 80 features

In [None]:
# Export WITHOUT the tarif_bez column.
# The frame used to be written unchanged, so this file still contained `tarif_bez` and was
# identical to the v5 export. The column is now dropped on a temporary copy (`df_merged`
# itself is untouched); `errors='ignore'` keeps the cell re-runnable if it is already absent.
df_merged.drop(columns=['tarif_bez'], errors='ignore').to_parquet('C:/Users/sgopalakrish/Downloads/intellizenz-model-training/data/export_features_2016_2020_v4.parquet.gzip',compression='gzip')

In [None]:
# Export of the merged frame as-is, i.e. including the tarif_bez column.
df_merged.to_parquet(
    path='C:/Users/sgopalakrish/Downloads/intellizenz-model-training/data/export_features_2016_2020_v5.parquet.gzip',
    compression='gzip',
)