In [None]:
# Load IPython's autoreload extension; mode 2 re-imports all loaded modules
# before each cell execution, so edits to imported code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from tqdm.auto import tqdm
import pickle
from pathlib import Path
import numpy as np
from math import floor
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import matplotlib.pyplot as plt

# Show every column when displaying wide frames. The fully qualified option
# name is required: the short pattern 'max_columns' also matches
# 'styler.render.max_columns' on newer pandas and raises an OptionError.
pd.set_option('display.max_columns', None)
# Register `progress_apply` / `progress_map` on pandas objects.
tqdm.pandas()

# Import Data

In [None]:
data_path = Path('./data/export_rech_2016_2020.pkl.bz2')

# Fail fast with a clear message; previously a missing file only surfaced as a
# NameError on `df_raw` in the lines below.
if not data_path.exists():
    raise FileNotFoundError('Raw export not found: {}'.format(data_path))

# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
df_raw = pd.read_pickle(data_path)

display(df_raw.head(10))
# `info()` prints its report itself and returns None, so it is not wrapped in display().
df_raw.info()

In [None]:
# Columns carried over from the raw export into the feature frame.
feature_source_columns = [
    'VG_INKASSO',
    'VERANST_SEGMENT',
    'VG_ORT',
    'VG_RAUM',
    'VG_DATUM_VON',
    'KAPELLE_NAME',
    'VERANST_NAME',
    'VERANST_PLZ',
    'TARIF_NR',
    'LOCATION',
    'BAND',
    'PROMOTER',
]

# Independent copy, so columns added to `df_featurize` later never touch `df_raw`.
df_featurize = df_raw.loc[:, feature_source_columns].copy()

display(df_featurize.head(10))

# Add Descriptive Statistics

In [None]:
# Cache file holding the complete descriptive-statistics frame.
data_path_descr_stat = Path('./data/export_descr_stat_2016_2020.pkl.bz2')

# Partial exports, one per grouping variable; used when the combined cache is absent.
data_path_loc, data_path_band, data_path_promoter = (
    Path('./data/export_descr_stat_location_2016_2020.pkl.zip'),
    Path('./data/export_descr_stat_band_2016_2020.pkl.zip'),
    Path('./data/export_descr_stat_promoter_2016_2020.pkl.zip'),
)

# Grouping columns for which per-group statistics are derived.
stat_var = ['LOCATION', 'BAND', 'PROMOTER']
# Populated during recomputation: grouping column -> Series of VG_INKASSO lists per group.
df_stat = dict()


def get_descr_stat(row):
    """Compute leave-one-out descriptive statistics of VG_INKASSO for one row.

    For every grouping column in the module-level ``stat_var`` the amounts of
    all rows sharing this row's group value are looked up in the module-level
    ``df_stat``, the row's own amount is removed once, and the remainder is
    summarised with ``Series.describe`` using the module-level ``percentiles``.

    Parameters
    ----------
    row : pandas.Series
        One row of the feature frame; must contain 'VG_INKASSO' and every
        column listed in ``stat_var``.

    Returns
    -------
    pandas.Series
        Statistics labelled '<VAR>_<stat>'. Variables whose value is missing
        or whose group holds a single row contribute no entries; the result
        is an empty float64 Series when no variable contributes.
    """
    def descr_stat(row, var):
        result = pd.Series(dtype='float64')

        if not pd.isnull(row[var]):
            # Copy the list: `remove` mutates it and the original is shared by
            # every row of the same group.
            inkasso = df_stat[var].loc[row[var]].copy()
            if len(inkasso) > 1:
                # Leave-one-out: drop this row's own amount (first match only).
                # NOTE(review): list.remove raises ValueError when no element
                # compares equal (e.g. NaN amounts) -- confirm VG_INKASSO has none.
                inkasso.remove(row['VG_INKASSO'])
                result = pd.Series(inkasso).describe(percentiles=percentiles)
                result = result.add_prefix('{}_'.format(var))

        result.name = row.name
        return result

    # `Series.append` was removed in pandas 2.0: collect the pieces and
    # concatenate once. Empty pieces are skipped so they cannot affect the
    # resulting dtype.
    parts = [descr_stat(row, v) for v in stat_var]
    non_empty_parts = [part for part in parts if not part.empty]
    if not non_empty_parts:
        return pd.Series(dtype='float64')

    descr_stat_result = pd.concat(non_empty_parts)
    # The previous append-based result was always unnamed; keep that.
    descr_stat_result.name = None
    return descr_stat_result


# Three-level lookup: combined cache -> per-variable partial exports -> full recomputation.
if data_path_descr_stat.exists():
    # NOTE: unpickling can execute arbitrary code -- only load trusted cache files.
    df_featurize_descr_stat = pd.read_pickle(data_path_descr_stat)

elif data_path_loc.exists() and data_path_band.exists() and data_path_promoter.exists():
    df_loc = pd.read_pickle(data_path_loc)
    df_band = pd.read_pickle(data_path_band)
    df_promoter = pd.read_pickle(data_path_promoter)

    # Outer joins keep rows that appear in only some of the partial exports.
    df_featurize_descr_stat = (
        df_loc
        .merge(df_band, how='outer', on='ID')
        .merge(df_promoter, how='outer', on='ID')
    )
else:
    # Per grouping column: group value -> list of all VG_INKASSO amounts in that group.
    for v in tqdm(stat_var):
        df_stat[v] = df_featurize.groupby(v)['VG_INKASSO'].apply(list)

    # 0.05, 0.10, ..., 0.95 (the 0 and 1 endpoints are dropped; describe() already
    # reports min and max).
    percentiles = [round(x, 2) for x in np.linspace(0, 1, 21)[1:-1].tolist()]

    df_featurize_descr_stat = df_featurize.progress_apply(get_descr_stat, axis=1)
    # Write to the same path the cache check above reads, instead of a second
    # hard-coded literal that could drift out of sync.
    df_featurize_descr_stat.to_pickle(data_path_descr_stat, protocol=4)

display(df_featurize_descr_stat)

# NLTK Analysis

In [None]:
# Resources needed by the cells below: the stop-word corpus plus the Punkt
# tokenizer models that `word_tokenize` loads ('punkt' on older NLTK releases,
# 'punkt_tab' on recent ones). Without them a fresh environment fails with a
# LookupError at the first tokenization.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

## VG_RAUM

In [None]:
# Missing venues must become '' -- calling `astype(str)` first (as before) turns
# NaN into the literal string 'nan', which `fillna` then no longer catches and
# which ends up as a token in the frequency counts.
vg_raum = df_raw['VG_RAUM']
df_raw['VG_RAUM_clean'] = (
    vg_raum.astype(str)
    .where(vg_raum.notna(), '')
    # Collapse every run of non-word characters into a single space.
    .map(lambda x: re.sub(r'\W+', ' ', x))
)
df_raw['VG_RAUM_tokenized'] = df_raw['VG_RAUM_clean'].apply(word_tokenize, language='german')

# Flatten the per-row token lists into one token stream.
flat_list = [item for sublist in df_raw['VG_RAUM_tokenized'].tolist() for item in sublist]

german_stop_words = stopwords.words('german')
# Set for O(1) membership tests on the (large) token stream.
german_stop_set = set(german_stop_words)

text_wo_stop_words = [word for word in flat_list if word.lower() not in german_stop_set]

# Frequent but uninformative venue tokens (cities, abbreviations, fillers).
# Kept under its own name: rebinding `stopwords` would shadow the NLTK corpus
# reader and break `stopwords.words(...)` when this cell is run again.
vg_raum_stopwords = {
    'ST', 'FREIEN', 'BAD', 'HAUS', 'EV', 'BERLIN', 'KATH', 'S', 'HOF', 'ALTE', 'MITTE', 'LUTH', 'MUENCHEN',
    'IRISH', 'MUSIK', 'KULTUR', 'FUER', 'EVANG', 'MARITIM', 'KOELN', 'U', 'TURN', 'E', 'STUTTGART', 'ALTES',
    'A', 'GASTES', 'THE', 'EUROPA', 'HANNOVER', 'STADT', 'BADEN', 'NUERNBERG', 'HAMBURG', 'NEUE',
    'EVANGELISCHE', 'LEIPZIG', 'B', 'DRESDEN', 'BREMEN', 'PETER', '1', 'ALTER'
}

text_wo_stop_words_2 = [word for word in text_wo_stop_words if word not in vg_raum_stopwords]

# Fold every token containing 'KIRCHE' into the single token 'KIRCHE'.
text_wo_stop_words_3 = ['KIRCHE' if 'KIRCHE' in word else word for word in text_wo_stop_words_2]

# Preview only: displaying the complete token list floods the notebook output.
display(text_wo_stop_words_3[:50])

In [None]:
# Token frequency distribution over the cleaned venue tokens.
fdist = FreqDist(text_wo_stop_words_3)
# Printing a FreqDist gives its short text summary.
print(fdist)

# Plain (non-cumulative) counts of the 30 most frequent tokens.
n_plotted_vg_raum_terms = 30
fdist.plot(n_plotted_vg_raum_terms, cumulative=False)
plt.show()

In [None]:
# The 30 most frequent venue tokens become binary indicator columns.
most_common_terms_vg_raum = [token for token, _count in fdist.most_common(30)]
df_featurize_vg_raum_keywords = df_raw['VG_RAUM_clean'].to_frame()

# 1 when the term occurs anywhere in the cleaned venue text (substring match), else 0.
vg_raum_text = df_featurize_vg_raum_keywords['VG_RAUM_clean']
for term in most_common_terms_vg_raum:
    df_featurize_vg_raum_keywords['location_' + term] = vg_raum_text.apply(
        lambda text, needle=term: int(needle in text)
    )

display(df_featurize_vg_raum_keywords.head(10))

In [None]:
# Persist the venue keyword indicators as a pickle (protocol 4).
vg_raum_keywords_path = './data/export_vg_raum_keywords_2016_2020.pkl.bz2'
df_featurize_vg_raum_keywords.to_pickle(vg_raum_keywords_path, protocol=4)

## Kapelle Name

In [None]:
# Imported here so that `stopwords` is guaranteed to be the NLTK corpus reader
# in this cell, regardless of any earlier rebinding of the name.
from nltk.corpus import stopwords

# Missing band names become ''; every run of non-word characters becomes one space.
df_raw['KAPELLE_NAME_clean'] = df_raw['KAPELLE_NAME'].fillna('').map(lambda x: re.sub(r'\W+', ' ', x))
df_raw['KAPELLE_NAME_tokenized'] = df_raw['KAPELLE_NAME_clean'].apply(word_tokenize, language='german')

# Flatten the per-row token lists into one token stream.
flat_list_kapelle = [item for sublist in df_raw['KAPELLE_NAME_tokenized'].tolist() for item in sublist]

german_stop_words = stopwords.words('german')
english_stop_words = stopwords.words('english')
# Sets for O(1) membership tests on the (large) token stream.
german_stop_set = set(german_stop_words)
english_stop_set = set(english_stop_words)

kapelle_wo_stop_words = [word for word in flat_list_kapelle if word.lower() not in german_stop_set]

kapelle_wo_stop_words_1 = [word for word in kapelle_wo_stop_words if word.lower() not in english_stop_set]

# Frequent but uninformative band-name tokens (first names, initials, fillers).
# Kept under its own name: rebinding `stopwords` would shadow the NLTK corpus
# reader and break `stopwords.words(...)` on any later use.
kapelle_stopwords = {
    'BAD', 'E', 'DE', 'BIG', 'B', 'PETER', 'MARTIN', 'MICHAEL', 'MUSIC', 'MUSIK', 'THOMAS', 'BLUE', 'STEFAN',
    'N', 'ANDREAS', 'V', 'FRANK', 'U', 'KLAUS', 'TOM', 'G', 'PAUL', 'HANS', 'CHRISTIAN', 'WOLFGANG'
}

kapelle_wo_stop_words_2 = [word for word in kapelle_wo_stop_words_1 if word not in kapelle_stopwords]

# Preview only: displaying the complete token list floods the notebook output.
display(kapelle_wo_stop_words_2[:50])

In [None]:
# Token frequency distribution over the cleaned band-name tokens.
fdist_kapelle = FreqDist(kapelle_wo_stop_words_2)
# Printing a FreqDist gives its short text summary.
print(fdist_kapelle)

# Plain (non-cumulative) counts of the 30 most frequent tokens.
n_plotted_kapelle_terms = 30
fdist_kapelle.plot(n_plotted_kapelle_terms, cumulative=False)
plt.show()

In [None]:
# The 30 most frequent band-name tokens become binary indicator columns.
most_common_terms_kapelle = [token for token, _count in fdist_kapelle.most_common(30)]
df_featurize_kapelle_keywords = df_raw['KAPELLE_NAME_clean'].to_frame()

# 1 when the term occurs anywhere in the cleaned band name (substring match), else 0.
kapelle_text = df_featurize_kapelle_keywords['KAPELLE_NAME_clean']
for term in most_common_terms_kapelle:
    df_featurize_kapelle_keywords['band_' + term] = kapelle_text.apply(
        lambda text, needle=term: int(needle in text)
    )

display(df_featurize_kapelle_keywords.head(20))

In [None]:
# Persist the band keyword indicators as a pickle (protocol 4).
kapelle_keywords_path = './data/export_kapelle_keywords_2016_2020.pkl.bz2'
df_featurize_kapelle_keywords.to_pickle(kapelle_keywords_path, protocol=4)

In [None]:
# `info()` prints its summary itself and returns None, so wrapping it in
# display() only added a stray `None` to the cell output.
df_featurize_kapelle_keywords.info()
df_featurize.info()

# Time

In [None]:
# Calendar features derived from the datetime column `VG_DATUM_VON`
# (pandas `dayofweek`: Monday = 0 ... Sunday = 6).
vg_datum_von = df_featurize['VG_DATUM_VON'].dt
for feature_name, dt_attribute in (
    ('VG_DATUM_YEAR', 'year'),
    ('VG_DATUM_MONTH', 'month'),
    ('VG_DATUM_DAY_OF_WEEK', 'dayofweek'),
):
    df_featurize[feature_name] = getattr(vg_datum_von, dt_attribute)

display(df_featurize.head(5))

# Merge all Features

In [None]:
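# Optional shortcut: uncomment to load the previously exported keyword features
# instead of recomputing them in the NLTK cells above.
#df_featurize_vg_raum_keywords=pd.read_pickle('./data/export_vg_raum_keywords_2016_2020.pkl.bz2')
#df_featurize_kapelle_keywords=pd.read_pickle('./data/export_kapelle_keywords_2016_2020.pkl.bz2')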

In [None]:
# Left-join each feature block onto the base frame via the shared 'ID' key,
# so every row of `df_featurize` is kept.
df_featurized_merged = df_featurize
for feature_block in (
    df_featurize_descr_stat,
    df_featurize_vg_raum_keywords,
    df_featurize_kapelle_keywords,
):
    df_featurized_merged = df_featurized_merged.merge(feature_block, how='left', on='ID')

In [None]:
# Normalise the column names, then drop the helper text columns that came in
# with the keyword frames.
df_featurized_merged = (
    df_featurized_merged
    # 1) lower-case every column name
    .rename(columns=str.lower)
    # 2) give the key business columns English names
    .rename(columns={"vg_inkasso": "amount", "veranst_segment": "amount_segment", "tarif_nr": "tariff_id"})
    # 3) remove the cleaned-text columns; only their indicator features are kept
    .drop(columns=['vg_raum_clean', 'kapelle_name_clean'])
)

In [None]:
# Print columns, dtypes and non-null counts of the merged feature frame.
df_featurized_merged.info()

In [None]:
# Persist the final merged feature frame as a pickle (protocol 4).
features_export_path = './data/export_features_2016_2020.pkl.bz2'
df_featurized_merged.to_pickle(features_export_path, protocol=4)