In [None]:
import pandas as pd
import pickle
import seaborn as sns
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
from collections import Counter

import numpy as np
import spacy
from string import punctuation
from faker import Faker

In [None]:
# Load the cleansed raw export.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only runs
# on the author's machine -- consider a configurable data directory.
# NOTE(review): unpickling executes code embedded in the file; only load pickles
# from a trusted source.
df = pd.read_pickle('C:/Saravana/Data/Raw/export-cleansed-4851f054c66579780503d70880731802.pkl.bz2')

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
display(df.head())

In [None]:
df.VERANST_SEGMENT.unique()

In [None]:
len(df.index)

In [None]:
# Convert all the event segments > 3 to a event segment - 4
df.loc[df['VERANST_SEGMENT'] > 3, 'VERANST_SEGMENT'] = 4

In [None]:
# Keep the rows whose (capped) event segment is at most 4. Rows with a missing
# segment are dropped because NaN fails the comparison.
# NOTE(review): the stated intent is segment 2 (0-50 euros), segment 3 (50-100 euros)
# and segment 4 (>100 euros); `<= 4` would also keep segments below 2 if the data
# contains any -- confirm.
# .copy() makes allclaims_df an independent frame, so the column assignments further
# down neither raise SettingWithCopyWarning nor risk writing into a temporary.
allclaims_df = df.query('VERANST_SEGMENT <= 4').copy()
len(allclaims_df.index)

### Unique values

In [None]:
print('Unique bands: ',len(df.BAND.unique()))
print('Unique locations: ',len(df.VG_ORT.unique()))
print('Unique venues: ',len(df.VG_RAUM.unique()))
print('Unique Promoters: ',len(df.PROMOTER.unique()))
print('Unique tariffs: ',len(df.TARIF_BEZ.unique()))

### Visualize missing data

In [None]:
# Attribute names and the number of empty values found for each of them (raw row
# counts, listed in the same order as the attribute names).
missing_attributes = [
    "imp_id", "gj", "import", "mufo_referenz_n", "barcode_nr", "veranst_segment", "rekla_jn", "vg_datum_von", "vg_ort",
    "vg_raum", "nutzfall", "nutzfall_raum", "musikleiter_name", "kapelle_name", "tarif_nr", "tarif_bez",
    "nutzfall_nr", "vg_inkasso", "inkasso_netto", "inkasso_brutto", "veranst_geschaeftszeichen", "veranst_name",
    "veranst_strasse", "veranst_plz", "veranst_ort", "nutzliznehm_geschaeftszeichen", "nutzliznehm_name", "nutzliznehm_vorname",
    "nutzliznehm_strasse", "nutzliznehm_plz", "nutzliznehm_ort", "location", "band", "promoter",
]
missing_row_counts = [
    0, 0, 0, 3205313, 0, 0, 0, 0, 222, 155, 0, 151, 2104917, 1477728, 0, 0, 0, 0, 2933130, 2933130, 0, 121, 20393,
    1225, 697, 0, 1038426, 2941456, 1052479, 1038887, 1038426, 151, 1453088, 121,
]
assert len(missing_attributes) == len(missing_row_counts), "every attribute needs exactly one count"

missing_data_in_df = pd.DataFrame(
    {
        "Attributes": missing_attributes,
        # The column (and the plot legend) promise millions, so convert the raw row
        # counts accordingly; previously the raw counts were shown under that label.
        "missing (in million(s))": [count / 1_000_000 for count in missing_row_counts],
    }
)

In [None]:
_, ax = plt.subplots()
missing_data_in_df.plot.bar(x="Attributes", ax= ax)
ax.legend(["Empty values(in million(s))"])

## Class distribution plot  

In [None]:
fig, ax = plt.subplots()

# fads = allclaims_df.copy()
# fads.rename(columns = {'test':'TEST'}, inplace = True)

allclaims_df.VERANST_SEGMENT.value_counts().plot(ax=ax, kind='bar', xlabel='Event Segments', ylabel='Frequency')
# ax.legend(["2.0 - Class 0(0-50€)", "3.0 - Class 1(50€-100€)", "4.0 - Class 2(>100€)"])
# ax.legend(["3.0 - Class 1(50€-100€)"])
# ax.legend(["4.0 - Class 2(>100€)"])
ax.legend(['Frequency of classes'])
plt.show()

## Remove missing values

Remove missing values from VG_ORT

In [None]:
# Keep only the claims that have an event place.
has_event_place = allclaims_df['VG_ORT'].notna()
allclaims_df = allclaims_df[has_event_place]
len(allclaims_df.index)

Remove missing values from BAND

In [None]:
# Keep only the claims that have a band.
has_band = allclaims_df['BAND'].notna()
allclaims_df = allclaims_df[has_band]
len(allclaims_df.index)

Remove missing values from PROMOTER

In [None]:
# Keep only the claims that have a promoter.
has_promoter = allclaims_df['PROMOTER'].notna()
allclaims_df = allclaims_df[has_promoter]
len(allclaims_df.index)

In [None]:
print('Unique locations: ',len(allclaims_df.VG_ORT.unique()))

In [None]:
# IMP_ID                                 
# GJ                               GeschäftsJahr      
# IMPORT                           Distribution where the data came from      
# MUFO_REFERENZ_N                  
# BARCODE_NR                             
# VERANST_SEGMENT                  Event segment
# REKLA_JN                         Reclamation 

# VG_DATUM_VON                     Event Date
# VG_ORT                           Event place
# VG_RAUM                          Event room

# NUTZFALL                         Usage of event  
# NUTZFALL_RAUM                    Usage Room or Music hall where the music is used

# MUSIKLEITER_NAME                 Music Leader name
# KAPELLE_NAME                     Chapel name

# TARIF_NR                         Tariff Number
# TARIF_BEZ                        Tariff Bez
# NUTZFALL_NR                      Usecase Number

# VG_INKASSO                       Event collection
# INKASSO_NETTO                    Net-Collection
# INKASSO_BRUTTO                   Gross-Collection
# VERANST_GESCHAEFTSZEICHEN        Event business sign or mark 
# VERANST_NAME                     Event name
# VERANST_STRASSE                  Event street
# VERANST_PLZ                      Event post code
# VERANST_ORT                      Event place

# # Nutzungs Lizenznehmer - someone who has got the license(Usage licence) for the music works
# NUTZLIZNEHM_GESCHAEFTSZEICHEN    
# NUTZLIZNEHM_NAME                 name of person who aquired usage license
# NUTZLIZNEHM_VORNAME              surname of person who aquired usage license
# NUTZLIZNEHM_STRASSE              street of person who aquired usage license
# NUTZLIZNEHM_PLZ                  postcode of person who aquired usage license
# NUTZLIZNEHM_ORT                  place of person who aquired usage license

# LOCATION                         location - VG_ORT + VG_RAUM
# BAND                             band - KAPELLE_NAME and empty rows of kapelle name is filled with MUSIKLEITER_NAME
# PROMOTER                         promoter - VERANST_NAME + VERANST_PLZ

# VG_RAUM = 'IM FREIEN'(In Outside) or Name of the City


## Scraping German Cities and States from Wikipedia

In [None]:
import requests
from bs4 import BeautifulSoup as bs

In [None]:
# Load the webpage. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops the run on an HTTP error instead of
# silently parsing an error page.
r = requests.get("https://en.wikipedia.org/wiki/List_of_cities_and_towns_in_Germany", timeout=30)
r.raise_for_status()

# Convert the webpage content to a soup object. Naming the parser explicitly avoids
# bs4's "no parser was explicitly specified" warning and keeps the parse result
# independent of which optional parsers happen to be installed.
webpage = bs(r.content, "html.parser")

Remove special characters and convert to uppercase

In [None]:
def remove_special_char_convert_to_uppercase(text):
    """Upper-case ``text`` and spell out the German special characters.

    'ß' is written as 'SS'; the umlauts Ä, Ö and Ü (given in either case) are
    written as 'AE', 'OE' and 'UE'.
    """
    transliterated = text.replace('ß', 'SS').upper()
    for umlaut, ascii_form in (('Ä', 'AE'), ('Ö', 'OE'), ('Ü', 'UE')):
        transliterated = transliterated.replace(umlaut, ascii_form)
    return transliterated

In [None]:
c_names = ["City", "State"]

# The page lists a few big cities under their English names; VG_ORT uses the
# German spelling.
english_to_german_city = {
    'MUNICH': 'MUENCHEN',
    'COLOGNE': 'KOELN',
    'NUREMBERG': 'NUERNBERG',
    'HANOVER': 'HANNOVER',
}

# Collected [city, state] pairs.
l = []

# Run the selector once and look at (at most) the first 25 tables. The loop variable
# is deliberately NOT called `list`: rebinding that name hides the builtin for every
# later cell of the notebook.
city_tables = webpage.select("table")
for table in city_tables[:25]:
    for city_item in table.select("li"):
        # get_text(',') joins the text fragments of the entry with commas:
        # the first fragment is the city, the second one the state in parentheses.
        each = remove_special_char_convert_to_uppercase(city_item.get_text(','))
        texts = each.split(',')
        if len(texts) < 2:
            # Entry without a state part -- nothing to match against later.
            continue
        city = english_to_german_city.get(texts[0], texts[0])
        state = texts[1].replace('(','').replace(')','')
        l.append([city , state])

# Show the first scraped pair as a quick sanity check.
if l:
    print(l[0])

In [None]:
cities_states_de_df = pd.DataFrame(l, columns=c_names)
cities_states_de_df.head()

In [None]:
cities_states_de_df[cities_states_de_df['City']=='KOELN']

VG_ORT

In [None]:
# Some VG_ORT values have the format "<cityname>, <specific region name>".
# Keep only the city name, i.e. everything before the first comma. Values without a
# comma are left as they are.
allPlaces = allclaims_df['VG_ORT'].astype(str).str.split(',').str[0].tolist()

allclaims_df['VG_ORT'] = pd.Categorical(allPlaces, ordered = False)

# Sanity check: no place with a comma may be left (expected output: 0)
len(allclaims_df[allclaims_df['VG_ORT'].str.contains(',') == True])

Matching the state for each city or town (location) in VG_ORT

In [None]:
allclaims_df['vg_state'] = ''

In [None]:
# Build the city -> state lookup once and apply it in a single pass over VG_ORT,
# instead of one full-column comparison per scraped city.
# When a city appears several times the last entry wins, exactly as it did with the
# sequential assignments.
city_to_state = {each.City: each.State.strip() for each in cities_states_de_df.itertuples()}

matched_state = allclaims_df['VG_ORT'].astype(object).map(city_to_state)
has_match = matched_state.notna()
# Rows without a match keep their current vg_state.
allclaims_df.loc[has_match, 'vg_state'] = matched_state[has_match]


Load the Municipality and states in Germany that are scraped from WikiData

In [None]:
wiki_municipality_df = pd.read_csv('C:/Saravana/Projects/Intellizenz/intellizenz-model-training/data/submunicipality_municipality_district_state_germany_v3.csv')
wiki_municipality_df[['stateLabel','municipalityLabel','submunicipalityLabel']]

In [None]:
# Build one lookup (municipality or sub-municipality -> state) and apply it in a
# single pass, instead of two full-column comparisons per CSV row.
# Later CSV rows overwrite earlier ones, as the sequential assignments did.
place_to_state = {}
for each in wiki_municipality_df.itertuples():
    state = remove_special_char_convert_to_uppercase(each.stateLabel).strip()
    municipality = remove_special_char_convert_to_uppercase(each.municipalityLabel)
    place_to_state[municipality] = state
    # A missing sub-municipality is NaN. It used to be str()-ed into the literal key
    # 'NAN', which would label any place of that name with an arbitrary state.
    if pd.notna(each.submunicipalityLabel):
        submunicipality = remove_special_char_convert_to_uppercase(str(each.submunicipalityLabel))
        place_to_state[submunicipality] = state

matched_state = allclaims_df['VG_ORT'].astype(object).map(place_to_state)
has_match = matched_state.notna()
# Rows without a match keep the state found in the previous step (or '').
allclaims_df.loc[has_match, 'vg_state'] = matched_state[has_match]

In [None]:
# Checking the empty values in vg_state column
empty_vg_state = allclaims_df[allclaims_df['vg_state'] == '']
print(len(empty_vg_state))
print('Unique : {}'.format(len(empty_vg_state.VG_ORT.unique())))

## Remove empty values from vg_state

In [None]:
allclaims_df = allclaims_df[allclaims_df['vg_state'] != '']
len(allclaims_df.index)

In [None]:
flat_list_vg_state = [item for item in allclaims_df['vg_state']]
fdist_vg_state = FreqDist(flat_list_vg_state)
fdist_vg_state.plot(cumulative=False)

## Visualize the percentage of events in different states


In [None]:
# Share of events per state, in percent.
# rename_axis() names the index explicitly, so reset_index() yields a 'State' column
# on every pandas version. Since pandas 2.0 the index of value_counts() carries the
# column name, so there is no 'index' column to rename afterwards and the plot below
# would not find 'State'.
state_categories_df = (
    allclaims_df['vg_state']
    .value_counts(normalize=True)
    .mul(100)
    .rename('Percent')
    .rename_axis('State')
    .reset_index()
)

g = sns.catplot(x='State', y='Percent', kind='bar', data=state_categories_df)
g.ax.tick_params(axis='x', rotation=90)

# Write the percentage on top of every bar
for p in g.ax.patches:
    txt = str(p.get_height().round(1)) + '%'
    txt_x = p.get_x()
    txt_y = p.get_height()
    g.ax.text(txt_x,txt_y,txt)

## Retrieve the states where the events take place - Featurize vg_state

In [None]:
all_states = [i[0] for i in fdist_vg_state.items()]
df_featurize_vg_state = pd.DataFrame(allclaims_df['vg_state'])

# One indicator column per state. The comparison has to be exact: a substring test
# also flags every row whose state merely contains the term (a name such as
# 'SACHSEN' is contained in 'NIEDERSACHSEN'), which sets several indicators at once.
for term in all_states :
    df_featurize_vg_state['state_'+term.lower()] = (df_featurize_vg_state['vg_state'] == term).astype(int)

display(df_featurize_vg_state.head(10))

## Featurize TARIF_BEZ


In [None]:
print('Unique Tarif: ', len(allclaims_df['TARIF_BEZ'].unique()))

In [None]:
flat_list_tarif_desc = [item for item in allclaims_df['TARIF_BEZ']]
fdist_tarif_desc = FreqDist(flat_list_tarif_desc)

In [None]:
# The 30 most frequent tariff descriptions
most_common_terms_tarif = [i[0] for i in fdist_tarif_desc.most_common(30)]
df_featurize_tarif = pd.DataFrame(allclaims_df['TARIF_BEZ'])

# One indicator column per frequent tariff. Compare for equality: with a substring
# test a tariff whose description is contained in a longer description would also be
# flagged for the longer one. Equality also matches the exact membership test used
# for 'grouped_tarif'.
for term in most_common_terms_tarif :
    df_featurize_tarif['tarif_'+term.lower()] = (df_featurize_tarif['TARIF_BEZ'] == term).astype(int)

display(df_featurize_tarif.head(10))

In [None]:
fdist_tarif_desc.plot(30,cumulative=False)

### Group the frequent tariffs and the rest of the tariffs to 'Other'

In [None]:
# Label every claim by whether its tariff is one of the most frequent ones
is_frequent_tarif = df_featurize_tarif['TARIF_BEZ'].isin(most_common_terms_tarif)
df_featurize_tarif['grouped_tarif'] = np.where(is_frequent_tarif, 'Selected Tariffs', 'Other')

# Share of each group in percent. rename_axis() guarantees a 'Tarif' column after
# reset_index() on every pandas version (pandas >= 2.0 no longer produces an 'index'
# column here, so renaming it afterwards silently did nothing).
tarif_categories_df = (
    df_featurize_tarif['grouped_tarif']
    .value_counts(normalize=True)
    .mul(100)
    .rename('Percent')
    .rename_axis('Tarif')
    .reset_index()
)

g = sns.catplot(x='Tarif', y='Percent', kind='bar', data=tarif_categories_df)

# Write the percentage on top of every bar
for p in g.ax.patches:
    txt = str(p.get_height().round(1)) + '%'
    txt_x = p.get_x()
    txt_y = p.get_height()
    g.ax.text(txt_x,txt_y,txt)

## Anonymization

In [None]:
faker = Faker(['de_DE'])
Faker.seed(0)

### Visualize anonymized Band

In [None]:
# 'anonymized_band' is only created in the "Featurize BAND" section further down, so
# on a fresh kernel (Restart & Run All) this cell used to stop with a KeyError.
# Plot only when the column already exists; the same plot is drawn again right after
# the column has been created.
if 'anonymized_band' in allclaims_df.columns:
    flat_list_fake_bands = [item for item in allclaims_df['anonymized_band']]
    fdist_fake_band = FreqDist(flat_list_fake_bands)
    fdist_fake_band.plot(30,cumulative=False)

## Featurize Promoter

### Clean Promoter values; promoter_clean

In [None]:
df_featurize_promoter = pd.DataFrame(allclaims_df['PROMOTER'])
df_featurize_promoter["promoter_clean"] = allclaims_df['VERANST_NAME']

### Remove repeating company types from 'Promoter'. 
#### Collect the unique promoter entries, most frequent first, until their cumulative count reaches the threshold (here 1742197, the length of the dataframe). Set the remaining promoter entries to 'Other'

In [None]:
def _collapse_duplicate_org_names(name, substrings):
    """Return ``name`` with every company type that occurs exactly twice reduced to its first occurrence."""
    if not isinstance(name, str) or not substrings:
        return name
    name = name.replace('K. D. OE. R', 'K.D.OE.R')
    for substr in substrings:
        if name.count(substr) == 2:
            # Keep everything up to and including the first occurrence and delete the
            # repetition behind it. (The former 'X' placeholder trick rewrote every
            # real 'X' of the name into the company type.)
            keep_until = name.find(substr) + len(substr)
            name = (name[:keep_until] + name[keep_until:].replace(substr, '')).strip()
    return name


def remove_repeating_org_names(column, threshold=0.75, substrings=(), return_categories_list=True):
    """Reduce a categorical column to its most frequent, de-duplicated categories.

    The unique values of ``column`` are visited in descending order of frequency
    until their cumulative count reaches ``threshold * len(column)``. Each visited
    value is cleaned (a company type from ``substrings`` that is repeated twice is
    kept once) and becomes a category; every other value is replaced by 'Other'.

    Parameters
    ----------
    column : pandas.Series
        The values to reduce.
    threshold : float
        Share of the rows (0..1) that the kept categories have to cover.
    substrings : sequence of str
        Company types (e.g. 'GMBH') whose duplication is removed. With an empty
        sequence the names are kept untouched.
    return_categories_list : bool
        When True, the list of kept categories is returned as well.

    Returns
    -------
    pandas.Series, or (pandas.Series, list)
        The transformed column, plus the category list (ending with 'Other')
        when ``return_categories_list`` is True.
    """
    # Number of rows the kept categories must cover
    threshold_value = int(threshold * len(column))
    # raw value -> cleaned category, for the values that are kept
    cleaned_by_raw = {}
    categories_list = []
    covered_rows = 0

    # most_common() already yields (value, frequency); the frequency is used directly
    # instead of rebuilding a dict of all counts on every iteration.
    for raw_name, frequency in Counter(column).most_common():
        covered_rows += frequency
        category_name = _collapse_duplicate_org_names(raw_name, substrings)
        cleaned_by_raw[raw_name] = category_name
        categories_list.append(category_name)
        # Stop as soon as the kept categories cover the requested share of rows
        if covered_rows >= threshold_value:
            break

    categories_list.append('Other')

    # Map every kept raw value to its cleaned category, everything else to 'Other'.
    # Looking the raw value up in the dict (instead of searching the cleaned list)
    # keeps promoters whose name was cleaned from falling into 'Other', and is a
    # constant-time lookup per row.
    new_column = column.map(lambda x: cleaned_by_raw.get(x, 'Other'))

    if return_categories_list:
        return new_column, categories_list
    return new_column

In [None]:
org_strings = ['GMBH & CO. KG', 'E.V', 'GMBH', 'GBR', 'K.D.OE.R', 'OHG']
tran_new_column,new_cat_list=remove_repeating_org_names(df_featurize_promoter['promoter_clean'],threshold=1.00,substrings=org_strings,return_categories_list=True)

In [None]:
# Summary of the transformed column. Without the call parentheses only the bound
# method object was displayed, not the statistics.
tran_new_column.describe()

### Transform the Promoter column to remove repetitive company name

In [None]:
# Remove the repeating organisation (company type) substrings from Organizer/Promoter
def transform_promoter(x, substrings):
    """Collapse company types that are repeated twice in a promoter name.

    Parameters
    ----------
    x : str
        The promoter name. Non-string values (e.g. NaN) are returned unchanged.
    substrings : sequence of str
        Company types to check, e.g. ['GMBH', 'E.V'].

    Returns
    -------
    str
        The name with 'K. D. OE. R' normalised to 'K.D.OE.R' and, for every
        substring that occurs exactly twice, only its first occurrence kept.
    """
    if not isinstance(x, str):
        return x
    str_value = x.replace('K. D. OE. R','K.D.OE.R')

    # Check every substring. The result is returned after the loop: returning from
    # inside it meant that only the first substring was ever looked at, and that an
    # empty list produced None.
    for subs in substrings:
        if str_value.count(subs) == 2:
            # Keep the text up to and including the first occurrence and delete the
            # repetition behind it. (A placeholder character such as 'X' must not be
            # used here: names that contain a real 'X' get corrupted.)
            keep_until = str_value.find(subs) + len(subs)
            str_value = (str_value[:keep_until] + str_value[keep_until:].replace(subs, '')).strip()
    return str_value

In [None]:
orgs = ['GMBH & CO. KG', 'E.V', 'GMBH', 'GBR', 'K.D.OE.R', 'OHG']

# Collapse the duplicated company types one after the other, in the order given in
# `orgs`. One company type is passed per call, so the result does not depend on how
# transform_promoter walks through a longer list.
# The column is transformed directly (Series.apply) rather than row by row over the
# whole frame, and the six copy-pasted statements became one loop.
promoter_transform = df_featurize_promoter['promoter_clean']
for org in orgs:
    promoter_transform = promoter_transform.apply(lambda name: transform_promoter(name, substrings=[org]))

df_featurize_promoter['promoter_transform'] = promoter_transform

In [None]:
allclaims_df['promoter_transform'] = df_featurize_promoter['promoter_transform']

## Anonymize Promoter

In [None]:
dict_promoters = {promoter: faker.unique.company() for promoter in allclaims_df['promoter_transform'].unique()}
allclaims_df['anonymized_promoter'] = allclaims_df['promoter_transform'].map(dict_promoters)

### Check the distribution of promoter and Anonymized promoter

In [None]:
print('Actual unique promoters: ', len(allclaims_df['promoter_transform'].unique()))
print('Anonymized unique promoters: ', len(allclaims_df['anonymized_promoter'].unique()))

### Visualize Anonymized Promoter

In [None]:
flat_list_fake_promoters = [item for item in allclaims_df['anonymized_promoter']]
fdist_fake_promoter = FreqDist(flat_list_fake_promoters)
fdist_fake_promoter.plot(30,cumulative=False)

### Visualize Actual Promoter

In [None]:
# flat_list_actual_promoters = [item for item in df_featurize_promoter['promoter_transform']]
# fdist_actual_promoter = FreqDist(flat_list_actual_promoters)
# fdist_actual_promoter.plot(30,cumulative=False)

### Group the frequent promoters and rest of the promoters to 'Other'

In [None]:
# most_common_terms_promoter = [i[0] for i in fdist_actual_promoter.most_common(30)]
most_common_terms_promoter = [i[0] for i in fdist_fake_promoter.most_common(30)]

# The frequent terms above are *anonymized* names, so the membership test has to run
# on the anonymized column as well. Testing the real names ('promoter_transform')
# against them never matches and puts every claim into 'Other'.
is_frequent_promoter = allclaims_df['anonymized_promoter'].isin(most_common_terms_promoter)
df_featurize_promoter['grouped_promoter'] = np.where(is_frequent_promoter, 'Selected Promoters', 'Other')

# Share of each group in percent. rename_axis() guarantees a 'Promoter' column after
# reset_index() on every pandas version (pandas >= 2.0 no longer produces an 'index'
# column here, so renaming it afterwards silently did nothing).
promoter_categories_df = (
    df_featurize_promoter['grouped_promoter']
    .value_counts(normalize=True)
    .mul(100)
    .rename('Percent')
    .rename_axis('Promoter')
    .reset_index()
)

g = sns.catplot(x='Promoter', y='Percent', kind='bar', data=promoter_categories_df)

# Write the percentage on top of every bar
for p in g.ax.patches:
    txt = str(p.get_height().round(1)) + '%'
    txt_x = p.get_x()
    txt_y = p.get_height()
    g.ax.text(txt_x,txt_y,txt)

In [None]:
# condition = df_featurize_promoter['promoter_clean']!='Other'

In [None]:
df_featurize_promoter['anonymized_promoter'] = allclaims_df['anonymized_promoter'] 
df_featurize_promoter

In [None]:
# flat_list_clean_promoter = [item for item in df_featurize_promoter[condition]['promoter_clean']]
flat_list_clean_promoter = [item for item in df_featurize_promoter['anonymized_promoter']]
fdist_clean_promoter = FreqDist(flat_list_clean_promoter)
# The 30 most frequent anonymized promoters
most_common_terms_clean_promoter = [i[0] for i in fdist_clean_promoter.most_common(30)]

# One indicator column per frequent promoter. Compare for equality: with a substring
# test a promoter whose name is contained in a longer company name would also be
# flagged for that other company.
for term in most_common_terms_clean_promoter :
    df_featurize_promoter['promoter_'+term.lower()] = (df_featurize_promoter['anonymized_promoter'] == term).astype(int)

display(df_featurize_promoter.head(5))

In [None]:
fdist_clean_promoter.plot(30,cumulative=False)

In [None]:
# google maps API
# how big the city is?
# how close the location to the city centre?

## Venues

In [None]:
# Load the German and the English spaCy pipelines.
# NOTE(review): both paths point into one user's site-packages directory and pin an
# exact model version, so this cell only runs on that machine. Loading the installed
# packages by name (spacy.load('de_core_news_sm') / spacy.load('en_core_web_sm'))
# would presumably be equivalent -- confirm the model versions before switching.
nlp = spacy.load('C:/Users/sgopalakrish/Miniconda3/Lib/site-packages/de_core_news_sm/de_core_news_sm-3.4.0/')
nlp_en = spacy.load('C:/Users/sgopalakrish/Miniconda3/Lib/site-packages/en_core_web_sm/en_core_web_sm-3.4.1')

In [None]:
german_stop_words = nlp.Defaults.stop_words
english_stop_words = nlp_en.Defaults.stop_words

In [None]:
def remove_special_chars_from(stopwords):
    """Return the stop words upper-cased, with German special characters spelled out.

    'ß' is written as 'SS' and the umlauts Ä, Ö, Ü (given in either case) as
    'AE', 'OE', 'UE'. The result is a list in the iteration order of ``stopwords``.
    """
    umlaut_table = str.maketrans({'Ä': 'AE', 'Ö': 'OE', 'Ü': 'UE'})
    return [
        word.replace('ß', 'SS').upper().translate(umlaut_table)
        for word in stopwords
    ]

In [None]:
cleaned_de_stopwords = remove_special_chars_from(german_stop_words)

additional_stopwords = [
    'ST', 'FREIEN', 'BAD', 'HAUS', 'EV', 'BERLIN', 'KATH', 'S', 'HOF', 'ALTE', 'MITTE', 'LUTH', 'MUENCHEN',
    'IRISH', 'MUSIK', 'KULTUR', 'FUER', 'EVANG', 'MARITIM', 'KOELN', 'U', 'TURN', 'E', 'STUTTGART', 'ALTES',
    'A', 'GASTES', 'THE', 'EUROPA', 'HANNOVER', 'STADT', 'BADEN', 'NUERNBERG', 'HAMBURG', 'NEUE',
    'EVANGELISCHE', 'LEIPZIG', 'B', 'DRESDEN', 'BREMEN', 'PETER', '1','ALTER', 'AM', 'DIE', 'DER',
    'DAS', 'DES', 'DEN', 'DEM', 'EIN', 'EINER', 'EINEM', 'EINES', 'EINE',
    'MEIN', 'MEINER', 'MEINES', 'MEINEM', 'MEINE', 'UND'
]


In [None]:
# Copy the English stop words into a list so they can be concatenated with the other lists
en_stop_words = [stop_word for stop_word in english_stop_words]

In [None]:
all_stopwords = cleaned_de_stopwords + additional_stopwords + en_stop_words
all_stopwords = [x.lower() for x in all_stopwords]
# all_stopwords = list(map(lambda x: x.lower(), all_stopwords))

In [None]:
# Find all the categories of VG_RAUM
# Missing venues have to be replaced BEFORE the conversion to str: astype(str) turns
# NaN into the text 'nan', after which fillna('') has nothing left to fill and the
# word 'nan' ends up in the venue keywords.
raum = allclaims_df['VG_RAUM']
allclaims_df['VG_RAUM_clean'] = (
    raum.astype(object)
    .where(raum.notna(), '')
    .astype(str)
    # Replace every run of non-word characters (punctuation, whitespace) by one space
    .map(lambda x: re.sub(r'\W+', ' ', x))
)

In [None]:
# Remove the stop words from every venue name.
# Two things kept this from ever happening before: str.replace() returns a new
# string that was thrown away, and the stop words are lower-case while the venue
# names are matched in upper case below. The removal is now done token by token and
# case-insensitively, so stop words are dropped as whole words and never cut out of
# the middle of another word.
stopword_set = {stopword.lower() for stopword in all_stopwords}

room_list_wo_stopwords = []
for each_room in allclaims_df['VG_RAUM_clean'].tolist():
    if 'KIRCHE' in each_room:
        # Every church venue is collapsed into the single category 'KIRCHE'
        room_list_wo_stopwords.append('KIRCHE')
        continue
    kept_tokens = [token for token in each_room.split() if token.lower() not in stopword_set]
    room_list_wo_stopwords.append(' '.join(kept_tokens))

In [None]:
room_list_wo_stopwords = [x.lower() for x in room_list_wo_stopwords]

In [None]:
allclaims_df['VG_RAUM_WO_STOPWORDS'] = room_list_wo_stopwords

In [None]:
# Stemming - removes suffixes and prefixes from word roots, 
# Lemmatization - maps the remaining root forms (which may not always be proper words) back to an actual word that occurs in natural language.

Get keywords using spacy

In [None]:
def get_keywords(text):
    """Return the texts of the tokens in ``text`` that count as keywords.

    ``text`` is iterated token by token (a spaCy document is passed in by the
    caller). A token is kept when it is neither found in ``all_stopwords`` nor in
    ``punctuation`` and its ``pos_`` is a proper noun, noun or verb.
    """
    pos_tag = ['PROPN', 'NOUN', 'VERB']
    return [
        token.text
        for token in text
        if not (token.text in all_stopwords or token.text in punctuation)
        and token.pos_ in pos_tag
    ]

In [None]:
# Feed the cleaned venue names lazily into the spaCy pipeline ...
raum_cleaned_df = (row.VG_RAUM_WO_STOPWORDS for row in allclaims_df.itertuples())

# ... and extract the keyword list of every parsed venue name
vg_raum_keywords = [get_keywords(parsed_venue) for parsed_venue in nlp.pipe(raum_cleaned_df)]

In [None]:
allclaims_df['VG_RAUM_KEYWORDS'] = vg_raum_keywords

## Featurize VG_RAUM

In [None]:
flat_list_vg_raum_keywords = [item for sublist in allclaims_df['VG_RAUM_KEYWORDS'] for item in sublist]
fdist_vg_raum_keywords = FreqDist(flat_list_vg_raum_keywords)
most_common_terms_vg_raum = [i[0] for i in fdist_vg_raum_keywords.most_common(30)]

In [None]:
df_featurize_vg_raum_keywords = pd.DataFrame(allclaims_df['VG_RAUM_KEYWORDS'])

for term in most_common_terms_vg_raum :
    df_featurize_vg_raum_keywords['place_'+term]=df_featurize_vg_raum_keywords['VG_RAUM_KEYWORDS'].apply(lambda x: 1 if term in x else 0)

display(df_featurize_vg_raum_keywords.head(10))

In [None]:
fdist_vg_raum_keywords.plot(30,cumulative=False)

In [None]:
display(allclaims_df.head())

### Group the frequent venues and rest of the venues to 'Other'

In [None]:
# most_common_terms_vg_raum = [i[0] for i in fdist_vg_raum_keywords.most_common(200)]

list_venue_wo_stopwords = [item for item in allclaims_df['VG_RAUM_WO_STOPWORDS']]
fdist_venue = FreqDist(list_venue_wo_stopwords)
most_common_terms_vg_raum = [i[0] for i in fdist_venue.most_common(2000)]

In [None]:
# Label every claim by whether its venue is one of the most frequent venues.
# isin() does the membership test with a hash lookup; searching the list of frequent
# venues once per row is very slow on the full data set.
is_frequent_venue = allclaims_df['VG_RAUM_WO_STOPWORDS'].isin(most_common_terms_vg_raum)
df_featurize_vg_raum_keywords['grouped_venue'] = np.where(is_frequent_venue, 'Selected Venues', 'Other')

# Share of each group in percent. rename_axis() guarantees a 'Venue' column after
# reset_index() on every pandas version (pandas >= 2.0 no longer produces an 'index'
# column here, so renaming it afterwards silently did nothing).
venue_categories_df = (
    df_featurize_vg_raum_keywords['grouped_venue']
    .value_counts(normalize=True)
    .mul(100)
    .rename('Percent')
    .rename_axis('Venue')
    .reset_index()
)

g = sns.catplot(x='Venue', y='Percent', kind='bar', data=venue_categories_df)

# Write the percentage on top of every bar
for p in g.ax.patches:
    txt = str(p.get_height().round(1)) + '%'
    txt_x = p.get_x()
    txt_y = p.get_height()
    g.ax.text(txt_x,txt_y,txt)

## Featurize BAND

### Anonymized BAND

In [None]:
# Give every distinct band its own fake (company) name and map the column through it.
# NOTE(review): faker.unique raises UniquenessException once it cannot produce a new
# value; the promoter anonymization draws from the same faker.unique.company() pool.
# Confirm that the de_DE provider offers enough distinct names for all bands.
dict_bands = {band: faker.unique.company() for band in allclaims_df['BAND'].unique()}
allclaims_df['anonymized_band'] = allclaims_df['BAND'].map(dict_bands)

### Check the distribution of actual band & anonymized band

In [None]:
print('Actual unique bands: ', len(allclaims_df['BAND'].unique()))
print('Anonymized unique bands: ', len(allclaims_df['anonymized_band'].unique()))

### Visualize anonymized Band

In [None]:
flat_list_fake_bands = [item for item in allclaims_df['anonymized_band']]
fdist_fake_band = FreqDist(flat_list_fake_bands)
fdist_fake_band.plot(30,cumulative=False)

In [None]:
# flat_list_band = [item for item in allclaims_df['BAND']]
flat_list_band = [item for item in allclaims_df['anonymized_band']]
fdist_band_desc = FreqDist(flat_list_band)
most_common_terms_band = [i[0] for i in fdist_band_desc.most_common(30)] 
# df_featurize_band = pd.DataFrame(allclaims_df['BAND'])
df_featurize_band = pd.DataFrame(allclaims_df['anonymized_band'])

In [None]:
# One indicator column per frequent (anonymized) band. Compare for equality: with a
# substring test a band whose fake name is contained in a longer fake name would
# also be flagged for that other band.
for term in most_common_terms_band :
    df_featurize_band['band_'+term.lower()] = (df_featurize_band['anonymized_band'] == term).astype(int)

display(df_featurize_band.head(10))

In [None]:
fdist_band_desc.plot(30,cumulative=False)

### Group the frequent bands and rest of the bands to 'Other'

In [None]:
most_common_terms_band = [i[0] for i in fdist_band_desc.most_common(50)] 

In [None]:
# most_common_terms_band holds *anonymized* band names, so the membership test has to
# run on the anonymized column. Testing the real names ('BAND') against it never
# matches and puts every claim into 'Other'.
is_frequent_band = allclaims_df['anonymized_band'].isin(most_common_terms_band)
df_featurize_band['grouped_band'] = np.where(is_frequent_band, 'Selected Bands', 'Other')

# Share of each group in percent. rename_axis() guarantees a 'Band' column after
# reset_index() on every pandas version (pandas >= 2.0 no longer produces an 'index'
# column here, so renaming it afterwards silently did nothing).
band_categories_df = (
    df_featurize_band['grouped_band']
    .value_counts(normalize=True)
    .mul(100)
    .rename('Percent')
    .rename_axis('Band')
    .reset_index()
)

g = sns.catplot(x='Band', y='Percent', kind='bar', data=band_categories_df)

# Write the percentage on top of every bar
for p in g.ax.patches:
    txt = str(p.get_height().round(1)) + '%'
    txt_x = p.get_x()
    txt_y = p.get_height()
    g.ax.text(txt_x,txt_y,txt)

## VERANST_SEGMENT

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Convert the VERANST_SEGMENT values from float to Int
allclaims_df.VERANST_SEGMENT = allclaims_df['VERANST_SEGMENT'].astype(int)

In [None]:
le = LabelEncoder()
allclaims_df['VERANST_SEGMENT'] = le.fit_transform(allclaims_df['VERANST_SEGMENT'])

## VG_DATUM

In [None]:
# Label-encoded year, month and weekday of the event date
allclaims_df['VG_DATUM_YEAR'] = le.fit_transform(allclaims_df['VG_DATUM_VON'].dt.year)
allclaims_df['VG_DATUM_MONTH'] = le.fit_transform(allclaims_df['VG_DATUM_VON'].dt.month)
allclaims_df['VG_DATUM_DAY_OF_WEEK']= le.fit_transform(allclaims_df['VG_DATUM_VON'].dt.dayofweek)

# Season per calendar month, January first:
# 0 = Dec-Feb, 1 = Mar-May, 2 = Jun-Aug, 3 = Sep-Nov
seasons = [0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 0]
# Look the season up with the real calendar month (1..12). The label-encoded month
# only equals month-1 while all twelve months occur in the data; as soon as one
# month is missing the codes shift and the seasons would be wrong.
claim_season_list = [seasons[int(month) - 1] for month in allclaims_df['VG_DATUM_VON'].dt.month]

allclaims_df['VG_DATUM_SEASON'] = claim_season_list

## Correlation Heatmap

### Define a new dataframe corr_df1, that contains only venue details

In [None]:
# Replace the list of keywords with the most frequent keyword
def transform_venue(x, venue):
    """Return ``venue`` when it is one of the keywords in ``x``, otherwise ``x``.

    Parameters
    ----------
    x : str or list of str
        The venue keywords, either space-separated in one string or as a list.
    venue : str
        The keyword to look for, e.g. 'halle'.

    The keyword has to match a whole word. With a substring test 'halle' also
    matches 'festhalle', 'stadthalle', 'turnhalle', ... and 'zelt' matches
    'festzelt', so the more specific venue types could never survive.
    NOTE(review): compounds such as 'stadtkirche' are therefore no longer mapped to
    'kirche' -- confirm that this is the wanted grouping.
    """
    venue_keywords = x
    words = venue_keywords.split() if isinstance(venue_keywords, str) else venue_keywords

    if venue in words:
        return venue
    return venue_keywords

In [None]:
corr_df1 = allclaims_df[['VG_RAUM_KEYWORDS']].copy()

In [None]:
# Join the keywords of every venue into one space-separated string
# (an empty keyword list gives an empty string).
venue_strs = [' '.join(keywords) for keywords in corr_df1['VG_RAUM_KEYWORDS'].values]

In [None]:
corr_df1['venue'] = venue_strs

In [None]:
# Venue types that a venue is reduced to, in the order in which they are applied
venue_types = [
    'kirche', 'hotel', 'cafe', 'theater', 'club', 'halle', 'gaststaette', 'festhalle', 'kulturzentrum', 'festzelt',
    'schloss', 'pub', 'stadthalle', 'park', 'gasthof', 'kabarett', 'arena', 'schlachthof', 'wandelhalle', 'turnhalle',
    'buergerhaus', 'museum', 'rathaus', 'staatsbad', 'zelt', 'jazz', 'forum', 'gymnasium', 'schule', 'sporthalle',
]

# Apply the venue types one after the other; each step works on the result of the
# previous one. The column is transformed directly (Series.apply) rather than row by
# row over the whole frame, and the thirty copy-pasted statements became one loop.
venue_clean = corr_df1['venue']
for venue_type in venue_types:
    venue_clean = venue_clean.apply(lambda venue_str: transform_venue(venue_str, venue_type))

corr_df1['venue_clean'] = venue_clean

### Extract the selected features into a new dataframe corr_df, to visualize the correlation heatmap

In [None]:
feat_columns = ['VG_DATUM_YEAR','VG_DATUM_MONTH', 'VG_DATUM_DAY_OF_WEEK', 'VG_DATUM_SEASON', 'VERANST_SEGMENT', 
            'BAND', 'TARIF_BEZ', 'VG_RAUM_KEYWORDS', 'promoter_transform', 'vg_state']


corr_df = allclaims_df[feat_columns].copy()
corr_df['BAND'] = le.fit_transform(corr_df['BAND'])
corr_df['TARIF_BEZ'] = le.fit_transform(corr_df['TARIF_BEZ'])
corr_df['promoter_transform'] = le.fit_transform(corr_df['promoter_transform'])
corr_df['vg_state'] = le.fit_transform(corr_df['vg_state'])

corr_df['venue'] = corr_df1['venue_clean'].values 
corr_df['venue'] = le.fit_transform(corr_df['venue'])

feat_columns.append('venue')

In [None]:
corr_df=corr_df.rename(str.lower, axis='columns')

In [None]:
# Correlation between features
lower_feat_columns = [each.lower() for each in feat_columns]

# corr() is only defined for numeric data, but 'vg_raum_keywords' holds Python lists.
# Older pandas versions dropped such columns silently, newer ones raise; selecting the
# numeric columns explicitly gives the same matrix on every version.
numeric_features = corr_df[lower_feat_columns].select_dtypes(include='number')
corr = numeric_features.corr()
f, ax = plt.subplots(figsize=(8,6))

# Configure a custom diverging colormap
# cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw onto the axes created above
sns.heatmap(corr, annot=True, ax=ax)

## Merge all the features

In [None]:
# Attach the indicator columns of every feature frame to the claims.
# NOTE(review): none of the df_featurize_* frames gets an 'ID' column in this
# notebook; the merges presumably rely on the index being named 'ID' -- confirm.
# NOTE(review): columns that exist on both sides (e.g. 'TARIF_BEZ', 'vg_state',
# 'promoter_transform') come out of merge() twice with _x/_y suffixes.
allclaims_feature_df = allclaims_df.merge(df_featurize_vg_raum_keywords, how='left', on='ID')
allclaims_feature_df = allclaims_feature_df.merge(df_featurize_tarif, how='left', on='ID')
allclaims_feature_df = allclaims_feature_df.merge(df_featurize_vg_state, how='left', on='ID')
allclaims_feature_df = allclaims_feature_df.merge(df_featurize_band, how='left', on='ID')
allclaims_feature_df = allclaims_feature_df.merge(df_featurize_promoter, how='left', on='ID')

In [None]:
allclaims_feature_df=allclaims_feature_df.rename(str.lower, axis='columns')

In [None]:
display(allclaims_feature_df.head(5))

In [None]:
#allclaims_feature_df.to_pickle('./data/export_features_2016_2020_v1.pkl.bz2', protocol=4)

In [None]:
# Writing this frame to Parquet failed on the column below with:
# ArrowTypeError: ("Expected bytes, got a 'float' object", 'Conversion failed for column nutzliznehm_plz with type object')
# The column is therefore dropped before export. errors='ignore' keeps the cell
# re-runnable: a second execution no longer raises KeyError once the column is gone.
allclaims_feature_df = allclaims_feature_df.drop(columns='nutzliznehm_plz', errors='ignore')

In [None]:
# allclaims_feature_df.to_parquet('./data/export_features_2016_2020_v2.parquet.gzip',compression='gzip')

In [None]:
# Persist the merged, lower-cased feature frame as gzip-compressed Parquet.
# The target path is relative, so it resolves against the kernel's working directory.
allclaims_feature_df.to_parquet('./data/export_features_2016_2020_v3.parquet.gzip',compression='gzip')

In [None]:
##############################################

## Get Essential anonymized columns

In [None]:
# Columns carried over into the anonymized export (taken from allclaims_df).
anony_columns = [
    'VG_RAUM_KEYWORDS', 'VG_RAUM_WO_STOPWORDS', 'VG_RAUM',
    'anonymized_band', 'anonymized_promoter',
    'TARIF_BEZ', 'vg_state', 'VG_ORT',
    'VG_DATUM_SEASON', 'VG_DATUM_MONTH', 'VG_DATUM_DAY_OF_WEEK', 'VG_DATUM_YEAR',
    'VG_DATUM_VON', 'VERANST_SEGMENT', 'VG_INKASSO',
]
anonymized_essential_df = allclaims_df.loc[:, anony_columns].copy()

In [None]:
# Lower-case every column label before export.
anonymized_essential_df = anonymized_essential_df.rename(columns=str.lower)

In [None]:
# Persist the lower-cased essential columns as gzip-compressed Parquet.
# NOTE(review): a later cell writes anony_essential_df to this same relative path,
# replacing this file with a different column set - confirm that is intended.
anonymized_essential_df.to_parquet('./data/export_anonymized_features_2016_2020.parquet.gzip',compression='gzip')

In [None]:
# Load the anonymized feature export, then show its first rows and its shape.
# NOTE(review): absolute, user-specific Windows path - this cell only runs on a
# machine that has this directory. The file name matches the relative
# './data/...' export written by an earlier cell; confirm both point at the same file.
anony_df = pd.read_parquet('C:/Users/sgopalakrish/Downloads/intellizenz-model-training/data/export_anonymized_features_2016_2020.parquet.gzip')
display(anony_df.head())
print(anony_df.shape)

## Extract venue keywords

In [None]:
def get_raum_keywords(text, stop_words=None):
    """Return the keyword tokens of a parsed venue name.

    Parameters
    ----------
    text : iterable of tokens
        Each token must expose ``.text`` and ``.pos_`` (for example a document
        yielded by ``nlp.pipe``).
    stop_words : container of str, optional
        Token texts to discard. Defaults to the notebook-level ``all_stopwords``
        so existing one-argument calls behave as before.

    Returns
    -------
    list of str
        Texts of the tokens, in input order, that are neither stop words nor
        punctuation and whose ``pos_`` tag is in the keep-list below.
    """
    if stop_words is None:
        stop_words = all_stopwords

    # NOTE(review): 'ORG' and 'PER' look like entity labels rather than coarse
    # part-of-speech tags, so they probably never match token.pos_ - confirm
    # whether entity types were meant to be checked separately.
    keep_pos = {'PROPN', 'ADV', 'NOUN', 'VERB', 'ORG', 'PER'}

    keywords = []
    for token in text:
        # `in punctuation` is a substring test against string.punctuation.
        if token.text in stop_words or token.text in punctuation:
            continue
        if token.pos_ in keep_pos:
            keywords.append(token.text)
    return keywords

In [None]:
# Lazily stream the stop-word-stripped venue strings, row by row, into the pipeline.
raum_clean = (row.vg_raum_wo_stopwords for row in anony_df.itertuples())

# One keyword list per row, in the same order as anony_df.
raum_keywords = [get_raum_keywords(parsed_doc) for parsed_doc in nlp.pipe(raum_clean)]


In [None]:
# Reference list of venue keywords (a comment only; nothing below reads it):
# ['kirche', 'hotel', 'cafe', 'theater', 'club', 'halle', 'gaststaette', 'festhalle', 'kulturzentrum', 'festzelt', 
# 'schloss', 'pub', 'stadthalle', 'park', 'gasthof', 'kabarett', 'arena', 'schlachthof', 'wandelhalle', 'turnhalle', 
# 'buergerhaus', 'museum', 'rathaus', 'staatsbad', 'zelt', 'jazz', 'forum', 'gymnasium', 'schule', 'sporthalle']

# Attach the per-row keyword lists as a new column. raum_keywords must hold
# exactly one list per row of anony_df, in row order.
anony_df['vg_raum_new_keywords'] = raum_keywords

In [None]:
# Flatten the per-row keyword lists into one long list of terms.
flat_list_raum_keywords = []
for row_keywords in anony_df['vg_raum_new_keywords']:
    flat_list_raum_keywords.extend(row_keywords)

# Rank the terms by frequency and keep the (up to) 100 most common ones.
fdist_raum_keywordss = FreqDist(flat_list_raum_keywords)
most_common_terms_raum = [term for term, _count in fdist_raum_keywordss.most_common(100)]
print(most_common_terms_raum)

In [None]:
# Replace the list of keywords with the most frequent keyword
def transform_venue_with_count(x, venue):
    """Collapse a venue string to ``venue`` when it contains that keyword.

    Parameters
    ----------
    x : str or missing value
        Space-separated venue keywords. Non-string values (``None``, NaN) are
        returned unchanged instead of raising ``AttributeError``.
    venue : str
        Keyword to look for. It must match a whole space-separated token;
        substrings do not count.

    Returns
    -------
    ``venue`` if it is one of the tokens of ``x``, otherwise ``x`` unchanged.
    """
    # Missing values have no .split(); pass them through untouched.
    if not isinstance(x, str):
        return x
    # Split on a single space (not arbitrary whitespace), as before.
    if venue in x.split(' '):
        return venue
    return x

In [None]:
# Define a new 'venue': the row's keywords joined by single spaces.
# An empty keyword list joins to '', so no separate empty-list branch is needed.
venue_keyword_strs = [
    ' '.join(row_keywords) for row_keywords in anony_df['vg_raum_new_keywords'].values
]

anony_df['venue'] = venue_keyword_strs

In [None]:
# Rows whose keyword extraction came back empty ('') fall back to the
# stop-word-stripped venue text.
empty_venue_condition = anony_df['venue'].eq('')
anony_df.loc[empty_venue_condition, 'venue'] = anony_df.loc[empty_venue_condition, 'vg_raum_wo_stopwords']

In [None]:
# Alias for the frequency-ranked keyword list (at most 100 terms). No copy is
# made, so both names refer to the same list object.
hundred_common_venues = most_common_terms_raum

In [None]:
# Common venues without specific location: 87 generic venue-type keywords.
# The anonymization step keeps a venue_clean value unchanged when it appears in
# this list and replaces every other value with a generated street name.
eighty_seven_common_venues = ['kirche', 'hotel', 'cafe', 'theater', 'club', 'halle', 
'kulturzentrum', 'gaststaette', 'buergerhaus', 'festhalle', 'stadthalle', 'festzelt', 
'schloss', 'pub', 'restaurant', 'gasthaus', 'bar', 'kurhaus', 
'kulturhaus', 'kabarett', 'rathaus', 'arena', 'gasthof', 'park', 
'wandelhalle', 'schlachthof', 'turnhalle', 'staatsbad', 'zelt', 'mehrzweckhalle', 
'museum', 'zentrum', 'forum', 'gymnasium', 'gemeindehalle', 
'saal', 'grundschule', 'sporthalle', 'musikschule', 'schule', 'gemeindehaus', 
'circus', 'jugendzentrum', 'haus des gastes', 'dorfgemeinschaftshaus', 'fabrik', 'landgasthof', 
'live', 'gop', 'messe', 'hofbraeuhaus', 'schuetzenhaus', 'bereich', 'jazzclub', 
'jazz', 'buergerzentrum', 'burg', 'center', 'sachs', 'galerie', 
'kurpark', 'weingut', 'wirtshaus', 'werk', 'brauhaus', 'freizeitzentrum', 
'bistro', 'feierwerk', 'backstage', 'ms', 'stadttheater', 'kulturcafe', 'buergersaal', 'sport', 'villa', 
'bahnhof', 'sportheim', 'brauerei', 'kulturfabrik', 'jugend', 
'kantine', 'music', 'parkhotel', 'scheune', 'woerishofen', 'markthalle', 'knust']

In [None]:
# Collapse each venue string to the highest-priority common keyword it contains.
#
# Priority order is 'kirche' first, then the frequency-ranked terms from
# position 1 onwards. This reproduces the previous behaviour, which seeded the
# column with 'kirche' and then looped over range(1, len(...)).
# NOTE(review): position 0 of hundred_common_venues is skipped, which is only
# harmless if that entry is 'kirche' - confirm against the printed ranking.
#
# The earlier version ran one row-wise DataFrame.apply per keyword (about 100
# full passes over the frame). Once a row has been replaced by a single keyword
# no later keyword can match it, so a single pass that stops at the first hit
# gives the same result.
venue_priority = ['kirche'] + list(hundred_common_venues[1:])


def _collapse_to_common_venue(venue_text):
    """Return the first keyword of venue_priority found among the space-separated
    tokens of venue_text, or venue_text unchanged when none matches."""
    # Missing values (None / NaN) cannot be split; leave them as they are.
    if not isinstance(venue_text, str):
        return venue_text
    tokens = set(venue_text.split(' '))
    for keyword in venue_priority:
        if keyword in tokens:
            return keyword
    return venue_text


anony_df['venue_clean'] = anony_df['venue'].map(_collapse_to_common_venue)

## Anonymize Venue

In [None]:
# Map every distinct cleaned venue to its anonymized value: generic venue types
# from eighty_seven_common_venues stay as they are, anything else gets a unique
# generated street name.
dict_venues = {}
for venue_name in anony_df['venue_clean'].unique():
    if venue_name in eighty_seven_common_venues:
        dict_venues[venue_name] = venue_name
    else:
        dict_venues[venue_name] = faker.unique.street_name()

anony_df['anonymized_venue'] = anony_df['venue_clean'].map(dict_venues)

### Check the distribution of actual venue & anonymized venue

In [None]:
# The two counts should match if the anonymization mapping is one-to-one.
for label, column_name in (
    ('Actual unique venue: ', 'venue_clean'),
    ('Anonymized unique venue: ', 'anonymized_venue'),
):
    print(label, len(anony_df[column_name].unique()))

In [None]:
# Preview the first rows, now including the venue_clean and anonymized_venue columns.
# NOTE(review): this frame still holds the original venue text columns - check
# the rendered output before sharing the notebook.
display(anony_df.head())

In [None]:
# anony_essential_columns = ['anonymized_band', 'anonymized_promoter', 'anonymized_venue',
# 'tarif_bez', 'vg_state', 'vg_ort',
# 'vg_datum_season', 'vg_datum_month', 'vg_datum_day_of_week', 'vg_datum_year',
# 'vg_datum_von', 'veranst_segment', 'vg_inkasso']
# Final column selection for the anonymized export.
anony_essential_columns = [
    'anonymized_band', 'anonymized_promoter', 'anonymized_venue',
    'tarif_bez', 'vg_state',
    'vg_datum_season', 'vg_datum_month', 'vg_datum_day_of_week', 'vg_datum_year',
    'vg_datum_von', 'veranst_segment', 'vg_inkasso',
]
anony_essential_df = anony_df.loc[:, anony_essential_columns].copy()
display(anony_essential_df.head())

In [None]:
# Persist the final anonymized columns as gzip-compressed Parquet.
# NOTE(review): this is the same relative path the earlier anonymized_essential_df
# export wrote to, so that file is overwritten. The columns written here no
# longer include vg_raum_wo_stopwords, which the keyword-extraction cell reads
# after reloading a file of this name - confirm the overwrite is intended.
anony_essential_df.to_parquet('./data/export_anonymized_features_2016_2020.parquet.gzip',compression='gzip')

In [None]:
######################

In [None]:
# Load the v5 feature export, then show its first rows and its shape.
# NOTE(review): absolute, user-specific Windows path, and no cell in this part
# of the notebook writes a *_v5 file - confirm where it is produced.
target_encoded_df = pd.read_parquet('C:/Users/sgopalakrish/Downloads/intellizenz-model-training/data/export_features_2016_2020_v5.parquet.gzip')
display(target_encoded_df.head())
print(target_encoded_df.shape)

## Baseline SVC Classifier models to classify event segments

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Feature matrix and target (event segment) for the classifiers below.
# NOTE(review): `features` is not defined in this part of the notebook - confirm
# an earlier cell sets it, otherwise this fails on a fresh kernel.
X = allclaims_feature_df[features]
y = allclaims_feature_df['veranst_segment']

In [None]:
# Print the dtype of a few raw columns, one per line.
for column_name in ('VG_RAUM_KEYWORDS', 'VG_DATUM_VON', 'VG_ORT', 'TARIF_BEZ'):
    print(allclaims_df[column_name].dtypes)

In [None]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split, and every metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

## XGBoost Classifier

In [None]:
pip install xgboost

In [None]:
import xgboost

# Fit an XGBoost classifier with the library's default hyperparameters on the training split.
# NOTE(review): recent xgboost releases expect class labels encoded as
# 0..n_classes-1; confirm the values in y_train satisfy that.
xgboost_classifier = xgboost.XGBClassifier()
xgboost_classifier.fit(X_train, y_train)

In [None]:
# save the model to disk
xbg_filename = 'xgb_classifier_model.sav'
# The context manager closes (and thereby flushes) the file even if pickling
# fails; the previous bare open() left the handle open.
with open(xbg_filename, 'wb') as model_file:
    pickle.dump(xgboost_classifier, model_file)

In [None]:
# Predict on the held-out rows. The SVC section further down reuses the name y_pred.
y_pred = xgboost_classifier.predict(X_test)

### Evaluation

In [None]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# accuracy_score(..., normalize=False) returns the NUMBER of correct
# predictions, not a fraction. Keep that count under its own name and store
# the actual accuracy (fraction correct) in xgboost_accuracy.
xgboost_correct = accuracy_score(y_test, y_pred, normalize=False)
xgboost_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {xgboost_accuracy:.4f} ({xgboost_correct} of {len(y_test)} test samples correct)')

## SVC Classifier

In [None]:
# Fit a support-vector classifier with a linear kernel on the training split.
# NOTE(review): SVC training time grows steeply with the number of samples -
# confirm this is feasible on the full training set.
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)

In [None]:
# save the model to disk
filename = 'svc_classifier_model.sav'
# The context manager closes (and thereby flushes) the file even if pickling
# fails; the previous bare open() left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(svclassifier, model_file)

In [None]:
# Predict on the held-out rows. This overwrites the y_pred produced by the XGBoost model.
y_pred = svclassifier.predict(X_test)

### Evaluation

In [None]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# accuracy_score(..., normalize=False) returns the NUMBER of correct
# predictions, not a fraction. Keep that count under its own name and store
# the actual accuracy (fraction correct) in svc_accuracy.
svc_correct = accuracy_score(y_test, y_pred, normalize=False)
svc_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {svc_accuracy:.4f} ({svc_correct} of {len(y_test)} test samples correct)')