In [93]:
import tabula
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

# Extract the table from every page of the giro statement PDF.
df = tabula.read_pdf('data/giro_2019.pdf', pages='all')

# Quick look at what tabula produced: column labels and table size.
print(df.columns)
print(df.shape)

# tabula used the first booking as the header line (see the printed column
# labels). Put those labels back into the data: park them at index label -1,
# shift every label up by one and sort, so the former header ends up as row 0.
df.loc[-1] = df.columns
df.index = df.index + 1
df = df.sort_index()

# Replace the misused header with meaningful column names.
df.columns = ["date", "subject", "account", "amount"]

# Positions of the rows that carry an amount. Each of these rows is the first
# line of a booking; the rows after it whose amount is empty are continuation
# lines of that booking's subject text.
# (Despite its historical name, `na_index` lists the rows whose amount is NOT
# missing. Positions are used instead of index labels because they are fed to
# `iloc` below.)
na_index = [pos for pos, has_amount in enumerate(df["amount"].notna()) if has_amount]

# Concatenate the subject lines that belong to the same booking.
# Every booking start is paired with the start of the next booking; the last
# booking runs to the end of the table, so its continuation lines are merged
# as well (the previous loop stopped one booking short).
subject_pos = df.columns.get_loc("subject")
for start, stop in zip(na_index, na_index[1:] + [len(df)]):
    pieces = df.iloc[start:stop, subject_pos].dropna().astype(str)
    # Leave the cell untouched when the whole group has no subject text.
    if not pieces.empty:
        df.iloc[start, subject_pos] = " ".join(pieces)

# Keep only the first line of each booking; the continuation lines (no amount)
# have been folded into it.
df = df[df["amount"].notna()]

def _parse_amount(value):
    """Convert a German-formatted amount such as '-1.234,56' to a float.

    '.' is removed as thousands separator and ',' is turned into the decimal
    point. Values that are not strings (e.g. if the column was already parsed
    as numbers) are handed to float() as they are instead of crashing on a
    missing .replace method.
    """
    if isinstance(value, str):
        return float(value.replace('.', '').replace(',', '.'))
    return float(value)


# Fill every remaining gap in the table with the placeholder '0,0'
# (applies to all columns, not only to 'amount').
df = df.fillna('0,0')

# Convert the amounts in one pass and store them as a real float column
# instead of writing Python floats cell by cell into an object column.
df['amount'] = df['amount'].map(_parse_amount).astype(float)

# Drop incomes: only negative amounts (expenses) are kept.
df = df[df['amount'] < 0]

# Drop credit card settlements, identified by their account number.
# copy() makes the result an independent frame, so later cells can add
# columns without writing into a slice of the unfiltered table.
df = df[df['account'] != '9003290294'].copy()

Index(['22.08.2019', 'ERSTLASTSCHRIFT', 'DE67 7005 0000 0003 0240 50',
       '-51,92'],
      dtype='object')
(202, 4)


In [115]:
# Keyword lists per spending cluster. A booking belongs to a cluster when one
# of the cluster's keywords occurs in its subject (case-sensitive substring
# match).
cluster = {
    'grocery': ['NAHKAUF', 'EDEKA'],
    'travel':     ['ARAL', 'DB', 'BP', 'HMS HUETTEN-MIET', 'ZVKDZ', 'PP.9048.PP'],
    'cash':    ['hypo'],
    'debt': ['Bundeskasse'],
    'run_costs': ['E-Plus', 'ALLIANZ AG', 'M-net', 'HE-1008-2831', 'ALLIANZ DEUTSCHLAND'],
    'shopping': ['AMAZON', 'STRATIC', 'MINIT'],
    'property': ['Abfallwirtschaftsverband', 'KAMINKEHRERMEISTER', 'Schmutzwassergeb.'],
    'actions': ['Kochkurs']
}


def _cluster_for(subject):
    """Return the cluster name for one booking subject, or '' if none matches.

    If keywords of several clusters occur in the subject, the cluster that
    comes last in `cluster` wins. This is the result the former nested
    assignment loop produced, because later matches overwrote earlier ones.
    """
    matched = ''
    for name, keywords in cluster.items():
        if any(keyword in subject for keyword in keywords):
            matched = name
    return matched


# Assign bookings to clusters in a single pass over the subjects, instead of
# building a full-frame mask for every row and matching keyword. assign()
# returns a new frame, so nothing is written into a filtered slice.
df = df.assign(cluster=df['subject'].map(_cluster_for))

# Show the subjects of the items that could not be assigned.
unassigned = df.loc[df['cluster'] == '', 'subject']
if unassigned.empty:
    print('all items assigned')
else:
    print(unassigned)

# Everything without a match goes into the catch-all cluster.
df.loc[df['cluster'] == '', 'cluster'] = 'other'

31                                           Lastschrift
110    Lastschrift Stadt Simbach a. Inn 3758 .FAD 375...
119                            Kartenzahlung/-abrechnung
159                                          Überweisung
202                                          Lastschrift
Name: subject, dtype: object


In [116]:
# Bar trace built from the per-booking columns: the cluster on the x axis,
# the amount on the y axis and the booking subject as the only hover text.
expense_bars = go.Bar(
    x=df['cluster'],
    y=df['amount'],
    hovertext=df['subject'],
    hoverinfo="text",
    marker=dict(
        color='rgb(158, 202, 225)',
        line=dict(color='rgb(8, 48, 107)', width=1.5),
    ),
    opacity=0.6,
)

fig = go.Figure(data=[expense_bars])
fig.update_layout(title_text='Expenses')
fig.show()

In [120]:
# Read the footer-less variant of the statement, this time with tabula's
# stream option switched on, and show the raw result.
# NOTE(review): this rebinds `df`, so the processed table from the cells
# above is no longer available under that name afterwards.
df = tabula.read_pdf(
    'data/giro_2019_nofooter.pdf',
    pages='all',
    stream=True,
)
df

Unnamed: 0,22.08.2019,Lastschrift,DE67 7005 0000 0003 0240 50,"-51,92"
0,22.08.2019,VERSICHERUNGSKAMME,BYLADEMMXXX,
1,,R BAYERN,,
2,,VERSICHERUNGSANSTAL,,
3,,T D.Ö.R.,,
4,,HAFT UNF HE-1008-2831,,
5,,14.08.2019,,
6,,ABWA+Versicherungskamm,,
7,,er Bayern,,
8,22.08.2019,Überweisung,DE90 7005 4306 0000 1271 00,-1500
9,22.08.2019,ZVKDZ OBERLAND,BYLADEM1WOR,
