In [61]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import StackingClassifier
from scipy import stats
import seaborn as sns
from scipy.stats import norm



In [62]:
# Load the raw table; fields in the file are separated by ';'.
df = pd.read_csv(
    'data.csv',
    delimiter=';',  # `delimiter` is pandas' alias for `sep`
)

In [63]:
# Keep only the columns used as model inputs and separate them from the labels.
columns_of_interest = [
    'Buchungstext',
    'Verwendungszweck',
    'Beguenstigter/Zahlungspflichtiger',
    'Kontonummer',
    'BLZ',
    'Betrag',
]
# Notes from earlier experiments on which columns helped which model:
#   GaussianNB:    Betrag, BLZ (better with MultinomialNB), Buchungstext (precision only)
#   MultinomialNB: only BLZ and Buchungstext make a difference
#   BernoulliNB:   Betrag, BLZ, Kontonummer, Beguenstigter/Zahlungspflichtiger,
#                  Verwendungszweck (a lot)
features = df[columns_of_interest]
labels = df['label']

# One column subset per Naive Bayes variant. The train/test split cell consumes
# them by position: [0] -> GaussianNB, [1] -> MultinomialNB, [2] -> BernoulliNB.
column_subsets = [
    ['Buchungstext', 'BLZ', 'Betrag'],
    ['Buchungstext', 'Beguenstigter/Zahlungspflichtiger', 'Kontonummer', 'BLZ'],
    ['Verwendungszweck', 'Beguenstigter/Zahlungspflichtiger', 'Kontonummer', 'BLZ', 'Betrag'],
]
features_splitted = [df[subset] for subset in column_subsets]


In [64]:
def dates_to_days(dates):
    """Reduce dot-separated strings to their first field as an integer.

    Each element is split on '.', every resulting field is converted with
    ``int`` and only the first converted field is kept; the remaining fields
    are discarded.

    Args:
        dates: Object exposing ``.transform`` whose elements are
            dot-separated strings (presumably a pandas Series of
            'DD.MM.YYYY' dates -- TODO confirm against the data file).

    Returns:
        The result of ``dates.transform`` with one integer per element.
    """
    def leading_field(elem):
        # Convert all fields, not just the first, so that a non-numeric
        # field anywhere in the string is still rejected by int().
        fields = [int(part) for part in elem.split('.')]
        return fields[0]

    return dates.transform(leading_field)

#features['Buchungstag'] = dates_to_days(df['Buchungstag']) - dates_to_days(df['Valutadatum'])
#features['Betrag'] = features['Betrag'].transform(lambda elem: float(elem.replace(',', '.')))
#features['Verwendungszweck'] = features['Verwendungszweck'].transform(lambda elem: ''.join(filter(str.isalpha, str(elem))))

#columns_to_stringify = ['Kontonummer', 'BLZ', 'Betrag']

#for col in columns_to_stringify:
#features[col] = features[col].transform(lambda elem: str(elem))

#le = preprocessing.LabelEncoder()
#for col in columns_of_interest:
#features.loc[:, col] = le.fit_transform(features[col])
#features[col] = features[col].transform(lambda elem: int(elem) + 1)
#features[col], _ = stats.boxcox(features[col])
#sns.distplot(features[col], fit=norm, kde=False)

In [65]:
# Turn categorical columns into dummy/indicator columns, then hold out a test set.
# This split over all selected columns feeds the stacked model.
features = pd.get_dummies(features)
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=1)

# Encode each per-model column subset the same way and split it with the same
# seed and the same labels as above.
features_splitted = list(map(pd.get_dummies, features_splitted))
gnb_features, mnb_features, bnb_features = features_splitted

X_train_gnb, X_test_gnb, y_train_gnb, y_test_gnb = train_test_split(
    gnb_features, labels, random_state=1)
X_train_mnb, X_test_mnb, y_train_mnb, y_test_mnb = train_test_split(
    mnb_features, labels, random_state=1)
X_train_bnb, X_test_bnb, y_train_bnb, y_test_bnb = train_test_split(
    bnb_features, labels, random_state=1)


In [66]:
def report_scores(name, y_true, y_hat):
    """Print accuracy and macro-averaged precision, recall and F1 for one model.

    Args:
        name: Label printed above the scores so the reports can be told apart.
        y_true: Ground-truth labels of the test rows.
        y_hat: Labels predicted for the same rows.
    """
    # zero_division=0 makes the handling of undefined per-class scores
    # explicit: the affected class still contributes 0 to the macro average
    # (same value as the default), but no UndefinedMetricWarning is emitted.
    print(name)
    print('Accuracy score: ', format(accuracy_score(y_true, y_hat)))
    print('Precision score: ', format(precision_score(y_true, y_hat, average='macro', zero_division=0)))
    print('Recall score: ', format(recall_score(y_true, y_hat, average='macro', zero_division=0)))
    print('F1 score: ', format(f1_score(y_true, y_hat, average='macro', zero_division=0)))
    print()


# Train each Naive Bayes variant on its own column subset.
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

gnb.fit(X_train_gnb, y_train_gnb)
mnb.fit(X_train_mnb, y_train_mnb)
bnb.fit(X_train_bnb, y_train_bnb)

# Score every model against the ground truth from its own split, so each
# report uses a single y_true vector throughout.
y_pred_gnb = gnb.predict(X_test_gnb)
report_scores('GaussianNB', y_test_gnb, y_pred_gnb)

y_pred_mnb = mnb.predict(X_test_mnb)
report_scores('MultinomialNB', y_test_mnb, y_pred_mnb)

y_pred_bnb = bnb.predict(X_test_bnb)
report_scores('BernoulliNB', y_test_bnb, y_pred_bnb)

# Stack the three classifiers on the full encoded feature set. The meta-learner
# is passed as a fresh GaussianNB rather than the already-fitted `gnb`, so the
# same instance is not both a base estimator and the final estimator.
estimators = [('gnb', gnb), ('mnb', mnb), ('bnb', bnb)]
clf = StackingClassifier(estimators=estimators, final_estimator=GaussianNB())
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
report_scores('StackingClassifier', y_test, y_pred)

Accuracy score:  0.9433962264150944
Precision score:  0.9315476190476191
Recall score:  0.9615384615384616
F1 score:  0.9393719806763285 

Accuracy score:  0.9433962264150944
Precision score:  0.9553571428571429
Recall score:  0.941941391941392
F1 score:  0.945679012345679 

Accuracy score:  0.6981132075471698
Precision score:  0.5396825396825397
Recall score:  0.6214285714285714
F1 score:  0.5660753190164954 

Accuracy score:  0.9622641509433962
Precision score:  0.9663461538461539
Recall score:  0.9538461538461539
F1 score:  0.9575498575498576


  _warn_prf(average, modifier, msg_start, len(result))
