In [61]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score


In [62]:
df = pd.read_csv('data.csv', sep=';')

In [63]:
# seperate into features and labels
relevant_columns = ['Auftragskonto', 'Buchungstag', 'Valutadatum', 'Buchungstext', 'Verwendungszweck', 'Beguenstigter/Zahlungspflichtiger', 'Kontonummer', 'BLZ',
                    'Betrag', 'Waehrung']
labels = df['label']
data = df[relevant_columns]

# preprocess data
# fill NaN values with 0s
data['Auftragskonto'] = data['Auftragskonto'].fillna(0).astype(str)

# remove all non-alphabetic values from 'Verwendungszweck' since we than only have the relevant info
data['Verwendungszweck'] = data['Verwendungszweck'].transform(lambda elem: ''.join(filter(str.isalpha, str(elem))))

data.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Auftragskonto'] = data['Auftragskonto'].fillna(0).astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Verwendungszweck'] = data['Verwendungszweck'].transform(lambda elem: ''.join(filter(str.isalpha, str(elem))))


Unnamed: 0,Auftragskonto,Buchungstag,Valutadatum,Buchungstext,Verwendungszweck,Beguenstigter/Zahlungspflichtiger,Kontonummer,BLZ,Betrag,Waehrung
count,209.0,209,209,209,209,209,208,208,209.0,209
unique,3.0,85,85,14,67,59,53,40,94.0,1
top,89990210.0,02.02.2016,02.02.2016,Lastschrift (Einzugsermächtigung),VisaEur,Kartenzahlung,922904916,25190001,2000.0,EUR
freq,120.0,8,8,94,23,17,34,49,13.0,209


In [64]:
# choose only columns that give high-accuracy. See also last section for this
columns_of_interest = ['Buchungstext', 'BLZ', 'Betrag']
data_of_interest = data[columns_of_interest]

In [65]:
def classify(input_data):
    # feature-extraction
    features = pd.get_dummies(input_data)

    # split into training and test data
    X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=1)

    # fit our classifier to training data and predict labels for test data
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)

    return X_train, X_test, y_pred, y_test


# print('Number of rows in the total set: {}'.format(df.shape[0]))
# print('Number of rows in the training set: {}'.format(X_train.shape[0]))
# print('Number of rows in the test set: {}'.format(X_test.shape[0]))

X_train, X_test, y_pred, y_test = classify(data_of_interest)
print('Accuracy score: ', format(accuracy_score(y_test, y_pred)))
print('F1 score: ', format(f1_score(y_test, y_pred, average='micro')))


Accuracy score:  0.9433962264150944
F1 score:  0.9433962264150944


In [66]:
## This part classifies the data for every combination of columns and prints the combination with the highest accuracy.
## Thus we have proven that columns_of_interest is indeed the best choice of columns.

import itertools

# returns every combination of column_names for every length without considering order
def get_all_combinations(column_names):
    res = []
    for i in range(1, len(column_names) + 1):
        combinations = list(itertools.combinations(column_names, i))
        combinations = [list(c) for c in combinations]
        res.append(combinations)
    return list(itertools.chain.from_iterable(res))

column_combinations = get_all_combinations(relevant_columns)
acc = []
for col in column_combinations:
    X_train, X_test, y_pred, y_test = classify(data[col])
    acc.append(accuracy_score(y_pred, y_test))

# get the maximum accuracy of all predictions
max_acc = max(acc)

# get the combination of columns that gives the highest accuracy
max_col = acc.index(max_acc)
print(column_combinations[max_col])


  n_ij = - 0.5 * np.sum(np.log(2. * np.pi * self.sigma_[i, :]))
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) /


['Buchungstext', 'BLZ', 'Betrag']
