This example is adapted almost verbatim from the official documentation: [Working With Text Data — scikit-learn 0.20.2 documentation](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#tokenizing-text-with-scikit-learn)

In [1]:
# %load https://gist.githubusercontent.com/kidpixo/2ec078d09834b5aa7869/raw/350f79fe4f2e09592404a76db18dcc01a961444b/ipython_inizialization.py
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline

# Raise pandas' display limits: up to 120 characters per column and 300 rows per table
pd.options.display.max_colwidth = 120
pd.options.display.max_rows = 300

In [2]:
# Read both CSV files in one pass: the rows used for fitting and the rows to classify
training_data, unknown_data = (
    pd.read_csv(csv_name, sep=',') for csv_name in ('training.csv', 'unknown.csv')
)

In [3]:
text_col = ['Empfänger/Zahlungspflichtiger']   # the "data" column
target_col = ['Kategorie 1']                   # the target column
cols = text_col + target_col                   # all the columns relevant to this analysis

# Row counts of both frames, followed by the feature and target column names
print('training_data : {}, unknown_data : {} '.format(len(training_data), len(unknown_data)))
print('Features : ', text_col)
print('Target : ', target_col)

training_data : 236, unknown_data : 186 
Features :  ['Empfänger/Zahlungspflichtiger']
Target :  ['Kategorie 1']


In [4]:
# Five random rows of the relevant columns (unseeded, so the rows differ between runs)
training_data.loc[:, cols].sample(n=5)

Unnamed: 0,Empfänger/Zahlungspflichtiger,Kategorie 1
213,HOTEL HELVETIA ALBERGO GMBH,Urlaub
223,"PayPal (Europe) S.a.r.l. etCie., S.C.A.",Kleidung
124,RESTAURANT QADMOUS,Freizeit
130,BASLER LV AG,Versicherung
147,Dr. Sebastian Briem Dr. Astrid Aretz,Lebensmittel


In [5]:
# One random row of the data to classify (unseeded, so the row differs between runs)
unknown_data.loc[:, cols].sample(n=1)

Unnamed: 0,Empfänger/Zahlungspflichtiger,Kategorie 1
127,MUJI DEUTSCHLAND GMBH,


In [6]:
# Extracting features from text files
## Tokenizing text with scikit-learn

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
# Created with default settings and left unfitted here; later cells call
# fit_transform on the training text and transform on the unknown text.
count_vect = CountVectorizer()

`extraxt_str` is a naive attempt at cutting out recurring noise in the text fields, such as receipt/order numbers and other recurring, non-significant text.

It should be properly implemented with the [Natural Language Toolkit (NLTK)](https://www.nltk.org/):

* [Stemming - Wikipedia](https://en.wikipedia.org/wiki/Stemming)
* [NLP Tutorial Using Python NLTK (Simple Examples) - DZone AI](https://dzone.com/articles/nlp-tutorial-using-python-nltk-simple-examples)
* [https://www.nltk.org/book/ch03.html](https://www.nltk.org/book/ch03.html)

In [7]:
import re
def extraxt_str(text):
    """Strip recurring banking boilerplate from one text value.

    Intended to be applied element-wise to a text column of a pandas
    DataFrame, for example::

        training_data_text = training_data[text_col[0]].apply(extraxt_str).values

    The substitutions run strictly in the order listed below: digit runs
    become a space, 'Basislastschrift' is dropped, runs of '/', '+', '.' and
    ':' become a space, the markers IBAN, BIC, FPIN, REF, CICC and ZZZ are
    dropped, and finally every whitespace run collapses to a single space.
    Matching is case-sensitive and not limited to whole words. The result is
    not stripped, so it can start or end with a single space.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # (pattern, replacement) pairs; the order matters because removing one
    # marker can expose another (e.g. 'BIBANIC' -> 'BIC' -> '').
    substitutions = (
        (r'[0-9]+', ' '),
        (r'Basislastschrift', ''),
        (r'[/\+\.:]+', ' '),
        (r'IBAN', ''),
        (r'BIC', ''),
        (r'FPIN', ''),
        (r'REF', ''),
        (r'CICC', ''),
        (r'ZZZ', ''),
        (r'\s+', ' '),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [8]:
# Drop the rows whose text column is missing once, then reuse that frame for both arrays
_train_rows = training_data.dropna(subset=text_col)
training_data_text = _train_rows[text_col].values[:, 0]       # first (only) text column as a 1-D array
# (training_data.dropna(subset=text_col)[text_col[0]]+' '+ training_data.dropna(subset=text_col)[text_col[1]].apply(extraxt_str)).values.tolist()
training_data_target = _train_rows[target_col].values[:, 0]   # matching target values as a 1-D array

In [9]:
# From occurrences to frequencies
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Fit the vectorizer on the training text and turn it into a token-count matrix
training_data_count_vec = count_vect.fit_transform(training_data_text)

# Fit the tf-idf weighting on those counts, then apply it to the same counts
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(training_data_count_vec)
X_train_tfidf = tfidf_transformer.transform(training_data_count_vec)

#### Model 1 : Multinomial Naive Bayes > https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html
clf = MultinomialNB().fit(X=X_train_tfidf, y=training_data_target)

#### Model 2 : SGDClassifier > https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
# from sklearn.linear_model import SGDClassifier
# clf = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42,max_iter=5, tol=None).fit(X_train_tfidf, training_data_target)

In [10]:
# Classifying the unknown data with the fitted classifier

# rows of unknown_data with a non-missing text column; computed once and
# reused for both the documents and the index the predictions are written to
_rows_to_classify = unknown_data.dropna(subset=text_col)

# define new data to classify
docs_new = _rows_to_classify[text_col].values[:, 0]

# transform only: the vectorizer and the tf-idf weighting are not refitted here
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

# actual prediction
predicted = clf.predict(X_new_tfidf)

# assign the classes (adds/overwrites the 'predicted' column of unknown_data in place)
unknown_data.loc[_rows_to_classify.index, 'predicted'] = predicted

In [11]:
# Text, (empty) target and predicted class side by side
unknown_data.loc[:, [*text_col, *target_col, 'predicted']]

Unnamed: 0,Empfänger/Zahlungspflichtiger,Kategorie 1,predicted
0,RESTAURANT SPEISEHAUS,,Lebensmittel
1,"PayPal (Europe) S.a.r.l. etCie., S.C.A.",,Lebensmittel
2,LOGPAY FINANCIAL SERVICES GMBH,,Lebensmittel
3,LOGPAY FINANCIAL SERVICES GMBH,,Lebensmittel
4,DHL Onlinefrankierung,,Lebensmittel
5,studierendenWERK BERLIN,,Lebensmittel
6,"PayPal (Europe) S.a.r.l. etCie., S.C.A.",,Lebensmittel
7,AMAZON PAYMENTS EUROPE S.C.A.,,Lebensmittel
8,AMAZON PAYMENTS EUROPE S.C.A.,,Lebensmittel
9,AMAZON PAYMENTS EUROPE S.C.A.,,Lebensmittel


In [None]:
# Evaluation of the performance on the test set
# Evaluating the predictive accuracy of the model is equally easy:

# TODO: evaluate the classifier on a held-out, labelled test set.