In [54]:
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
from scipy.spatial.distance import cosine
from sklearn.metrics import accuracy_score
from random import random
import numpy as np

# Laden der Daten und Datenaufbereitung

## Text Embeddings

In [16]:
output = pd.read_json("embedded_output.json")
output = output[output["embedding"].str.len() == 300]
output = output[output["WZ2008 Section"] != "NULL"]

In [17]:
embeddingdf = pd.DataFrame(output['embedding'].values.tolist(), index=output.index)
embeddingdf.columns = ['textembeddings' + str(col) for col in embeddingdf.columns]

In [18]:
embeddingdf.head()

Unnamed: 0,textembeddings0,textembeddings1,textembeddings2,textembeddings3,textembeddings4,textembeddings5,textembeddings6,textembeddings7,textembeddings8,textembeddings9,...,textembeddings290,textembeddings291,textembeddings292,textembeddings293,textembeddings294,textembeddings295,textembeddings296,textembeddings297,textembeddings298,textembeddings299
10000,-0.099464,0.13843,-0.139252,-0.111748,0.116443,0.160833,-0.086092,-0.017977,0.033541,0.096816,...,-0.027577,-0.014572,0.132774,0.199758,-0.074856,-0.077762,-0.186939,0.116442,0.053129,0.187582
10001,-0.156383,0.201883,-0.074046,-0.153793,0.169657,0.169687,-0.117523,-0.056078,0.012386,0.099699,...,0.001862,-0.056808,0.154579,0.190569,-0.047787,-0.050849,-0.162833,0.065777,0.006405,0.171045
10002,-0.137489,0.183891,-0.091316,-0.148726,0.142673,0.196547,-0.110675,0.000415,0.067411,0.073013,...,-0.032058,-0.070422,0.093285,0.197763,-0.074207,-0.065621,-0.144966,0.072028,-0.008562,0.121606
10004,-0.128996,0.085962,-0.147864,-0.176685,0.105274,0.109168,-0.133889,0.022752,0.01651,0.116291,...,-0.005294,0.033829,0.090801,0.218233,-0.12284,-0.112638,-0.114931,0.091124,0.031792,0.075483
10005,-0.097746,0.272223,-0.0628,-0.095971,0.200952,0.363097,-0.018803,-0.041603,0.071759,0.042882,...,0.00055,-0.143989,0.207705,0.124247,0.092007,-0.042694,-0.266749,0.107528,-0.057326,0.147409


## Description Distance

In [23]:
abschnitte = pd.read_json("../abschnittsembeddings.json")

In [25]:
cosine_distances = []
for index, textembedding  in embeddingdf.iterrows():
    arr = []
    for abschnittembedding in abschnitte.iterrows(): 
        arr.append(cosine(textembedding, abschnittembedding[1]))
    cosine_distances.append(arr)

In [26]:
cddf = pd.DataFrame(cosine_distances, index=embeddingdf.index)

In [27]:
cddf.columns = [str(col) + '_cosdist' for col in cddf.columns]

In [28]:
txtembdist = embeddingdf.join(cddf)

## Screenshots

<img src="./screenshots/B/www.basalt.de.png" alt="drawing" width="250"/> <img src="./screenshots/A/www.zentis.de.png" alt="drawing" width="250"/>

In [30]:
embimgdata = pd.read_json("embedded_images_output.json")

In [31]:
imagedataset = pd.DataFrame(embimgdata['imagefeatures'].apply(lambda x: x[0]).values.tolist(), index=embimgdata.index)

In [32]:
imagedataset.columns = ['screenshot' + str(col) for col in imagedataset.columns]

In [35]:
txtembdistimg = txtembdist.join(imagedataset, how="left")

In [36]:
txtembdistimg = txtembdistimg.fillna(0)

# Prädiktion der Abschnitte

In [37]:
X_train, X_test, Y_train, Y_test = train_test_split(txtembdistimg, output["WZ2008 Section"].astype(str), test_size=0.1, shuffle=True, random_state=42)        

## Prädiktion mit Screenshots

In [38]:
imgonly_classifier = RandomForestClassifier(n_estimators=350)
imgonly_classifier.fit(X_train[[col for col in X_train if col.startswith('screenshot')]], Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=350, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [39]:
imgonly_classifier.score(X_test[[col for col in X_train if col.startswith('screenshot')]], Y_test)

0.35

## Prädiktion mit Text Embeddings

In [40]:
textembonly = RandomForestClassifier(n_estimators=400)
textembonly.fit(X_train[[col for col in X_train if col.startswith('textembeddings')]], Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [41]:
textembonly.score(X_test[[col for col in X_train if col.startswith('textembeddings')]], Y_test)

0.453125

## Prädiktion mit Description Distances

In [42]:
descdistonly = RandomForestClassifier(n_estimators=350)
descdistonly.fit(X_train[[col for col in X_train if col.endswith('cosdist')]], Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=350, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [43]:
descdistonly.score(X_test[[col for col in X_train if col.endswith('cosdist')]], Y_test)

0.4625

## Late Fusion

In [61]:
i = imgonly_classifier.predict_proba(X_test[[col for col in X_train if col.startswith('screenshot')]])

In [62]:
t = textembonly.predict_proba(X_test[[col for col in X_train if col.startswith('textembeddings')]])

In [63]:
d = descdistonly.predict_proba(X_test[[col for col in X_train if col.endswith('cosdist')]])

In [64]:
def acc(a, b, c):
    prediction = np.mean((a*i,b*t,c*d), axis=0).argmax(axis=1)
    vfunc = np.vectorize(lambda x: textembonly.classes_[x])
    prediction = vfunc(prediction)
    return accuracy_score(prediction, Y_test)

In [65]:
randoms = []
accuracies = []
for _ in range(10000):
    a, b, c = [random(), random(), random()]
    z = a+b+c
    a = a/z
    b = b/z
    c = c/z
    randoms.append([a,b,c])
    accuracies.append(acc(a,b,c))

In [66]:
np.array(accuracies).max()

0.46875

In [67]:
np.array(accuracies).argmax()

2743

In [69]:
randoms[2743]

[0.11129243300450602, 0.24206041716784832, 0.6466471498276457]

# Datenaufbereitung zur Vorhersage der Abteilungen

In [70]:
output['Abteilung'] = output["WZ2008 Code"].astype(str).str.strip().str.split(".", expand=True)[0]
output['Gruppe'] = output['Abteilung'].astype(str).map(str) + "." + output["WZ2008 Code"].astype(str).str.strip().str.split(".", expand=True)[1].str[0]

In [71]:
abteilungen = pd.read_json("../abteilungsembeddings.json")

In [72]:
cosine_distances = []
for index, textembedding  in embeddingdf.iterrows():
    arr = []
    for abteilungsembedding in abteilungen.iterrows(): 
        arr.append(cosine(textembedding, abteilungsembedding[1]))
    cosine_distances.append(arr)

In [73]:
abtdf = pd.DataFrame(cosine_distances, index=embeddingdf.index)

In [74]:
abtdf.columns = [str(col) + '_cosdist_abt' for col in abtdf.columns]

In [75]:
txtembdistimg_abt = txtembdistimg.join(abtdf)

# Prädiktion der Abteilungen

In [76]:
X_train, X_test, Y_train, Y_test = train_test_split(txtembdistimg_abt, output["Abteilung"].astype(str), test_size=0.1, shuffle=True, random_state=42)        

## Prädiktion mit Screenshots

In [78]:
imgonly_classifier.fit(X_train[[col for col in X_train if col.startswith('screenshot')]], Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=350, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [79]:
imgonly_classifier.score(X_test[[col for col in X_train if col.startswith('screenshot')]], Y_test)

0.0984375

## Prädiktion mit Text Embeddings

In [80]:
textembonly.fit(X_train[[col for col in X_train if col.startswith('textembeddings')]], Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [81]:
textembonly.score(X_test[[col for col in X_train if col.startswith('textembeddings')]], Y_test)

0.1984375

## Prädiktion mit Description Distances

### Description Distances "Abschnitte"

In [82]:
descdistonly.fit(X_train[[col for col in X_train if col.endswith('cosdist')]], Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=350, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [83]:
descdistonly.score(X_test[[col for col in X_train if col.endswith('cosdist')]], Y_test)

0.1890625

### Description Distances "Abteilungen"

In [90]:
descdistonly_abteilungen = RandomForestClassifier(n_estimators=350)

In [91]:
descdistonly_abteilungen.fit(X_train[[col for col in X_train if col.endswith('cosdist_abt')]], Y_train)
descdistonly_abteilungen.score(X_test[[col for col in X_train if col.endswith('cosdist_abt')]], Y_test)

0.2078125

## Late Fusion

In [92]:
i = imgonly_classifier.predict_proba(X_test[[col for col in X_train if col.startswith('screenshot')]])

In [93]:
t = textembonly.predict_proba(X_test[[col for col in X_train if col.startswith('textembeddings')]])

In [94]:
d = descdistonly.predict_proba(X_test[[col for col in X_train if col.endswith('cosdist')]])

In [95]:
d_abt = descdistonly_abteilungen.predict_proba(X_test[[col for col in X_train if col.endswith('cosdist_abt')]])

In [96]:
def acc(a, b, c, e):
    prediction = np.mean((a*i,b*t,c*d, d_abt*e), axis=0).argmax(axis=1)
    vfunc = np.vectorize(lambda x: textembonly.classes_[x])
    prediction = vfunc(prediction)
    return accuracy_score(prediction, Y_test)

In [97]:
randoms = []
accuracies = []
for _ in range(10000):
    a, b, c, e = [random(), random(), random(), random()]
    z = a+b+c+e
    a = a/z
    b = b/z
    c = c/z
    e = e/z
    randoms.append([a,b,c,e])
    accuracies.append(acc(a,b,c,e))

In [98]:
np.array(accuracies).max()

0.21875

In [99]:
np.array(accuracies).argmax()

5354

In [100]:
randoms[5354]

[0.03639304471958594,
 0.6096533456360916,
 0.09914855130933253,
 0.25480505833498984]

# Wikipedia

## Datenaufbereitung

In [109]:
wiki_embeddings = pd.read_json('wiki_embedding.json')

In [110]:
wiki_tmp = pd.DataFrame(wiki_embeddings['embedding'].values.tolist(), index=wiki_embeddings.index)

In [111]:
wiki_tmp.columns = [str(col) + '_wiki' for col in wiki_tmp.columns]

In [112]:
wiki_embeddings = wiki_embeddings.join(wiki_tmp, how="left")

In [118]:
output["url"] = output["url"].str.replace("https://", '')

In [121]:
wiki_embeddings = wiki_embeddings.merge(output, how="left", on="url")

In [127]:
wiki_embeddings = wiki_embeddings[wiki_embeddings["text"].notnull()]

## Prädiktion

### Text Embeddings

In [129]:
X_train, X_test, Y_train, Y_test = train_test_split(wiki_embeddings, wiki_embeddings["WZ2008 Code"].astype(str), test_size=0.1, shuffle=True, random_state=42)        

In [130]:
wikipedia_rf = RandomForestClassifier(n_estimators=350)

In [132]:
wikipedia_rf.fit(X_train[[col for col in X_train if col.endswith('wiki')]], Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=350, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [133]:
wikipedia_rf.score(X_test[[col for col in X_train if col.endswith('wiki')]], Y_test)

0.3137254901960784

### Description Distances

In [135]:
cosine_distances = []
for index, textembedding  in wiki_embeddings[[col for col in X_train if col.endswith('wiki')]].iterrows():
    arr = []
    for abschnittembedding in abschnitte.iterrows(): 
        arr.append(cosine(textembedding, abschnittembedding[1]))
    cosine_distances.append(arr)

In [137]:
wiki_cddf = pd.DataFrame(cosine_distances, index=wiki_embeddings.index)

In [138]:
wiki_cddf.columns = [str(col) + '_cosdist_wiki' for col in wiki_cddf.columns]

In [139]:
wiki_embeddings = wiki_embeddings.join(wiki_cddf)

In [143]:
X_train, X_test, Y_train, Y_test = train_test_split(wiki_embeddings, wiki_embeddings["WZ2008 Code"].astype(str), test_size=0.1, shuffle=True, random_state=42)        

In [144]:
wikipedia_rfdist = RandomForestClassifier(n_estimators=350)

In [145]:
wikipedia_rfdist.fit(X_train[[col for col in X_train if col.endswith('_cosdist_wiki')]], Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=350, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [146]:
wikipedia_rfdist.score(X_test[[col for col in X_train if col.endswith('_cosdist_wiki')]], Y_test)

0.29411764705882354

In [65]:
from sklearn import svm
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

from sklearn_hierarchical_classification.classifier import HierarchicalClassifier
from sklearn_hierarchical_classification.constants import ROOT
from sklearn.datasets import load_digits
import numpy as np

def make_digits_dataset(targets=None, as_str=True):
    X, y = load_digits(return_X_y=True)
    if targets:
        ix = np.isin(y, targets)
        X, y = X[np.where(ix)], y[np.where(ix)]

    if as_str:
        # Convert targets (classes) to strings
        y = y.astype(str)

    return X, y


# Used for seeding random state
RANDOM_STATE = 42

class_hierarchy = {
    ROOT: ["A", "B"],
    "A": ["C", "D"],
    "B": ["E", "F", "G"],
    "C": [9],
    "D": [1,5],
    "E": [2,3],
    "F": [4,6],
    "G": [7,8],
}
base_estimator = make_pipeline(
    RandomForestClassifier(n_estimators=100)
)
clf = HierarchicalClassifier(
    base_estimator=base_estimator,
    class_hierarchy=class_hierarchy,
)
X, y = make_digits_dataset(
    targets=[1, 2,3,4,5,6,7,8,9],
    as_str=False,
)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           1       0.97      0.92      0.95        39
           2       0.95      1.00      0.97        36
           3       1.00      0.97      0.98        31
           4       0.92      0.97      0.95        36
           5       0.96      0.98      0.97        45
           6       0.97      1.00      0.99        33
           7       0.92      1.00      0.96        35
           8       1.00      0.89      0.94        38
           9       1.00      0.94      0.97        31

   micro avg       0.96      0.96      0.96       324
   macro avg       0.97      0.96      0.96       324
weighted avg       0.96      0.96      0.96       324

