## Voraussetzungen

In [None]:
!pip show numpy


### Daten herunterladen [Germeval 2019](https://projects.fzai.h-da.de/iggsa/projekt/)

In [None]:
!wget https://projects.fzai.h-da.de/iggsa/wp-content/uploads/2019/08/germeval2019GoldLabelsSubtask1_2.txt
!wget https://projects.fzai.h-da.de/iggsa/wp-content/uploads/2019/09/germeval2019.training_subtask1_2_korrigiert.txt

 ### [Simple Transformers](https://simpletransformers.ai/) Bibliothek instalieren

In [None]:
!pip install simpletransformers

### Notwendige Bibliotheken importieren

In [2]:

import os
import tarfile
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel, ClassificationArgs


AttributeError: type object 'h5py.h5.H5PYConfig' has no attribute '__reduce_cython__'

### Hilfsfunktionen

In [None]:
def f1_multiclass(labels, preds):
    return f1_score(labels, preds, average='micro')


def pack_model(model_path='',file_name=''):
    files = [files for root, dirs, files in os.walk(model_path)][0]
    with tarfile.open(file_name+ '.tar.gz', 'w:gz') as f:
        for file in files:
            f.add(f'{model_path}/{file}')

def unpack_model(model_name=''):
    tar = tarfile.open(f"{model_name}.tar.gz", "r:gz")
    tar.extractall()
    tar.close()



## Daten aufbereiten

In [None]:

class_list = ['INSULT','ABUSE','PROFANITY','OTHER']

df1 = pd.read_csv('germeval2019GoldLabelsSubtask1_2.txt',sep='\t', lineterminator='\n',encoding='utf8',names=["tweet", "task1", "task2"])
df2 = pd.read_csv('germeval2019.training_subtask1_2_korrigiert.txt',sep='\t', lineterminator='\n',encoding='utf8',names=["tweet", "task1", "task2"])

df = pd.concat([df1,df2])
df['task2'] = df['task2'].str.replace('\r', "")
df['pred_class'] = df.apply(lambda x:  class_list.index(x['task2']),axis=1)

df = df[['tweet','pred_class']]

print(df.shape)
df.head()

### Daten aufteilen und Modell laden

In [None]:
train_df, test_df = train_test_split(df, test_size=0.10)

print('Train shape: ',train_df.shape)
print('Test shape: ',test_df.shape)

# Hyperparameter definieren
train_args = ClassificationArgs(fp16=False, reprocess_input_data=True, num_train_epochs=4, overwrite_output_dir=True)
# Create a ClassificationModel
model = ClassificationModel(
    "bert", "distilbert-base-german-cased",
    num_labels=4,
    args=train_args
)

## Modell trainieren

In [None]:
model.train_model(train_df)
result, model_outputs, wrong_predictions = model.eval_model(test_df, f1=f1_multiclass, acc=accuracy_score)

# {'acc': 0.6894586894586895,
# 'eval_loss': 0.8673831869594075,
# 'f1': 0.6894586894586895,
# 'mcc': 0.25262380289641617}

# # Modell speichern
# pack_model('output_path','model_name')
# # Modell laden
# unpack_model('model_name')