# GBERT 
Name: Niklas Donhauser

**Source**

[1] sklearn https://scikit-learn.org/stable/ <br>
[2] re https://docs.python.org/3/library/re.html <br>
[3] pandas https://pandas.pydata.org/ <br>
[4] time https://docs.python.org/3/library/time.html <br>
[5] numpy https://numpy.org/ <br>
[6] Simple Transformers https://simpletransformers.ai/ <br>

**Useful links:**
- https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html <br>
- https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html <br>
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html <br>

## Import libraries

In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold,train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import numpy as np
import re
import logging
import time

## Split data

In [2]:
def splitData(data):
    """Train and evaluate one model per fold of a stratified 4-fold split.

    The folds are stratified on ``data["labels"]`` and shuffled with a fixed
    seed (``random_state=2``). For every fold a training and a test frame
    with the columns ``text`` and ``labels`` are built, a model is trained on
    the training frame and then evaluated on the test frame.

    Args:
        data: DataFrame providing a ``text`` and a ``labels`` column.

    Side effects:
        Rebinds the module-level names ``train_index`` and ``test_index`` on
        every fold.
    """
    global train_index, test_index
    column_order = ["text", "labels"]
    folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=2)

    for iteration, (train_index, test_index) in enumerate(folds.split(data, data["labels"]), start=1):
        # The clock starts before slicing so the reported time covers the
        # whole fold (data preparation, training and evaluation).
        startTime = time.time()

        x_train = data.text.values[train_index]
        x_test = data.text.values[test_index]
        y_train = data.labels.values[train_index]
        y_test = data.labels.values[test_index]

        training_Df = pd.DataFrame({"text": x_train, "labels": y_train}, columns=column_order)
        test_Df = pd.DataFrame({"text": x_test, "labels": y_test}, columns=column_order)

        trainModel(training_Df)
        predictModel(test_Df, x_train, x_test, y_train, y_test, startTime, iteration)
           

## Train model

In [3]:
def trainModel(training_Df):
    """Create a new ``deepset/gbert-large`` classifier and train it.

    A new two-label ``ClassificationModel`` is constructed on every call, so
    each fold starts from the pretrained checkpoint again.

    Args:
        training_Df: Training frame; it is printed and then passed unchanged
            to ``train_model``.

    Side effects:
        Rebinds the module-level ``model`` to the newly created model and
        prints the training frame.
    """
    global model
    print("Training Model Now")
    print(training_Df)

    train_args = dict(
        reprocess_input_data=True,
        fp16=False,
        num_train_epochs=4,
        overwrite_output_dir=True,
        train_batch_size=32,
        eval_batch_size=32,
        use_multiprocessing=False,
        use_multiprocessing_for_evaluation=False,
        no_save=True,
    )

    model = ClassificationModel(
        "bert",
        "deepset/gbert-large",
        num_labels=2,
        use_cuda=True,
        args=train_args,
    )
    model.train_model(training_Df)

## Predict model

In [4]:
def predictModel(test_Df,x_train,x_test,y_train,y_test,startTime,iteration):
    """Evaluate the module-level ``model`` on the test fold and record results.

    Args:
        test_Df: Test frame handed to ``model.eval_model``.
        x_train, x_test: Fold texts; accepted but not used here.
        y_train: Training labels, forwarded to ``getData``.
        y_test: True test labels, forwarded to ``getData``.
        startTime: Fold start timestamp, forwarded to ``getData``.
        iteration: Fold number, forwarded to ``getData``.
    """
    def passthrough_predictions(labels, preds):
        # Not a real metric: it hands the predictions back so that they show
        # up in the evaluation result under the keyword name "precision".
        return preds

    evaluation, _model_outputs, _wrong_predictions = model.eval_model(
        test_Df,
        acc=accuracy_score,
        precision=passthrough_predictions,
    )

    predicted_labels = evaluation["precision"]
    getData(predicted_labels, y_train, y_test, startTime, iteration)
    
    

## Get data

In [5]:
def getData(test_set,y_train,y_test,startTime,iteration):
    """Compute the evaluation metrics of one fold and pass them to ``saveData``.

    Reads the module-level names ``file`` (path of the current corpus) and
    ``data`` (the full corpus frame).

    Args:
        test_set: Predicted labels for the test fold.
        y_train: Training labels; only their count is recorded.
        y_test: True labels of the test fold.
        startTime: ``time.time()`` value taken at the start of the fold.
        iteration: Fold number.
    """
    # Corpus metadata is taken from fixed positions of the path split on
    # "_" and "/", so these indices only fit paths shaped like the entries
    # of ``filesBinaryTest``.
    splitString = re.split(r"_|/", file)
    corporaType = splitString[8]
    shortcut = splitString[9]
    name = splitString[10]

    totalTime = time.time() - startTime

    totalTextUnits = len(data.index)
    totalTestUnits = len(test_set)
    totalTrainUnits = len(y_train)

    accuracy = accuracy_score(y_test, test_set)

    # Macro and micro averages were previously left at their placeholder
    # value 0 and stored as such; compute them like the binary scores.
    f1_macro = f1_score(y_test, test_set, average="macro")
    precision_macro = precision_score(y_test, test_set, average="macro")
    recall_macro = recall_score(y_test, test_set, average="macro")

    f1_micro = f1_score(y_test, test_set, average="micro")
    precision_micro = precision_score(y_test, test_set, average="micro")
    recall_micro = recall_score(y_test, test_set, average="micro")

    # Binary scores treat label 0 as the positive class.
    f1_binary = f1_score(y_test, test_set, average="binary", pos_label=0)
    precision_binary = precision_score(y_test, test_set, average="binary", pos_label=0)
    recall_binary = recall_score(y_test, test_set, average="binary", pos_label=0)

    # Rows and columns of the matrix are ordered label 1 first, then label 0.
    matrix = confusion_matrix(y_test, test_set, labels=[1, 0])
    matrixFlat = convertMatrix(matrix)

    target_names = [0, 1]
    classificationReport = classification_report(
        y_test, test_set, target_names=target_names, output_dict=True
    )

    saveData(
        corporaType, shortcut, name, totalTime,
        totalTextUnits, totalTestUnits, totalTrainUnits,
        accuracy,
        f1_macro, precision_macro, recall_macro,
        f1_micro, precision_micro, recall_micro,
        matrix,
        f1_binary, precision_binary, recall_binary,
        y_test, test_set, startTime, iteration,
        matrixFlat, classificationReport,
    )

## Transform confusion matrix

In [6]:
def convertMatrix(matrix):
    """Flatten a two-level matrix row by row into a one-dimensional array.

    Args:
        matrix: Sequence of rows, each row a sequence of cells.

    Returns:
        A NumPy array with all cells in row-major order.

    Side effects:
        Also stores the result in the module-level ``flatMatrix``.
    """
    global flatMatrix
    cells = [cell for row in matrix for cell in row]
    flatMatrix = np.array(cells)
    return flatMatrix

## Save data

In [7]:
def saveData(corporaType,shortcut,name,totalTime,totalTextUnits,totalTestUnits,totalTrainUnits,accuracy,f1_macro,precision_macro,recall_macro,f1_micro,precision_micro,recall_micro,matrix,f1_binary,precision_binary,recall_binary,y_test,test_set,startTime,iteration, matrixFlat,classificationReport):
    """Append the results of one fold to the two result TSV files.

    ``GBertLargeDataKFold.tsv`` receives the metrics and the flattened
    classification report. ``GBertLargeDataKFoldFull.tsv`` receives the same
    row plus the raw label arrays. Both files are read first, so they must
    already exist, and are then rewritten with the new row appended.

    ``matrix`` and ``startTime`` are accepted but not written anywhere.
    """
    summary_path = "GBertLargeDataKFold.tsv"
    full_path = "GBertLargeDataKFoldFull.tsv"

    previous_summary = pd.read_csv(summary_path, sep="\t")
    previous_full = pd.read_csv(full_path, sep="\t")

    # Metrics shared by both files; the key order is also the column order.
    metrics_row = {
        "Iteration": iteration,
        "Shortcut": shortcut,
        "Name": name,
        "Type": corporaType,
        "Time": totalTime,
        "Total Length": totalTextUnits,
        "Training Set": totalTrainUnits,
        "Test Set": totalTestUnits,
        "Accuracy": accuracy,
        "Precision Macro": precision_macro,
        "Precision Micro": precision_micro,
        "Precision Binary": precision_binary,
        "Recall Macro": recall_macro,
        "Recall Micro": recall_micro,
        "Recall Binary": recall_binary,
        "F1 Macro": f1_macro,
        "F1 Micro": f1_micro,
        "F1 Binary": f1_binary,
        "Matrix": matrixFlat,
    }
    # "Train Set Full" holds the true test labels (y_test), "Test Set Full"
    # the predictions.
    raw_labels = {"Train Set Full": y_test, "Test Set Full": test_set}

    metric_columns = list(metrics_row)
    raw_columns = list(raw_labels)
    report_columns = [
        "0 precision", "0 recall", "0 f1-score", "0 support",
        "1 precision", "1 recall", "1 f1-score", "1 support",
        "accuracy accuracy",
        "macro avg precision", "macro avg recall", "macro avg f1-score", "macro avg support",
        "weighted avg precision", "weighted avg recall", "weighted avg f1-score", "weighted avg support",
    ]

    reportDict = transformReport(classificationReport)
    summary_row = {**metrics_row, **reportDict}
    full_row = {**metrics_row, **raw_labels, **reportDict}

    summary_table = pd.concat([previous_summary, pd.DataFrame([summary_row])])
    full_table = pd.concat([previous_full, pd.DataFrame([full_row])])

    summary_table = summary_table[metric_columns + report_columns]
    full_table = full_table[metric_columns + raw_columns + report_columns]

    summary_table.to_csv(summary_path, sep="\t", index=False)
    full_table.to_csv(full_path, sep="\t", index=False)

## Transform classification report

In [8]:
def transformReport(classificationReport):
    """Flatten a nested classification report into a single-level dict.

    Entries whose value is a dict are expanded to ``"<key> <subkey>"``
    (for example ``"0 precision"``). Every other entry is stored under the
    key repeated twice (for example ``"accuracy accuracy"``). Keys are
    converted with ``str`` first, so non-string keys work for both kinds of
    entry.

    Args:
        classificationReport: Mapping of report keys to either a dict of
            values or a single value.

    Returns:
        A new flat dict; the input is not modified.

    Side effects:
        Prints the input report and the flattened result.
    """
    print(classificationReport)
    newDict = {}
    for key, value in classificationReport.items():
        mainName = str(key)
        if isinstance(value, dict):
            for k, v in value.items():
                newDict[f"{mainName} {k}"] = v
        else:
            # Use the stringified key on both sides; concatenating the raw
            # key raised TypeError whenever it was not a string.
            newDict[f"{mainName} {mainName}"] = value

    print("DICT:", newDict)
    return newDict

## Main function

In [9]:
# Corpora to evaluate; each entry is the path of a tab-separated file.
filesBinaryTest=["../../../Corpora/Preprocessed_Transformers_No_Preprocessing/Binary/SM04_gertwittersent_Preprocessed_binary_Transformer.tsv"]

def main():
    """Run the cross-validation for every corpus in ``filesBinaryTest``.

    The current path and the loaded frame are published as the module-level
    names ``file`` and ``data`` because ``getData`` reads them from there.
    """
    global data, file
    for corpus_path in filesBinaryTest:
        file = corpus_path
        print("Start for corpora: ", file)
        data = pd.read_csv(file, sep="\t")
        splitData(data)
        print("Finish")

main()


Start for corpora:  ../../../Corpora/Preprocessed_Transformers_No_Preprocessing/Binary/SM04_gertwittersent_Preprocessed_binary_Transformer.tsv
[ 1  2  3  4  5  6  7  8  9 12 14 15 16 17 18 19 21 23 24 25 27 28 29 30
 31 32 33 34 35 37]
---------------------------
[ 0  4  5  6  8  9 10 11 12 13 15 16 18 19 20 21 22 23 24 25 26 27 28 30
 32 33 34 35 36 37]
---------------------------
[ 0  1  2  3  4  6  7  8  9 10 11 13 14 15 17 18 20 22 24 26 27 28 29 30
 31 32 33 36 37 42]
---------------------------
4ter:
[ 0  1  2  3  5  7 10 11 12 13 14 16 17 19 20 21 22 23 25 26 29 31 34 35
 36 38 39 40 41 43]
Training Model Now
                                                    text  labels
0      @TuT_Parody so einen Rasen hätte sich Hoeneß d...       1
1                          @kopfding Pass bloß auf!!! ;)       0
2      @kopfding @Stephan535 Es gibt nichts antikeres...       1
3      Abstiegsangst! - Kind will mit Korkut sprechen...       1
4                  @kopfding @KleinePfeife @tob1i N

Some weights of the model checkpoint at deepset/gbert-large were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initia

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/660 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/660 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/660 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/660 [00:00<?, ?it/s]

-------------------------


Running Evaluation:   0%|          | 0/220 [00:00<?, ?it/s]

{'mcc': 0.7183286977971644, 'tp': 2438, 'tn': 3634, 'fp': 483, 'fn': 479, 'auroc': 0.929714906519445, 'auprc': 0.8932849035631544, 'acc': 0.8632357122547626, 'precision': array([0, 0, 1, ..., 1, 1, 0]), 'eval_loss': 0.5806435945528474}
----------------
{0: {'precision': 0.8835399951373694, 'recall': 0.88268156424581, 'f1-score': 0.8831105710814096, 'support': 4117}, 1: {'precision': 0.8346456692913385, 'recall': 0.8357901954062393, 'f1-score': 0.8352175402535116, 'support': 2917}, 'accuracy': 0.8632357122547626, 'macro avg': {'precision': 0.859092832214354, 'recall': 0.8592358798260247, 'f1-score': 0.8591640556674606, 'support': 7034}, 'weighted avg': {'precision': 0.8632635168187922, 'recall': 0.8632357122547626, 'f1-score': 0.8632493298353222, 'support': 7034}}
DICT: {'0 precision': 0.8835399951373694, '0 recall': 0.88268156424581, '0 f1-score': 0.8831105710814096, '0 support': 4117, '1 precision': 0.8346456692913385, '1 recall': 0.8357901954062393, '1 f1-score': 0.8352175402535116, 