# GELECTRA 
Name: Niklas Donhauser

**Sources**

[1] sklearn https://scikit-learn.org/stable/ <br>
[2] re https://docs.python.org/3/library/re.html <br>
[3] pandas https://pandas.pydata.org/ <br>
[4] time https://docs.python.org/3/library/time.html <br>
[5] numpy https://numpy.org/ <br>
[6] Simple Transformers https://simpletransformers.ai/ <br>

**Useful links:**
- https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html <br>
- https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html <br>
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html <br>

## Import Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold,train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import numpy as np
import re
import logging
import time

## Split data

In [2]:
def splitData(data):
    """Run a stratified 4-fold cross-validation over ``data``.

    For every fold the training part is handed to ``trainModel`` and the
    held-out part is evaluated through ``predictModel``.

    Args:
        data: DataFrame providing a ``text`` and a ``labels`` column.
    """
    # The indices of the current fold are published as module-level names.
    global train_index, test_index
    folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=2)

    for iteration, (train_index, test_index) in enumerate(
        folds.split(data, data["labels"]), start=1
    ):
        # The timer covers both training and evaluation of this fold.
        startTime = time.time()

        texts = data.text.values
        labels = data.labels.values
        x_train, x_test = texts[train_index], texts[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        training_Df = pd.DataFrame(
            {"text": x_train, "labels": y_train}, columns=["text", "labels"]
        )
        test_Df = pd.DataFrame(
            {"text": x_test, "labels": y_test}, columns=["text", "labels"]
        )

        trainModel(training_Df)
        predictModel(test_Df, x_train, x_test, y_train, y_test, startTime, iteration)

## Train model

In [3]:
def trainModel(training_Df):
    """Train a fresh ``deepset/gelectra-large`` classifier on ``training_Df``.

    The resulting ``ClassificationModel`` is stored in the module-level name
    ``model`` so that ``predictModel`` can use it afterwards.

    Args:
        training_Df: DataFrame with the training texts and their labels.
    """
    global model
    print("Training Model Now")
    print(training_Df)

    settings = dict(
        reprocess_input_data=True,
        fp16=False,
        num_train_epochs=4,
        overwrite_output_dir=True,
        train_batch_size=32,
        eval_batch_size=32,
        use_multiprocessing=False,
        use_multiprocessing_for_evaluation=False,
        no_save=True,
    )

    # A new model is created on every call, so folds do not share weights.
    model = ClassificationModel(
        "electra",
        "deepset/gelectra-large",
        num_labels=3,
        use_cuda=True,
        args=settings,
    )
    model.train_model(training_Df)

## Predict model

In [4]:
def predictModel(test_Df,x_train,x_test,y_train,y_test,startTime,iteration):
    """Evaluate the module-level ``model`` on ``test_Df`` and forward the predictions.

    Args:
        test_Df: DataFrame with the held-out texts and labels.
        x_train, x_test: Fold texts (not used here).
        y_train: Training labels, forwarded to ``getData``.
        y_test: True labels of the held-out part, forwarded to ``getData``.
        startTime: Timestamp taken at the start of the fold.
        iteration: Number of the current fold.
    """

    def return_predictions(labels, preds):
        # Registered as an extra "metric" so that the raw predictions end up
        # in the result dictionary under the keyword name used below.
        return preds

    result, _model_outputs, _wrong_predictions = model.eval_model(
        test_Df, acc=accuracy_score, precision=return_predictions
    )

    predictions = result["precision"]
    getData(predictions, y_train, y_test, startTime, iteration)
    
    

## Get data

In [5]:
def getData(test_set,y_train,y_test,startTime,iteration):
    """Compute the evaluation metrics of one fold and pass them to ``saveData``.

    Reads the module-level names ``file`` (path of the current corpus) and
    ``data`` (the loaded corpus).

    Args:
        test_set: Predicted labels for the held-out part.
        y_train: Labels of the training part (only its length is used).
        y_test: True labels of the held-out part.
        startTime: Timestamp taken at the start of the fold.
        iteration: Number of the current fold.
    """
    # Corpus metadata is taken from fixed positions of the path split on "_" and "/".
    path_parts = re.split("_|/", file)
    corporaType, shortcut, name = path_parts[8], path_parts[9], path_parts[10]

    totalTime = time.time() - startTime

    totalTextUnits = len(data.index)
    totalTestUnits = len(test_set)
    totalTrainUnits = len(y_train)

    accuracy = accuracy_score(y_test, test_set)

    label_order = [1, 0, 2]
    macro = {"average": "macro", "labels": label_order}
    micro = {"average": "micro", "labels": label_order}

    f1_macro = f1_score(y_test, test_set, **macro)
    precision_macro = precision_score(y_test, test_set, **macro)
    recall_macro = recall_score(y_test, test_set, **macro)

    # The micro F1 is computed without an explicit label list.
    f1_micro = f1_score(y_test, test_set, average="micro")
    precision_micro = precision_score(y_test, test_set, **micro)
    recall_micro = recall_score(y_test, test_set, **micro)

    matrix = confusion_matrix(y_test, test_set, labels=label_order)
    matrixFlat = convertMatrix(matrix)

    classificationReport = classification_report(
        y_test, test_set, target_names=[0, 1, 2], output_dict=True
    )

    # The binary metrics are never computed here and are stored as zero.
    f1_binary = precision_binary = recall_binary = 0

    saveData(
        corporaType, shortcut, name, totalTime,
        totalTextUnits, totalTestUnits, totalTrainUnits,
        accuracy,
        f1_macro, precision_macro, recall_macro,
        f1_micro, precision_micro, recall_micro,
        matrix,
        f1_binary, precision_binary, recall_binary,
        y_test, test_set, startTime, iteration,
        matrixFlat, classificationReport,
    )

## Transform confusion matrix

In [6]:
def convertMatrix(matrix):
    """Flatten a two-dimensional matrix row by row into a 1-D NumPy array.

    The result is also stored in the module-level name ``flatMatrix``.

    Args:
        matrix: Sequence of rows, each row being a sequence of cells.

    Returns:
        NumPy array holding all cells in row-major order.
    """
    global flatMatrix
    cells = [cell for row in matrix for cell in row]
    flatMatrix = np.array(cells)
    return flatMatrix

## Save data

In [7]:
def saveData(corporaType,shortcut,name,totalTime,totalTextUnits,totalTestUnits,totalTrainUnits,accuracy,f1_macro,precision_macro,recall_macro,f1_micro,precision_micro,recall_micro,matrix,f1_binary,precision_binary,recall_binary,y_test,test_set,startTime,iteration, matrixFlat,classificationReport):
    """Append the metrics of one fold to the two result TSV files.

    ``GElectraDataKFold.tsv`` receives the metrics only, while
    ``GElectraDataKFoldFull.tsv`` additionally stores the label arrays.
    Both files are read, extended by one row and written back, so they have
    to exist already.

    Args:
        corporaType, shortcut, name: Corpus metadata.
        totalTime: Duration of the fold in seconds.
        totalTextUnits, totalTestUnits, totalTrainUnits: Size information.
        accuracy, f1_*, precision_*, recall_*: Metric values of the fold.
        matrix: Confusion matrix (not written; ``matrixFlat`` is stored instead).
        y_test: True labels of the held-out part.
        test_set: Predicted labels of the held-out part.
        startTime: Timestamp of the fold start (not written).
        iteration: Number of the fold.
        matrixFlat: Flattened confusion matrix.
        classificationReport: Report dictionary, flattened via ``transformReport``.
    """
    metric_columns = [
        "Iteration", "Shortcut", "Name", "Type", "Time", "Total Length",
        "Training Set", "Test Set", "Accuracy", "Precision Macro",
        "Precision Micro", "Precision Binary", "Recall Macro", "Recall Micro",
        "Recall Binary", "F1 Macro", "F1 Micro", "F1 Binary", "Matrix",
    ]
    label_columns = ["Train Set Full", "Test Set Full"]
    report_columns = [
        "0 precision", "0 recall", "0 f1-score", "0 support",
        "1 precision", "1 recall", "1 f1-score", "1 support",
        "2 f1-score", "2 support", "2 precision", "2 recall",
        "accuracy accuracy",
        "macro avg precision", "macro avg recall", "macro avg f1-score",
        "macro avg support",
        "weighted avg precision", "weighted avg recall",
        "weighted avg f1-score", "weighted avg support",
    ]

    stored = pd.read_csv("GElectraDataKFold.tsv", sep="\t")
    stored_full = pd.read_csv("GElectraDataKFoldFull.tsv", sep="\t")

    row = {
        "Iteration": iteration,
        "Shortcut": shortcut,
        "Name": name,
        "Type": corporaType,
        "Time": totalTime,
        "Total Length": totalTextUnits,
        "Training Set": totalTrainUnits,
        "Test Set": totalTestUnits,
        "Accuracy": accuracy,
        "Precision Macro": precision_macro,
        "Precision Micro": precision_micro,
        "Precision Binary": precision_binary,
        "Recall Macro": recall_macro,
        "Recall Micro": recall_micro,
        "Recall Binary": recall_binary,
        "F1 Macro": f1_macro,
        "F1 Micro": f1_micro,
        "F1 Binary": f1_binary,
        "Matrix": matrixFlat,
    }

    # The full row is the base row plus the two label arrays.
    # NOTE(review): "Train Set Full" is filled with y_test - confirm the column name is intended.
    row_full = dict(row)
    row_full["Train Set Full"] = y_test
    row_full["Test Set Full"] = test_set

    flat_report = transformReport(classificationReport)
    row.update(flat_report)
    row_full.update(flat_report)

    combined = pd.concat([stored, pd.DataFrame([row])])
    combined_full = pd.concat([stored_full, pd.DataFrame([row_full])])

    # Re-select the columns so that both files keep a fixed column order.
    combined = combined[metric_columns + report_columns]
    combined_full = combined_full[metric_columns + label_columns + report_columns]

    combined.to_csv("GElectraDataKFold.tsv", sep="\t", index=False)
    combined_full.to_csv("GElectraDataKFoldFull.tsv", sep="\t", index=False)

## Transform classification report

In [8]:
def transformReport(classificationReport):
    """Flatten a nested classification report into a single-level dictionary.

    Nested entries such as ``{0: {"precision": p}}`` become ``"0 precision"``.
    Scalar entries such as ``{"accuracy": a}`` become ``"accuracy accuracy"``,
    i.e. the key is repeated to form the flat name.

    Both parts of every flat name are converted with ``str``, so non-string
    keys (for example integer class labels) are handled for scalar entries
    as well as for nested ones.

    Args:
        classificationReport: Mapping of report keys to either a scalar or a
            dictionary of metric values.

    Returns:
        Dictionary mapping ``"<key> <metric>"`` to the metric value.
    """
    flat = {}
    for key, entry in classificationReport.items():
        prefix = str(key)
        if isinstance(entry, dict):
            for metric, value in entry.items():
                flat[prefix + " " + str(metric)] = value
        else:
            flat[prefix + " " + prefix] = entry
    return flat

## Main function

In [9]:
# Corpus files to process; each entry is read as a tab-separated file in main().
filesTernaryTest=["../../../Corpora/Preprocessed_Transformers_No_Preprocessing/Ternary/SM04_gertwittersent_Preprocessed_ternary_Transformer.tsv"]

def main():
    """Run the cross-validation pipeline for every corpus in ``filesTernaryTest``.

    ``data`` and ``file`` are bound as module-level names; ``getData`` reads
    both of them when it derives the corpus metadata and the corpus size.
    """
    global data, file
    # The loop variable itself is the global ``file``, so it always names the
    # corpus that is currently being processed.
    for file in filesTernaryTest:
        print("Start for corpora: ",file)
        data=pd.read_csv(file, sep="\t")
        splitData(data)
        print("Finish")

# Runs the whole pipeline as soon as this cell is executed.
main()


Start for corpora:  ../../../Corpora/Preprocessed_Transformers_No_Preprocessing/Ternary/SM04_gertwittersent_Preprocessed_ternary_Transformer.tsv
[ 3  4  6  7 10 12 13 14 16 17 22 23 24 27 29 31 32 34 36 38 39 40 41 43
 44 46 47 48 49 50]
---------------------------
[ 0  1  2  5  6  8  9 11 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
 29 30 31 32 33 34]
---------------------------
2 ter
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 15 16 18 19 20 21 22 23 25 26
 27 28 30 32 33 35]
Training Model Now
                                                    text  labels
0      @TuT_Parody so einen Rasen hätte sich Hoeneß d...       1
1      RT @heisec: Apples iCloud verschickt und empfä...       2
2                          @kopfding Pass bloß auf!!! ;)       0
3      @kopfding @Stephan535 Es gibt nichts antikeres...       1
4      Abstiegsangst! - Kind will mit Korkut sprechen...       1
...                                                  ...     ...
48371  3D Animation: Vettel und Ricciard

Some weights of the model checkpoint at deepset/gelectra-large were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at deepset/gelectra-large and are newly initialized: ['classifier.out_proj.weight', 'clas

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/1512 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Running Epoch 2 of 4:   0%|          | 0/1512 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/1512 [00:00<?, ?it/s]

-------------------------


Running Evaluation:   0%|          | 0/504 [00:00<?, ?it/s]

{'mcc': 0.46412179450529084, 'acc': 0.6896744186046512, 'precision': array([0, 2, 0, ..., 2, 2, 2]), 'eval_loss': 1.1207546996218818}
----------------
{0: {'precision': 0.6090279436350609, 'recall': 0.6193830459072139, 'f1-score': 0.6141618497109825, 'support': 4117}, 1: {'precision': 0.6343963553530751, 'recall': 0.5728488172780254, 'f1-score': 0.6020536840208972, 'support': 2917}, 2: {'precision': 0.7416165090283748, 'recall': 0.7589924100758992, 'f1-score': 0.7502038597444957, 'support': 9091}, 'accuracy': 0.6896744186046512, 'macro avg': {'precision': 0.661680269338837, 'recall': 0.6504080910870461, 'f1-score': 0.6554731311587917, 'support': 16125}, 'weighted avg': {'precision': 0.6883682416178245, 'recall': 0.6896744186046512, 'f1-score': 0.6886696570844206, 'support': 16125}}
DICT: {'0 precision': 0.6090279436350609, '0 recall': 0.6193830459072139, '0 f1-score': 0.6141618497109825, '0 support': 4117, '1 precision': 0.6343963553530751, '1 recall': 0.5728488172780254, '1 f1-score':