# Recurrent Neural Network / Binary
Name: Niklas Donhauser

**Sources**

[1] sklearn https://scikit-learn.org/stable/ <br>
[2] re https://docs.python.org/3/library/re.html <br>
[3] pandas https://pandas.pydata.org/ <br>
[4] time https://docs.python.org/3/library/time.html <br>
[5] numpy https://numpy.org/ <br>
[6] keras https://keras.io/ <br>

**Useful links:**
- https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html <br>
- https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html <br>
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html <br>

## Import libraries

In [1]:
import numpy as np 
import pandas as pd 
import re
import time

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold,train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

## Tokenize

In [2]:
def tokenize_data(data):
    """Convert the preprocessed texts of *data* into a padded id matrix.

    A Keras ``Tokenizer`` created with ``num_words=max_features`` is fitted
    on the ``preprocessedData`` column, the same texts are converted to
    integer sequences, and the sequences are padded with ``pad_sequences``.

    Nothing is returned. The result is published through module-level
    globals: ``X`` holds the padded sequences and ``max_features`` holds the
    vocabulary cap (both are read later by ``setup_model``).
    """
    global X, max_features
    max_features = 2000

    texts = data['preprocessedData'].values
    tokenizer = Tokenizer(num_words=max_features, split=' ')
    tokenizer.fit_on_texts(texts)
    X = pad_sequences(tokenizer.texts_to_sequences(texts))

## Setup model

In [3]:
def setup_model():
    """Create and compile a new LSTM classifier in the global ``model``.

    The network is Embedding -> LSTM -> Dense(2, softmax), compiled with
    categorical cross-entropy, the Adam optimizer and an accuracy metric.
    The embedding reads its vocabulary size from the global
    ``max_features`` and its input length from the second dimension of the
    global ``X``, so ``tokenize_data`` has to run first.
    """
    global model
    embed_dim = 128
    lstm_out = 196

    model = Sequential()
    stack = (
        Embedding(max_features, embed_dim, input_length=X.shape[1]),
        LSTM(lstm_out, dropout=0.2, recurrent_dropout=0),
        Dense(2, activation='softmax'),
    )
    for layer in stack:
        model.add(layer)

    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

## Split data

In [4]:
def split_data():
    """Run a 4-fold stratified cross-validation over the current corpus.

    Reads the module-level ``data`` (corpus DataFrame) and ``X`` (padded
    sequences). The ``sentiment`` column is one-hot encoded with
    ``pd.get_dummies`` to form the targets. For each fold a fresh model is
    built with ``setup_model`` and ``train_model`` is called with the fold's
    train/test slices, the 1-based fold number and the fold's start time.

    The current fold indices are exposed as the globals ``train_index`` and
    ``test_index``.
    """
    global train_index, test_index
    splitter = StratifiedKFold(n_splits=4, shuffle=True, random_state=2)
    Y = pd.get_dummies(data['sentiment']).values

    folds = splitter.split(data, data["sentiment"])
    for iteration, (train_index, test_index) in enumerate(folds, start=1):
        # The timer starts before the model is built, so the recorded time
        # includes model setup as well as training and prediction.
        start_time = time.time()
        setup_model()
        train_model(
            X[train_index],
            X[test_index],
            Y[train_index],
            Y[test_index],
            iteration,
            start_time,
        )

        

## Train model

In [5]:
def train_model(X_train, X_test, Y_train, Y_test, iteration, start_time):
    """Fit the global ``model`` on one fold, then trigger its evaluation.

    Training uses 4 epochs with a batch size of 32. ``batch_size`` and
    ``verbose`` are published as module-level globals. After fitting, all
    arguments are forwarded unchanged to ``predict_model``.
    """
    global batch_size, verbose
    batch_size, verbose = 32, 1
    epochs = 4

    model.fit(
        X_train,
        Y_train,
        epochs=epochs,
        batch_size=batch_size,
        verbose=verbose,
    )
    predict_model(X_train, X_test, Y_train, Y_test, iteration, start_time)

## Predict model

In [6]:
def predict_model(X_train, X_test, Y_train, Y_test, iteration, start_time):
    """Predict the test fold and convert outputs and targets to class ids.

    The global ``model`` predicts ``X_test``. Each prediction row and the
    matching ``Y_test`` row are reduced to the index of their largest entry
    with ``np.argmax``. Both lists are handed to ``get_data`` together with
    ``Y_train``, ``iteration`` and ``start_time``.

    ``X_train`` is accepted but not used here.
    """
    prediction = model.predict(X_test)

    predicted_classes = [np.argmax(row) for row in prediction]
    # Indexed by the number of predictions (not by iterating Y_test), so
    # exactly one target is taken per prediction.
    Y_test_reshape = [np.argmax(Y_test[idx]) for idx in range(len(prediction))]

    get_data(predicted_classes, Y_train, Y_test_reshape, iteration, start_time)

## Get data

In [7]:
def get_data(predicted_classes, Y_train, Y_test_reshape, iteration, start_time):
    """Compute the metrics of one fold and pass them on to ``saveData``.

    Reads two module-level globals set by ``main``: ``file`` (path of the
    current corpus) and ``data`` (the corpus DataFrame).

    Parameters:
        predicted_classes: predicted class index per test sample.
        Y_train: training targets of the fold; only its length is used.
        Y_test_reshape: true class index per test sample.
        iteration: fold number, forwarded to ``saveData``.
        start_time: ``time.time()`` value taken at the start of the fold.
    """
    # Corpus metadata comes from fixed positions of the path split on "_"
    # and "/", so it depends on the directory depth and file naming used in
    # ``files_binary``.
    path_parts = re.split("_|/", file)
    corpus_type = path_parts[5]
    corpus_shortcut = path_parts[6]
    corpus_name = path_parts[7]

    elapsed = time.time() - start_time

    n_text_units = len(data.index)
    n_test_units = len(predicted_classes)
    n_train_units = len(Y_train)

    accuracy = accuracy_score(Y_test_reshape, predicted_classes)

    # The binary scores use class index 0 as the positive label.
    binary_options = {"average": "binary", "pos_label": 0}
    f1_binary = f1_score(Y_test_reshape, predicted_classes, **binary_options)
    precision_binary = precision_score(Y_test_reshape, predicted_classes, **binary_options)
    recall_binary = recall_score(Y_test_reshape, predicted_classes, **binary_options)

    matrix = confusion_matrix(Y_test_reshape, predicted_classes, labels=[1, 0])
    matrixFlat = convertMatrix(matrix)

    report = classification_report(
        Y_test_reshape,
        predicted_classes,
        target_names=[0, 1],
        output_dict=True,
    )

    # Macro and micro averages are not computed in this function; the value
    # 0 is forwarded for all six of them.
    not_computed = 0

    saveData(
        corporaType=corpus_type,
        shortcut=corpus_shortcut,
        name=corpus_name,
        totalTime=elapsed,
        totalTextUnits=n_text_units,
        totalTestUnits=n_test_units,
        totalTrainUnits=n_train_units,
        accuracy=accuracy,
        f1_macro=not_computed,
        precision_macro=not_computed,
        recall_macro=not_computed,
        f1_micro=not_computed,
        precision_micro=not_computed,
        recall_micro=not_computed,
        matrix=matrix,
        f1_binary=f1_binary,
        precision_binary=precision_binary,
        recall_binary=recall_binary,
        y_test=predicted_classes,
        test_set=Y_test_reshape,
        startTime=start_time,
        iteration=iteration,
        matrixFlat=matrixFlat,
        classificationReport=report,
    )

## Transform confusion matrix

In [8]:
def convertMatrix(matrix):
    """Flatten a two-level matrix row by row into a 1-D NumPy array.

    The flattened array is returned and also stored in the module-level
    global ``flatMatrix``.
    """
    global flatMatrix
    cells = [row[col] for row in matrix for col in range(len(row))]
    flatMatrix = np.array(cells)
    return flatMatrix

## Save data

In [9]:
def saveData(corporaType,shortcut,name,totalTime,totalTextUnits,totalTestUnits,totalTrainUnits,accuracy,f1_macro,precision_macro,recall_macro,f1_micro,precision_micro,recall_micro,matrix,f1_binary,precision_binary,recall_binary,y_test,test_set,startTime,iteration, matrixFlat,classificationReport):
    """Append the results of one fold to the two result TSV files.

    ``RNNDataKFold.tsv`` receives the summary row; ``RNNDataKFoldFull.tsv``
    receives the same row plus the two per-sample label lists. Both files
    must already exist: they are read, the new row is appended, the columns
    are put into a fixed order, and the files are written back.

    Parameters:
        corporaType, shortcut, name: corpus metadata stored in the row.
        totalTime: elapsed seconds for the fold.
        totalTextUnits, totalTestUnits, totalTrainUnits: sample counts.
        accuracy, *_macro, *_micro, *_binary: metric values stored as given.
        matrix: unused; the flattened ``matrixFlat`` is what gets stored.
        y_test: stored under "Train Set Full".
        test_set: stored under "Test Set Full".
        startTime: unused.
        iteration: fold number.
        matrixFlat: flattened confusion matrix.
        classificationReport: nested report dict, flattened with
            ``transformReport`` into extra columns.

    NOTE(review): the caller visible in this notebook (``get_data``) passes
    the predicted classes as ``y_test`` and the true labels as ``test_set``,
    so "Train Set Full" actually holds predictions. The column names are
    kept so existing result files stay readable.
    """
    summary_path = "RNNDataKFold.tsv"
    full_path = "RNNDataKFoldFull.tsv"

    df_svm_data = pd.read_csv(summary_path, sep="\t")
    df_svm_data_full = pd.read_csv(full_path, sep="\t")

    allData = {"Iteration":iteration,"Shortcut":shortcut,"Name":name,"Type":corporaType,"Time":totalTime,"Total Length":totalTextUnits,"Training Set":totalTrainUnits,"Test Set":totalTestUnits,"Accuracy":accuracy,"Precision Macro":precision_macro,
               "Precision Micro":precision_micro,"Precision Binary":precision_binary,"Recall Macro":recall_macro,"Recall Micro":recall_micro,"Recall Binary":recall_binary,"F1 Macro":f1_macro,
               "F1 Micro":f1_micro,"F1 Binary":f1_binary,"Matrix":matrixFlat}

    # The full row is the summary row plus the per-sample label lists.
    allDataFull = dict(allData)
    allDataFull["Train Set Full"] = y_test
    allDataFull["Test Set Full"] = test_set

    reportDict = transformReport(classificationReport)
    allData.update(reportDict)
    allDataFull.update(reportDict)

    base_columns = ["Iteration","Shortcut","Name","Type","Time","Total Length","Training Set","Test Set","Accuracy","Precision Macro",
                    "Precision Micro","Precision Binary","Recall Macro","Recall Micro","Recall Binary","F1 Macro",
                    "F1 Micro","F1 Binary","Matrix"]
    full_only_columns = ["Train Set Full","Test Set Full"]
    report_columns = ["0 precision","0 recall","0 f1-score",
                      "0 support","1 precision","1 recall","1 f1-score","1 support","accuracy accuracy",
                      "macro avg precision","macro avg recall","macro avg f1-score","macro avg support","weighted avg precision","weighted avg recall",
                      "weighted avg f1-score","weighted avg support"]

    finalData_svm = pd.concat([df_svm_data, pd.DataFrame([allData])])
    finalData_svm_full = pd.concat([df_svm_data_full, pd.DataFrame([allDataFull])])

    finalData_svm = finalData_svm[base_columns + report_columns]
    finalData_svm_full = finalData_svm_full[base_columns + full_only_columns + report_columns]

    # Persist the extended tables; without this step the appended row is
    # lost when the function returns. index=False keeps the files limited to
    # the selected columns so they can be re-read on the next call.
    finalData_svm.to_csv(summary_path, sep="\t", index=False)
    finalData_svm_full.to_csv(full_path, sep="\t", index=False)
    

## Transform classification report

In [10]:
def transformReport(classificationReport):
    """Flatten a nested classification-report dict into one level.

    For every entry whose value is a dict, each inner item is stored under
    the key ``"<outer key> <inner key>"``. Every other entry is stored under
    ``"<key> <key>"`` (for example ``"accuracy accuracy"``).

    Keys are converted with ``str`` before joining, so non-string keys such
    as integer class labels work for both kinds of entries.

    Parameters:
        classificationReport: mapping of row name to either a dict of
            metrics or a single value.

    Returns:
        A new flat dict; the input is not modified.
    """
    newDict = {}
    for key, value in classificationReport.items():
        mainName = str(key)
        if isinstance(value, dict):
            for metric, score in value.items():
                newDict[mainName + " " + str(metric)] = score
        else:
            # Scalar rows repeat their own name as the metric part.
            newDict[mainName + " " + mainName] = value
    return newDict

## Main function

In [11]:
# Preprocessed binary-sentiment corpora, processed in this order by main().
# get_data() derives corpus type, shortcut and name from fixed positions of
# each path split on "_" and "/", so all entries share the same directory
# depth and "<shortcut>_<name>_..." file naming.
files_binary = [
    "../../../Corpora/Preprocessed/Binary/LT01_gnd_Preprocessed_binary.tsv",
    "../../../Corpora/Preprocessed/Binary/LT02_speechLessing_Preprocessed_binary.tsv",
    "../../../Corpora/Preprocessed/Binary/LT03_historicplays_Preprocessed_binary.tsv",
    "../../../Corpora/Preprocessed/Binary/MI01_mlsa_Preprocessed_binary.tsv",
    "../../../Corpora/Preprocessed/Binary/MI02_germeval_Preprocessed_binary.tsv",
    "../../../Corpora/Preprocessed/Binary/MI03_corpusRauh_Preprocessed_binary.tsv",
    "../../../Corpora/Preprocessed/Binary/NA01_gersen_Preprocessed_binary.tsv",
    "../../../Corpora/Preprocessed/Binary/NA02_gerom_Preprocessed_binary.tsv",
    "../../../Corpora/Preprocessed/Binary/NA03_ompc_Preprocessed_binary.tsv",
    "../../../Corpora/Preprocessed/Binary/RE01_usage_Preprocessed_binary.tsv",
    "../../../Corpora/Preprocessed/Binary/RE03_critics_Preprocessed_binary.tsv",
    "../../../Corpora/Preprocessed/Binary/SM01_sb10k_Preprocessed_binary.tsv",
    "../../../Corpora/Preprocessed/Binary/SM02_potts_Preprocessed_binary.tsv",
    "../../../Corpora/Preprocessed/Binary/SM03_multiSe_Preprocessed_binary.tsv",
    "../../../Corpora/Preprocessed/Binary/SM04_gertwittersent_Preprocessed_binary.tsv",
    "../../../Corpora/Preprocessed/Binary/SM05_ironycorpus_Preprocessed_binary.tsv",
    "../../../Corpora/Preprocessed/Binary/SM06_celeb_Preprocessed_binary.tsv",
    "../../../Corpora/Preprocessed/Binary/RE02_scare_Preprocessed_binary_balanced.tsv",
    "../../../Corpora/Preprocessed/Binary/RE04_filmstarts_Preprocessed_binary.tsv",
    "../../../Corpora/Preprocessed/Binary/RE05_amazonreviews_Preprocessed_binary_balanced.tsv",
]
def main():
    """Run tokenization and cross-validation for every corpus in ``files_binary``.

    For each path the corpus is loaded as a tab-separated file into the
    global ``data``, and the path itself is kept in the global ``file``.
    Both globals are read by the functions called further down the chain.
    """
    global data, file
    for corpus_path in files_binary:
        # get_data() reads the module-level ``file`` to derive corpus names.
        file = corpus_path
        print("Start for corpora: ", file)
        data = pd.read_csv(file, sep="\t")
        tokenize_data(data)
        split_data()
        print("Finish")


main()


Start for corpora:  ../../../Corpora/Preprocessed/Binary/RE05_amazonreviews_Preprocessed_binary_balanced.tsv


2022-11-17 09:02:44.169035: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-11-17 09:02:44.169097: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: c4bc656f9d01
2022-11-17 09:02:44.169115: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: c4bc656f9d01
2022-11-17 09:02:44.169298: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 515.65.1
2022-11-17 09:02:44.169337: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 515.65.1
2022-11-17 09:02:44.169352: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 515.65.1
2022-11-17 09:02:44.169766: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instruct

Epoch 1/4
   4/1641 [..............................] - ETA: 1:02:49 - loss: 0.6893 - accuracy: 0.6016

KeyboardInterrupt: 