# Convolutional Neural Network / Binary
Name: Niklas Donhauser

**Sources**

[1] sklearn https://scikit-learn.org/stable/ <br>
[2] re https://docs.python.org/3/library/re.html <br>
[3] pandas https://pandas.pydata.org/ <br>
[4] time https://docs.python.org/3/library/time.html <br>
[5] numpy https://numpy.org/ <br>
[6] keras https://keras.io/ <br>

**Useful links:**
- https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html <br>
- https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html <br>
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html <br>

## Import libraries

In [1]:
import numpy as np 
import pandas as pd 
import re
import time

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, SpatialDropout1D, Conv1D, MaxPooling1D, Flatten
from keras.utils.np_utils import to_categorical

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold,train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

## Tokenize

In [2]:
def tokenize_data(X_train, X_test):
    """Fit a Keras ``Tokenizer`` on the training texts and encode both splits.

    Nothing is returned; the results are published as module-level globals
    that the later stages read:

    * ``vocab_size``      -- number of distinct training tokens plus one
    * ``max_length``      -- largest whitespace-separated token count in ``X_train``
    * ``X_train_encoded`` -- padded integer sequences for ``X_train``
    * ``X_test_encoded``  -- padded integer sequences for ``X_test``

    Parameters
    ----------
    X_train : iterable of str
        Training texts; the vocabulary is learned from these only.
    X_test : iterable of str
        Test texts; encoded with the vocabulary learned from ``X_train``.
    """
    global vocab_size, max_length, X_train_encoded, X_test_encoded

    text_tokenizer = Tokenizer()
    text_tokenizer.fit_on_texts(X_train)

    # One extra slot on top of the learned vocabulary.
    vocab_size = 1 + len(text_tokenizer.word_index)
    # The padding length is derived from the training split alone.
    max_length = max(len(text.split()) for text in X_train)

    X_train_encoded = encode_reviews(text_tokenizer, max_length, X_train)
    X_test_encoded = encode_reviews(text_tokenizer, max_length, X_test)
    

## Encode

In [3]:
def encode_reviews(tokenizer, max_length, X_train_processed):
    """Convert texts to integer sequences padded to a common length.

    Parameters
    ----------
    tokenizer : fitted tokenizer exposing ``texts_to_sequences``
    max_length : int
        Target length passed to ``pad_sequences`` as ``maxlen``.
    X_train_processed : iterable of str
        Texts to encode (despite the name, used for both train and test).

    Returns
    -------
    The result of ``pad_sequences`` with padding appended at the end
    (``padding="post"``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(X_train_processed),
        maxlen=max_length,
        padding="post",
    )

## Setup model

In [4]:
def setup_model():
    """Build and compile a fresh binary CNN classifier into the global ``model``.

    Reads the globals ``vocab_size`` and ``max_length`` (set by
    ``tokenize_data``). The network is: embedding (100 dims) -> 1D convolution
    (32 filters, kernel size 8, ReLU) -> max pooling (size 2) -> flatten ->
    dense (10, ReLU) -> dense (1, sigmoid). It is compiled with binary
    cross-entropy, the Adam optimizer, and accuracy as the tracked metric.
    """
    global model

    layers = [
        Embedding(vocab_size, 100, input_length=max_length),
        Conv1D(32, 8, activation="relu"),
        MaxPooling1D(2),
        Flatten(),
        Dense(10, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
    model = Sequential(layers)
    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )

## Split data

In [5]:
def split_data():
    """Run 4-fold stratified cross-validation over the global ``data`` frame.

    Reads the global ``data`` (columns ``preprocessedData`` and ``sentiment``),
    maps textual sentiment labels to integers (positive -> 1, negative -> 0,
    neutral -> 2) and, for every fold, tokenizes the texts, builds a fresh
    model and trains/evaluates it through ``train_model``.

    Side effects: overwrites ``data["sentiment"]`` with the numeric labels and
    leaves the indices of the last fold in the globals ``train_index`` and
    ``test_index``.
    """
    global train_index, test_index
    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=2)

    # Assign the replaced column back instead of calling
    # ``data["sentiment"].replace(..., inplace=True)``: the chained in-place
    # form operates on an intermediate Series and is not guaranteed to update
    # ``data`` (it is a silent no-op under pandas Copy-on-Write).
    sentiment = data["sentiment"]
    for label, code in (("positive", 1), ("negative", 0), ("neutral", 2)):
        sentiment = sentiment.replace(to_replace=label, value=code, regex=True)
    data["sentiment"] = sentiment

    texts = data["preprocessedData"]
    labels = data["sentiment"]

    for iteration, (train_index, test_index) in enumerate(skf.split(texts, labels), start=1):
        start_time = time.time()

        # ``skf.split`` yields positional indices, so select with ``.iloc``;
        # plain ``series[indices]`` is label-based and only coincides with
        # positions when the frame has a default RangeIndex.
        X_train, X_test = texts.iloc[train_index], texts.iloc[test_index]
        Y_train, Y_test = labels.iloc[train_index], labels.iloc[test_index]

        tokenize_data(X_train, X_test)
        setup_model()
        train_model(X_train_encoded, X_test_encoded, Y_train, Y_test, iteration, start_time)

        

## Train model

In [6]:
def train_model(X_train_encoded, X_test_encoded, Y_train, Y_test, iteration, start_time):
    """Fit the global ``model`` on one fold, then hand over to ``predict_model``.

    Training runs for 4 epochs with a batch size of 32 and verbose output.
    ``batch_size`` and ``verbose`` are also published as module-level globals.

    Parameters
    ----------
    X_train_encoded, X_test_encoded : padded integer sequences for the fold
    Y_train, Y_test : labels for the fold
    iteration : int
        Fold counter, forwarded unchanged.
    start_time : float
        ``time.time()`` value taken at the start of the fold, forwarded unchanged.
    """
    global batch_size, verbose
    batch_size, verbose = 32, 1
    epochs = 4

    model.fit(
        X_train_encoded,
        Y_train,
        epochs=epochs,
        batch_size=batch_size,
        verbose=verbose,
    )
    predict_model(X_train_encoded, X_test_encoded, Y_train, Y_test, iteration, start_time)

## Predict model

In [7]:
def predict_model(X_train_encoded, X_test_encoded, Y_train, Y_test, iteration, start_time):
    """Evaluate the global ``model`` on the test fold and pass results to ``get_data``.

    Prints the accuracy reported by ``model.evaluate``, the number of samples
    predicted as class 0 and class 1, and the accuracy recomputed with
    scikit-learn from the thresholded predictions.

    Parameters
    ----------
    X_train_encoded : unused here; kept for the shared call signature
    X_test_encoded : padded integer sequences of the test fold
    Y_train : training labels, forwarded to ``get_data``
    Y_test : test labels; must expose ``tolist()`` (e.g. a pandas Series)
    iteration : int
        Fold counter, forwarded unchanged.
    start_time : float
        Start timestamp of the fold, forwarded unchanged.
    """
    _, acc = model.evaluate(X_test_encoded, Y_test, verbose=0)
    print("Test accuracy:{:.2f}".format(acc*100))

    prediction = model.predict(X_test_encoded)

    # Threshold every probability in one step so that exactly one class is
    # produced per test sample. The earlier per-item ``<`` / ``>`` checks
    # appended nothing when a probability was exactly 0.5 (or NaN), leaving
    # the prediction list shorter than ``Y_test`` and breaking the metric
    # calls below. A value of exactly 0.5 is now assigned to class 0.
    probabilities = np.asarray(prediction).reshape(-1)
    predicted_classes = (probabilities > 0.5).astype(int).tolist()

    print(predicted_classes.count(0))
    print('------------------')
    print(predicted_classes.count(1))

    accuracy = accuracy_score(Y_test, predicted_classes)
    print(accuracy)

    Y_test = Y_test.tolist()
    get_data(predicted_classes, Y_train, Y_test, iteration, start_time)

## Get data

In [8]:
def get_data(predicted_classes, Y_train, Y_test_reshape, iteration, start_time):
    """Compute the evaluation metrics for one fold and forward them to ``saveData``.

    Reads the globals ``file`` (path of the corpus being processed) and
    ``data`` (the loaded corpus frame).

    Parameters
    ----------
    predicted_classes : list of int
        Predicted class (0 or 1) per test sample.
    Y_train : sized collection
        Training labels; only its length is used.
    Y_test_reshape : list
        True labels of the test samples.
    iteration : int
        Fold counter.
    start_time : float
        ``time.time()`` value taken at the start of the fold.

    Notes
    -----
    * The binary scores are computed with ``pos_label=0``, i.e. class 0 is the
      class being scored.
    * The confusion matrix is laid out with label order ``[1, 0]``.
    """
    # Derive the corpus metadata from the path relative to its end (parent
    # directory and file name) rather than from fixed positions in the split
    # path, so it no longer depends on how many leading path segments exist.
    # For ".../Binary/LT01_gnd_Preprocessed_binary.tsv" this yields
    # type "Binary", shortcut "LT01" and name "gnd".
    path_parts = re.split(r"[/\\]", file)
    file_parts = path_parts[-1].split("_")
    corporaType = path_parts[-2] if len(path_parts) > 1 else ""
    shortcut = file_parts[0]
    name = file_parts[1] if len(file_parts) > 1 else ""

    totalTime = time.time() - start_time

    totalTextUnits = len(data.index)
    totalTestUnits = len(predicted_classes)
    totalTrainUnits = len(Y_train)

    accuracy = accuracy_score(Y_test_reshape, predicted_classes)
    f1_binary = f1_score(Y_test_reshape, predicted_classes, average="binary", pos_label=0)
    precision_binary = precision_score(Y_test_reshape, predicted_classes, average="binary", pos_label=0)
    recall_binary = recall_score(Y_test_reshape, predicted_classes, average="binary", pos_label=0)

    # The macro/micro scores were previously left at their initial value 0
    # and stored as such. ``zero_division=0`` yields the same score as the
    # library default for undefined cases, without emitting a warning.
    f1_macro = f1_score(Y_test_reshape, predicted_classes, average="macro", zero_division=0)
    precision_macro = precision_score(Y_test_reshape, predicted_classes, average="macro", zero_division=0)
    recall_macro = recall_score(Y_test_reshape, predicted_classes, average="macro", zero_division=0)
    f1_micro = f1_score(Y_test_reshape, predicted_classes, average="micro", zero_division=0)
    precision_micro = precision_score(Y_test_reshape, predicted_classes, average="micro", zero_division=0)
    recall_micro = recall_score(Y_test_reshape, predicted_classes, average="micro", zero_division=0)

    matrix = confusion_matrix(Y_test_reshape, predicted_classes, labels=[1,0])
    matrixFlat = convertMatrix(matrix)

    target_names = [0,1]
    classificationReport = classification_report(Y_test_reshape, predicted_classes, target_names=target_names, output_dict=True)

    saveData(corporaType,shortcut,name,totalTime,totalTextUnits,totalTestUnits,totalTrainUnits,accuracy,f1_macro,precision_macro,recall_macro,
             f1_micro,precision_micro,recall_micro,matrix,f1_binary,precision_binary,recall_binary,predicted_classes, Y_test_reshape,start_time,iteration,matrixFlat,
             classificationReport)

## Transform confusion matrix

In [9]:
def convertMatrix(matrix):
    """Flatten a two-level matrix row by row into a 1-D NumPy array.

    The result is returned and also stored in the global ``flatMatrix``.

    Parameters
    ----------
    matrix : sequence of sequences (e.g. a 2-D array or list of lists)

    Returns
    -------
    numpy.ndarray
        All cells of ``matrix`` in row-major order.
    """
    global flatMatrix
    cells = [cell for row in matrix for cell in row]
    flatMatrix = np.array(cells)
    return flatMatrix

## Save data

In [10]:
def _append_result_row(path, row, columns):
    """Append ``row`` to the tab-separated table at ``path`` and write it back."""
    new_row = pd.DataFrame([row])
    try:
        existing = pd.read_csv(path, sep="\t")
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # First run: there is no table yet, so start one from this row.
        combined = new_row
    else:
        combined = pd.concat([existing, new_row], ignore_index=True)

    # Keep a fixed column order and drop anything not listed.
    combined = combined[columns]
    combined.to_csv(path, sep="\t", index=False)
    return combined


def saveData(corporaType,shortcut,name,totalTime,totalTextUnits,totalTestUnits,totalTrainUnits,accuracy,f1_macro,precision_macro,recall_macro,f1_micro,precision_micro,recall_micro,matrix,f1_binary,precision_binary,recall_binary,y_test,test_set,startTime,iteration, matrixFlat,classificationReport):
    """Append the results of one fold to the two result tables on disk.

    ``CNNDataKFold.tsv`` receives the summary metrics and the flattened
    classification report; ``CNNDataKFoldFull.tsv`` additionally receives the
    two label lists. Earlier versions built the combined tables but never
    wrote them back, so no result was persisted; both files are now written
    (and created on first use).

    Parameters
    ----------
    corporaType, shortcut, name : str
        Corpus metadata stored in the "Type", "Shortcut" and "Name" columns.
    totalTime : float
        Stored in the "Time" column.
    totalTextUnits, totalTestUnits, totalTrainUnits : int
        Stored as "Total Length", "Test Set" and "Training Set".
    accuracy, f1_*, precision_*, recall_* : float
        Metric values stored in the correspondingly named columns.
    matrix : unused; the flattened form ``matrixFlat`` is stored instead.
    y_test : list
        Stored in the "Train Set Full" column. NOTE(review): the caller in
        this notebook passes the *predicted* classes here -- the column name
        is kept for compatibility with existing result files.
    test_set : list
        Stored in the "Test Set Full" column (the caller passes the true
        test labels).
    startTime : unused.
    iteration : int
        Fold counter, stored in the "Iteration" column.
    matrixFlat : array-like
        Flattened confusion matrix, stored in the "Matrix" column.
    classificationReport : dict
        Output of ``classification_report(..., output_dict=True)``; flattened
        with ``transformReport`` into one column per entry.
    """
    summary_columns = ["Iteration","Shortcut","Name","Type","Time","Total Length","Training Set","Test Set","Accuracy","Precision Macro",
              "Precision Micro","Precision Binary","Recall Macro","Recall Micro","Recall Binary","F1 Macro",
            "F1 Micro","F1 Binary","Matrix"]
    label_columns = ["Train Set Full","Test Set Full"]
    report_columns = ["0 precision","0 recall","0 f1-score",
            "0 support","1 precision","1 recall","1 f1-score","1 support","accuracy accuracy",
            "macro avg precision","macro avg recall","macro avg f1-score","macro avg support","weighted avg precision","weighted avg recall",
            "weighted avg f1-score","weighted avg support"]

    allData={"Iteration":iteration,"Shortcut":shortcut,"Name":name,"Type":corporaType,"Time":totalTime,"Total Length":totalTextUnits,"Training Set":totalTrainUnits,"Test Set":totalTestUnits,"Accuracy":accuracy,"Precision Macro":precision_macro,
             "Precision Micro":precision_micro,"Precision Binary":precision_binary,"Recall Macro":recall_macro,"Recall Micro":recall_micro,"Recall Binary":recall_binary,"F1 Macro":f1_macro,
            "F1 Micro":f1_micro,"F1 Binary":f1_binary,"Matrix":matrixFlat}

    # The "full" row is the summary row plus the two label lists.
    allDataFull = dict(allData)
    allDataFull["Train Set Full"] = y_test
    allDataFull["Test Set Full"] = test_set

    reportDict=transformReport(classificationReport)
    allData.update(reportDict)
    allDataFull.update(reportDict)

    _append_result_row("CNNDataKFold.tsv", allData, summary_columns + report_columns)
    _append_result_row("CNNDataKFoldFull.tsv", allDataFull, summary_columns + label_columns + report_columns)

## Transform classification report

In [11]:
def transformReport(classificationReport):
    """Flatten a (possibly nested) classification report into a single dict.

    For an entry whose value is a ``dict``, every inner item is stored under
    ``"<outer key> <inner key>"``. For any other entry the value is stored
    under the outer key repeated twice (e.g. ``"accuracy accuracy"``).

    Parameters
    ----------
    classificationReport : dict
        Mapping as produced by ``classification_report(..., output_dict=True)``.

    Returns
    -------
    dict
        Flat mapping from the combined names to the report values.
    """
    flattened = {}
    for section, content in classificationReport.items():
        prefix = str(section)
        if type(content) is dict:
            for metric, value in content.items():
                flattened[prefix + " " + metric] = value
        else:
            # Scalar entry: the column name repeats the section name.
            flattened[prefix + " " + section] = content
    return flattened

## Main function

In [12]:
# Preprocessed binary-sentiment corpora, processed in this order.
files_binary = [
    "../../..//Corpora/Preprocessed/Binary/LT01_gnd_Preprocessed_binary.tsv",
    "../../..//Corpora/Preprocessed/Binary/LT02_speechLessing_Preprocessed_binary.tsv",
    "../../..//Corpora/Preprocessed/Binary/LT03_historicplays_Preprocessed_binary.tsv",
    "../../..//Corpora/Preprocessed/Binary/MI01_mlsa_Preprocessed_binary.tsv",
    "../../..//Corpora/Preprocessed/Binary/MI02_germeval_Preprocessed_binary.tsv",
    "../../..//Corpora/Preprocessed/Binary/MI03_corpusRauh_Preprocessed_binary.tsv",
    "../../..//Corpora/Preprocessed/Binary/NA01_gersen_Preprocessed_binary.tsv",
    "../../..//Corpora/Preprocessed/Binary/NA02_gerom_Preprocessed_binary.tsv",
    "../../..//Corpora/Preprocessed/Binary/NA03_ompc_Preprocessed_binary.tsv",
    "../../..//Corpora/Preprocessed/Binary/RE01_usage_Preprocessed_binary.tsv",
    "../../..//Corpora/Preprocessed/Binary/RE03_critics_Preprocessed_binary.tsv",
    "../../..//Corpora/Preprocessed/Binary/SM01_sb10k_Preprocessed_binary.tsv",
    "../../..//Corpora/Preprocessed/Binary/SM02_potts_Preprocessed_binary.tsv",
    "../../..//Corpora/Preprocessed/Binary/SM03_multiSe_Preprocessed_binary.tsv",
    "../../..//Corpora/Preprocessed/Binary/SM04_gertwittersent_Preprocessed_binary.tsv",
    "../../..//Corpora/Preprocessed/Binary/SM05_ironycorpus_Preprocessed_binary.tsv",
    "../../..//Corpora/Preprocessed/Binary/SM06_celeb_Preprocessed_binary.tsv",
    "../../..//Corpora/Preprocessed/Binary/RE02_scare_Preprocessed_binary_balanced.tsv",
    "../../..//Corpora/Preprocessed/Binary/RE04_filmstarts_Preprocessed_binary.tsv",
    "../../..//Corpora/Preprocessed/Binary/RE05_amazonreviews_Preprocessed_binary_balanced.tsv",
]


def main():
    """Load every corpus in ``files_binary`` and run the cross-validation on it.

    The current path and the loaded frame are published as the globals
    ``file`` and ``data``, which the downstream functions read.
    """
    global data, file
    for file in files_binary:
        print("Start for corpora: ", file)
        data = pd.read_csv(file, sep="\t")
        split_data()
        print("Finish")


main()


Start for corpora:  ../../..//Corpora/Preprocessed/Binary/LT01_gnd_Preprocessed_binary.tsv
Epoch 1/4


2022-11-20 11:25:42.703258: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-20 11:25:43.143395: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22349 MB memory:  -> device: 0, name: Quadro RTX 6000, pci bus id: 0000:65:00.0, compute capability: 7.5
2022-11-20 11:25:44.506038: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8400




2022-11-20 11:25:44.942239: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-11-20 11:25:44.943060: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-11-20 11:25:44.943076: W tensorflow/stream_executor/gpu/asm_compiler.cc:80] Couldn't get ptxas version string: INTERNAL: Couldn't invoke ptxas --version
2022-11-20 11:25:44.943890: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-11-20 11:25:44.943935: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] INTERNAL: Failed to launch ptxas
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.


Epoch 2/4
Epoch 3/4
Epoch 4/4
Test accuracy:67.57
26
------------------
11
0.6756756756756757
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Test accuracy:62.16
37
------------------
0
0.6216216216216216
Epoch 1/4


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/4
Epoch 3/4
Epoch 4/4
Test accuracy:61.11
36
------------------
0
0.6111111111111112
Epoch 1/4


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/4
Epoch 3/4
Epoch 4/4
Test accuracy:61.11
36
------------------
0
0.6111111111111112
Finish
Start for corpora:  ../../..//Corpora/Preprocessed/Binary/LT02_speechLessing_Preprocessed_binary.tsv
Epoch 1/4


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/4
Epoch 3/4
Epoch 4/4
Test accuracy:65.03
143
------------------
0
0.6503496503496503
Epoch 1/4


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/4
Epoch 3/4
Epoch 4/4
Test accuracy:65.03
143
------------------
0
0.6503496503496503
Epoch 1/4


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/4
Epoch 3/4
Epoch 4/4
Test accuracy:64.34
143
------------------
0
0.6433566433566433
Epoch 1/4


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: 