# Convolutional Neural Network / Ternary
Name: Niklas Donhauser

**Sources**

[1] sklearn https://scikit-learn.org/stable/ <br>
[2] re https://docs.python.org/3/library/re.html <br>
[3] pandas https://pandas.pydata.org/ <br>
[4] time https://docs.python.org/3/library/time.html <br>
[5] numpy https://numpy.org/ <br>
[6] keras https://keras.io/ <br>

**Useful links:**
- https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html <br>
- https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html <br>
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html <br>

## Import libraries

In [1]:
import numpy as np 
import pandas as pd 
import re
import time


from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, SpatialDropout1D, Conv1D, MaxPooling1D, Flatten,GlobalMaxPool1D
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold,train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

## Tokenize

In [7]:
def tokenize_data(X_train, X_test):
    """Fit a tokenizer on the training texts and encode both splits.

    Nothing is returned; the results are published through the module-level
    globals ``vocab_size``, ``max_length``, ``X_train_encoded`` and
    ``X_test_encoded``.

    Args:
        X_train: iterable of training texts (each item must support ``.split()``).
        X_test: iterable of test texts.
    """
    global vocab_size, max_length, X_train_encoded, X_test_encoded

    # The vocabulary is learned from the training texts only.
    text_tokenizer = Tokenizer()
    text_tokenizer.fit_on_texts(X_train)

    # NOTE(review): the +1 presumably reserves index 0 for padding - confirm.
    vocab_size = 1 + len(text_tokenizer.word_index)
    # Longest training text, measured in whitespace-separated tokens.
    max_length = max(len(text.split()) for text in X_train)

    X_train_encoded = encode_reviews(text_tokenizer, max_length, X_train)
    X_test_encoded = encode_reviews(text_tokenizer, max_length, X_test)
    

## Encode

In [3]:
def encode_reviews(tokenizer, max_length, X_train_processed):
    """Convert texts to integer sequences padded to a common length.

    Args:
        tokenizer: fitted tokenizer exposing ``texts_to_sequences``.
        max_length: target length passed to ``pad_sequences`` as ``maxlen``.
        X_train_processed: iterable of texts to encode (train or test split).

    Returns:
        The result of ``pad_sequences`` with padding appended after each
        sequence (``padding="post"``).
    """
    sequences = tokenizer.texts_to_sequences(X_train_processed)
    return pad_sequences(sequences, maxlen=max_length, padding="post")

## Setup model

In [4]:
def setup_model():
    """Build and compile a fresh CNN classifier into the global ``model``.

    Reads the globals ``vocab_size`` and ``max_length`` (set by
    ``tokenize_data``) to size the embedding layer. The network ends in a
    3-unit softmax and is compiled with sparse categorical cross-entropy, so
    it expects integer class labels.
    """
    global model

    # Layers are created in the same order in which they are stacked.
    layer_stack = [
        Embedding(vocab_size, 100, input_length=max_length),
        Conv1D(filters=128, kernel_size=5, activation="relu"),
        GlobalMaxPool1D(),
        Dense(64, activation="relu"),
        Dense(32, activation="relu"),
        Dense(3, activation="softmax"),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )

## Split data

In [5]:
def split_data():
    """Run 4-fold stratified cross-validation over the global ``data`` frame.

    The ``sentiment`` column of ``data`` is converted to integer codes
    (positive -> 1, negative -> 0, neutral -> 2) and written back to ``data``.
    For every fold the texts are tokenized, a fresh model is built, and
    ``train_model`` is called with the encoded splits.

    Side effects:
        Rebinds the globals ``train_index`` and ``test_index`` on every fold
        and, through the helpers it calls, the tokenization and model globals.
    """
    global train_index, test_index

    # Assign the result back instead of calling
    # data["sentiment"].replace(..., inplace=True): an in-place method on a
    # column selection is a chained assignment, which newer pandas versions
    # warn about and which no longer updates `data` under Copy-on-Write.
    sentiment = data["sentiment"]
    for label, code in (("positive", 1), ("negative", 0), ("neutral", 2)):
        sentiment = sentiment.replace(to_replace=label, value=code, regex=True)
    data["sentiment"] = sentiment

    texts = data["preprocessedData"]
    labels = data["sentiment"]

    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=2)
    for iteration, (train_index, test_index) in enumerate(skf.split(texts, labels), start=1):
        # The timer covers tokenization, model setup, training and prediction.
        start_time = time.time()

        # skf.split yields positions, so select by position (.iloc); plain
        # [] indexing is label-based and only agrees for a default RangeIndex.
        X_train, X_test = texts.iloc[train_index], texts.iloc[test_index]
        Y_train, Y_test = labels.iloc[train_index], labels.iloc[test_index]

        tokenize_data(X_train, X_test)
        setup_model()

        # Fit the encoder on the training labels only, then apply it to test.
        le = LabelEncoder()
        Y_train = le.fit_transform(Y_train)
        Y_test = le.transform(Y_test)

        train_model(X_train_encoded, X_test_encoded, Y_train, Y_test, iteration, start_time)

        

## Train model

In [6]:
def train_model(X_train_encoded, X_test_encoded, Y_train, Y_test, iteration, start_time):
    """Fit the global ``model`` on one fold, then hand over to ``predict_model``.

    Also sets the globals ``batch_size`` (32) and ``verbose`` (1); training
    runs for 4 epochs.

    Args:
        X_train_encoded: encoded training inputs passed to ``model.fit``.
        X_test_encoded: encoded test inputs, forwarded to ``predict_model``.
        Y_train: training labels passed to ``model.fit``.
        Y_test: test labels, forwarded to ``predict_model``.
        iteration: fold number, forwarded unchanged.
        start_time: timestamp taken at fold start, forwarded unchanged.
    """
    global batch_size, verbose
    batch_size, verbose = 32, 1
    epochs = 4

    fit_options = {"epochs": epochs, "batch_size": batch_size, "verbose": verbose}
    model.fit(X_train_encoded, Y_train, **fit_options)

    predict_model(X_train_encoded, X_test_encoded, Y_train, Y_test, iteration, start_time)

## Predict model

In [7]:
def predict_model(X_train_encoded, X_test_encoded, Y_train, Y_test, iteration, start_time):
    """Predict the test fold with the global ``model`` and forward the results.

    Each prediction row is reduced to the index of its largest score, and the
    resulting class list is passed to ``get_data`` together with the true
    labels.

    The earlier ``model.evaluate`` call and the local accuracy computation
    were removed: their results were never used, and the evaluation cost a
    second full pass over the test set that was included in the fold timing.

    Args:
        X_train_encoded: unused here; kept for interface compatibility.
        X_test_encoded: encoded test inputs passed to ``model.predict``.
        Y_train: training labels, forwarded to ``get_data``.
        Y_test: test labels; must provide ``.tolist()``.
        iteration: fold number, forwarded unchanged.
        start_time: timestamp taken at fold start, forwarded unchanged.
    """
    class_scores = model.predict(X_test_encoded)

    # One argmax over the class axis replaces the per-row Python loop;
    # .tolist() yields plain Python ints, matching Y_test.tolist() below.
    predicted_classes = np.argmax(class_scores, axis=1).tolist()

    get_data(predicted_classes, Y_train, Y_test.tolist(), iteration, start_time)

## Get Data

In [8]:
def get_data(predicted_classes, Y_train, Y_test_reshape, iteration, start_time):
    """Compute the evaluation metrics of one fold and pass them to ``saveData``.

    Reads the globals ``file`` (path of the current corpus) and ``data``
    (the loaded corpus frame).

    Args:
        predicted_classes: predicted class ids for the test fold.
        Y_train: training labels; only their count is recorded.
        Y_test_reshape: true class ids for the test fold.
        iteration: fold number, forwarded to ``saveData``.
        start_time: timestamp taken at fold start; used for the elapsed time.
    """
    # The binary-average metrics are not computed here; they are forwarded
    # as 0 so the saved table keeps its "... Binary" columns.
    f1_binary = 0
    precision_binary = 0
    recall_binary = 0

    # Corpus metadata is taken from fixed positions of the path split on
    # "_" and "/".
    # NOTE(review): this relies on the exact directory depth and on no extra
    # underscores earlier in the path - confirm for every entry in the file list.
    splitString = re.split("_|/", file)
    corporaType = splitString[5]
    shortcut = splitString[6]
    name = splitString[7]

    totalTime = time.time() - start_time

    totalTextUnits = len(data.index)
    totalTestUnits = len(predicted_classes)
    totalTrainUnits = len(Y_train)

    # Label order used for the averaged metrics and the confusion-matrix axes.
    metric_labels = [1, 0, 2]

    accuracy = accuracy_score(Y_test_reshape, predicted_classes)

    f1_macro = f1_score(Y_test_reshape, predicted_classes, average="macro", labels=metric_labels)
    precision_macro = precision_score(Y_test_reshape, predicted_classes, average="macro", labels=metric_labels)
    recall_macro = recall_score(Y_test_reshape, predicted_classes, average="macro", labels=metric_labels)

    # f1_micro now receives the same labels as its precision/recall siblings.
    f1_micro = f1_score(Y_test_reshape, predicted_classes, average="micro", labels=metric_labels)
    precision_micro = precision_score(Y_test_reshape, predicted_classes, average="micro", labels=metric_labels)
    recall_micro = recall_score(Y_test_reshape, predicted_classes, average="micro", labels=metric_labels)

    matrix = confusion_matrix(Y_test_reshape, predicted_classes, labels=metric_labels)
    matrixFlat = convertMatrix(matrix)

    # Passing `labels` explicitly keeps all three per-class sections in the
    # report even when a class is absent from a fold; with only
    # `target_names`, a missing class makes the name count mismatch the
    # observed labels and classification_report raises.
    report_labels = [0, 1, 2]
    target_names = [str(label) for label in report_labels]
    classificationReport = classification_report(
        Y_test_reshape,
        predicted_classes,
        labels=report_labels,
        target_names=target_names,
        output_dict=True,
    )

    saveData(corporaType, shortcut, name, totalTime, totalTextUnits, totalTestUnits, totalTrainUnits,
             accuracy, f1_macro, precision_macro, recall_macro,
             f1_micro, precision_micro, recall_micro, matrix,
             f1_binary, precision_binary, recall_binary,
             predicted_classes, Y_test_reshape, start_time, iteration, matrixFlat,
             classificationReport)

## Transform confusion matrix

In [9]:
def convertMatrix(matrix):
    """Flatten a two-level matrix row by row into a 1-D numpy array.

    The result is also stored in the global ``flatMatrix``.

    Args:
        matrix: sequence of rows, each row a sequence of cells.

    Returns:
        ``np.ndarray`` holding all cells in row-major order.
    """
    global flatMatrix
    cells = [cell for row in matrix for cell in row]
    flatMatrix = np.array(cells)
    return flatMatrix

## Save data

In [10]:
def saveData(corporaType,shortcut,name,totalTime,totalTextUnits,totalTestUnits,totalTrainUnits,accuracy,f1_macro,precision_macro,recall_macro,f1_micro,precision_micro,recall_micro,matrix,f1_binary,precision_binary,recall_binary,y_test,test_set,startTime,iteration, matrixFlat,classificationReport):
    """Append the metrics of one fold to the two result tables on disk.

    ``CNNDataKFold.tsv`` receives the metrics and the flattened
    classification report; ``CNNDataKFoldFull.tsv`` additionally receives the
    two label lists. Both files must already exist (``pd.read_csv`` raises
    ``FileNotFoundError`` otherwise).

    Previously the combined tables were built but never written back, so
    nothing was saved; both tables are now written to their files.

    Args:
        y_test: stored under the column "Train Set Full". The visible caller
            passes the predicted classes here.
        test_set: stored under the column "Test Set Full". The visible caller
            passes the true test labels here.
        matrix, startTime: accepted for interface compatibility; not used.
        classificationReport: dict as returned by ``classification_report``
            with ``output_dict=True``; flattened via ``transformReport``.
        (remaining arguments are stored one-to-one under their column names.)
    """
    summary_path = "CNNDataKFold.tsv"
    full_path = "CNNDataKFoldFull.tsv"

    # Output column order, shared by both tables.
    metric_columns = ["Iteration","Shortcut","Name","Type","Time","Total Length","Training Set","Test Set","Accuracy","Precision Macro",
                      "Precision Micro","Precision Binary","Recall Macro","Recall Micro","Recall Binary","F1 Macro",
                      "F1 Micro","F1 Binary","Matrix"]
    label_columns = ["Train Set Full","Test Set Full"]
    report_columns = ["0 precision","0 recall","0 f1-score",
                      "0 support","1 precision","1 recall","1 f1-score","1 support","2 f1-score","2 support","2 precision","2 recall","accuracy accuracy",
                      "macro avg precision","macro avg recall","macro avg f1-score","macro avg support","weighted avg precision","weighted avg recall",
                      "weighted avg f1-score","weighted avg support"]

    # Load the existing tables first so a missing file fails before any work.
    df_svm_data = pd.read_csv(summary_path, sep="\t")
    df_svm_data_full = pd.read_csv(full_path, sep="\t")

    allData = {"Iteration":iteration,"Shortcut":shortcut,"Name":name,"Type":corporaType,"Time":totalTime,"Total Length":totalTextUnits,"Training Set":totalTrainUnits,"Test Set":totalTestUnits,"Accuracy":accuracy,"Precision Macro":precision_macro,
               "Precision Micro":precision_micro,"Precision Binary":precision_binary,"Recall Macro":recall_macro,"Recall Micro":recall_micro,"Recall Binary":recall_binary,"F1 Macro":f1_macro,
               "F1 Micro":f1_micro,"F1 Binary":f1_binary,"Matrix":matrixFlat}

    # The full record is the summary record plus the two label lists.
    allDataFull = dict(allData)
    allDataFull["Train Set Full"] = y_test
    allDataFull["Test Set Full"] = test_set

    reportDict = transformReport(classificationReport)
    allData.update(reportDict)
    allDataFull.update(reportDict)

    finalData_svm = pd.concat([df_svm_data, pd.DataFrame([allData])])
    finalData_svm_full = pd.concat([df_svm_data_full, pd.DataFrame([allDataFull])])

    # Selecting by list fixes the column order and drops any other column.
    finalData_svm = finalData_svm[metric_columns + report_columns]
    finalData_svm_full = finalData_svm_full[metric_columns + label_columns + report_columns]

    # Persist the extended tables; index=False avoids accumulating an
    # unnamed index column on every read/append/write cycle.
    finalData_svm.to_csv(summary_path, sep="\t", index=False)
    finalData_svm_full.to_csv(full_path, sep="\t", index=False)


## Transform classification report

In [11]:
def transformReport(classificationReport):
    """Flatten a nested classification-report dict into a single-level dict.

    Nested entries become ``"<section> <metric>"`` keys (for example
    ``"0 precision"``). Scalar entries are stored under the section name
    repeated twice (for example ``"accuracy accuracy"``).

    Keys are converted with ``str`` before joining, so non-string keys such
    as integer class ids work for nested and scalar entries alike.

    Args:
        classificationReport: mapping of section name to either a dict of
            metric values or a single scalar value.

    Returns:
        dict mapping the flattened key names to the metric values.
    """
    flat_report = {}
    for section, content in classificationReport.items():
        section_name = str(section)
        if isinstance(content, dict):
            for metric, value in content.items():
                flat_report[section_name + " " + str(metric)] = value
        else:
            # A scalar entry has no metric name; the section name is used twice.
            flat_report[section_name + " " + section_name] = content
    return flat_report

## Main function

In [12]:

def main():
    """Run the cross-validation pipeline once for every ternary corpus file.

    Each path in ``files_ternary`` is loaded as a tab-separated table into the
    global ``data``; the path itself is kept in the global ``file`` so that
    later steps can read it. ``split_data`` then processes the loaded corpus.
    """
    global data, file
    for file in files_ternary:
        print("Start for corpora: ", file)
        data = pd.read_csv(file, sep="\t")
        split_data()
        print("Finish")


main()


Start for corpora:  ../../../Corpora/Preprocessed/Ternary/RE04_filmstarts_Preprocessed_ternary_all.tsv
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3069, 100)         17811800  
                                                                 
 conv1d (Conv1D)             (None, 3065, 128)         64128     
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)   

2022-11-20 14:27:15.543627: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-20 14:27:15.960093: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22844 MB memory:  -> device: 0, name: Quadro RTX 6000, pci bus id: 0000:65:00.0, compute capability: 7.5


None
Epoch 1/4


2022-11-20 14:27:17.602580: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8400


   1/1651 [..............................] - ETA: 46:37 - loss: 1.0925 - accuracy: 0.2812

2022-11-20 14:27:18.088952: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-11-20 14:27:18.089675: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-11-20 14:27:18.089692: W tensorflow/stream_executor/gpu/asm_compiler.cc:80] Couldn't get ptxas version string: INTERNAL: Couldn't invoke ptxas --version
2022-11-20 14:27:18.090523: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-11-20 14:27:18.090572: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] INTERNAL: Failed to launch ptxas
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.


Epoch 2/4
Epoch 3/4
Epoch 4/4
Test accuracy:72.13
[1, 1, 1, 1, 2, 1, 1, 1, 1, 0]
0.7212630622444344
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 3069, 100)         17903600  
                                                                 
 conv1d_1 (Conv1D)           (None, 3065, 128)         64128     
                                                                 
 global_max_pooling1d_1 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_3 (Dense)             (None, 64)                8256      
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dense_5 (Dense)    