# Import Data

In [None]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

from transformers import BertTokenizer, BertModel
import torch

from keras.models import Sequential
from sklearn.metrics import f1_score, matthews_corrcoef, accuracy_score
from sklearn.model_selection import KFold

import tensorflow as tf
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Input
from keras.layers import Flatten
from keras.layers import LSTM, Embedding, Dropout, SimpleRNN, Bidirectional
from keras import backend as K

In [None]:
# Read the labeled corpus, then pull out one sub-frame per product category.
df = pd.read_csv("../data/labeled/combined.csv")
electronics, pet, baby, sports = (
    df.groupby(df.category).get_group(label)
    for label in ("Electronics", "Pet supplies", "Baby", "Sport outdoors")
)

## Create a balanced Dataset

### Truncate Sentences to 50 words

In [None]:
def truncate_sentence(sentence, max_words=50):
    """Return `sentence` reduced to its first `max_words` whitespace-separated words.

    The words are re-joined with single spaces, so leading, trailing and
    repeated whitespace in the input is not preserved.
    """
    words = sentence.split()
    # Drop everything from position `max_words` on; the kept words stay in order.
    del words[max_words:]
    return ' '.join(words)

### Split Dataframe to test & training Set

## Initialize BERT Model

In [None]:
# Pretrained BERT-large (uncased) tokenizer, used by the BERT helper functions below.
bert_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

# BERT-large encoder loaded from the same checkpoint name as the tokenizer.
bert_model = BertModel.from_pretrained('bert-large-uncased')
    
def get_BERT_subwords(text):
    """Split `text` into BERT word pieces.

    Returns the list of token strings produced by `bert_tokenizer.tokenize`.
    The list is neither padded nor truncated.
    """
    # Only the token strings are needed; a full encode of the text (ids,
    # masks, tensors) would be computed and thrown away.
    return bert_tokenizer.tokenize(text)
    
def create_BERT_vectors(text, max_length=100):
    """Encode `text` with BERT and return the encoder's final hidden layer.

    The tokenized input is padded or truncated to exactly `max_length`
    positions, so the sequence axis of the result has a fixed size.

    Args:
        text: the sentence to encode.
        max_length: number of token positions fed to the model (default 100).

    Returns:
        `last_hidden_state` of `bert_model` for the single encoded text.
    """
    encoded = bert_tokenizer(text, max_length=max_length, padding='max_length',
                             truncation=True, return_tensors="pt")
    # Feature extraction only: no gradients are needed, and tracking them
    # keeps every intermediate activation of the encoder alive.
    with torch.no_grad():
        output = bert_model(**encoded)
    return output.last_hidden_state


### Check max length of tokens in dataset

In [None]:
def check_max_length():
    """Print the largest BERT sub-word count in `df["sentence"]`, then the sentence that has it.

    On ties the first sentence reaching the maximum is reported. If no
    sentence produces any token, `0` and an empty line are printed.
    """
    record_len, record_sentence = 0, ""
    for candidate in df["sentence"].values:
        n_tokens = len(get_BERT_subwords(candidate))
        if n_tokens <= record_len:
            continue
        record_len, record_sentence = n_tokens, candidate
    print(record_len)
    print(record_sentence)

# Vectorize Data

## Initialize XLNet Model

In [None]:
from transformers import XLNetTokenizer, XLNetModel

# Pretrained XLNet-large (cased) tokenizer and encoder, loaded from the same checkpoint name.
# NOTE(review): XLNet tokenizers are documented to pad on the LEFT by default — confirm
# `xlnet_tokenizer.padding_side` before relying on token positions in padded outputs.
xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
xlnet_model = XLNetModel.from_pretrained('xlnet-large-cased')

def create_XLNet_vectors(text, max_length=100):
    """Encode `text` with XLNet and return the encoder's final hidden layer.

    The tokenized input is padded or truncated to exactly `max_length`
    positions, so the sequence axis of the result has a fixed size.

    Args:
        text: the sentence to encode.
        max_length: number of token positions fed to the model (default 100).

    Returns:
        `last_hidden_state` of `xlnet_model` for the single encoded text.
    """
    # `truncation=True` is required for a fixed length: `max_length` together
    # with padding alone only pads short inputs and lets long ones through.
    inputs = xlnet_tokenizer(text, max_length=max_length, padding='max_length',
                             truncation=True, return_tensors="pt")
    # Feature extraction only: no gradients are needed, and tracking them
    # keeps every intermediate activation of the encoder alive.
    with torch.no_grad():
        outputs = xlnet_model(**inputs)
    return outputs.last_hidden_state

def get_XLNet_subwords(text):
    """Split `text` into XLNet sub-word pieces.

    Returns the list of token strings produced by `xlnet_tokenizer.tokenize`.
    The list is neither padded nor truncated.
    """
    # Only the token strings are needed; a full encode of the text (ids,
    # masks, tensors) would be computed and thrown away.
    return xlnet_tokenizer.tokenize(text)

## Initialize Glove Model

In [None]:
import torch
import torchtext
import tensorflow as tf
import numpy as np

# Pretrained GloVe word vectors, looked up word by word in create_GLOVE_vector.
glove = torchtext.vocab.GloVe(name="6B", # trained on Wikipedia 2014 corpus
                              dim=300)   # embedding size = 300

def create_GLOVE_vector(text, max_words=50):
    """Look up a GloVe vector for every whitespace-separated word of `text`.

    Args:
        text: the sentence to embed.
        max_words: number of word positions in the result (default 50).
            Longer texts are cut after `max_words` words, shorter ones are
            filled up with all-zero rows.

    Returns:
        A float32 NumPy array of shape (1, max_words, glove.dim). Row `k`
        holds whatever `glove[word]` returns for the k-th word. An empty
        text yields an all-zero array.
    """
    # NOTE(review): lookups use the words exactly as written; if the loaded
    # vocabulary is lowercase-only, capitalised words will miss — confirm
    # whether the text should be lowercased first.
    words = text.split()[:max_words]

    # Pre-allocating the padded result handles empty, short and full-length
    # inputs uniformly and always returns the same array type.
    padded = np.zeros((1, max_words, glove.dim), dtype=np.float32)
    for position, word in enumerate(words):
        padded[0, position] = glove[word].numpy()
    return padded


## Concatenate Functions

### Concatenate BERT Subwords

In [None]:
# Module-level scratch values. conc_BERT_subwords below never reads them.
i = 0
word_count = 1
next_vector = False

def conc_BERT_subwords(text):
    """Collapse BERT word-piece vectors into one vector per word.

    Every run of word pieces that forms one word (a head piece followed by
    `##` continuation pieces) is replaced by the mean of its hidden states,
    so the sequence axis advances word by word instead of piece by piece.

    Args:
        text: the sentence to embed.

    Returns:
        A tensor of shape (1, 50, hidden_size). It is laid out as the leading
        special-token vector, one vector per word, then the remaining
        special-token and padding vectors, cut to the first 50 positions
        (zero-filled in the unlikely case that fewer remain).
    """
    max_length = 50  # positions kept; has to match the other embeddings that get concatenated
    hidden = create_BERT_vectors(text)[0]
    subwords = get_BERT_subwords(text)

    # Position 0 of the encoder output belongs to [CLS], so word piece k
    # lives at position k + 1.
    first = 1
    # The encoder input is truncated but the plain token list is not: ignore
    # pieces that have no hidden state (the last real position is [SEP]).
    n_pieces = max(min(len(subwords), hidden.shape[0] - first - 1), 0)

    # Group piece positions into words as half-open [start, end) spans.
    # Only the word-piece marker '##' denotes a continuation; a literal '#'
    # in the text is a token of its own and must not be merged.
    spans = []
    for k in range(n_pieces):
        position = first + k
        if spans and subwords[k].startswith('##'):
            spans[-1][1] = position + 1
        else:
            spans.append([position, position + 1])

    parts = [hidden[:first]]
    parts.extend(hidden[start:end].mean(dim=0, keepdim=True) for start, end in spans)
    parts.append(hidden[first + n_pieces:])
    # Single concatenation instead of rebuilding the tensor per merged piece.
    merged = torch.cat(parts, dim=0)[:max_length]

    if merged.shape[0] < max_length:
        # Heavily fragmented text can leave fewer than `max_length` rows;
        # zero-fill so every sentence has the same shape.
        filler = merged.new_zeros((max_length - merged.shape[0], merged.shape[1]))
        merged = torch.cat((merged, filler), dim=0)
    return merged.unsqueeze(0)

### Concatenate XLNet Subwords

In [None]:
# Module-level scratch values. conc_XLNet_subwords below never reads them.
i = 0
word_count = 1
next_vector = False

def conc_XLNet_subwords(text):
    """Collapse XLNet sub-word vectors into one vector per word.

    A piece that starts with '▁' opens a new word; every following piece
    without that marker is treated as a continuation. Each word is
    represented by the mean of the hidden states of its pieces.

    Args:
        text: the sentence to embed.

    Returns:
        A tensor of shape (1, 50, hidden_size). It is laid out as one vector
        per word, then the special-token and padding vectors, cut to the
        first 50 positions (zero-filled in the unlikely case that fewer remain).
    """
    max_length = 50  # positions kept; has to match the other embeddings that get concatenated
    hidden = create_XLNet_vectors(text)[0]
    subwords = get_XLNet_subwords(text)

    # XLNet appends its two special tokens (<sep>, <cls>) AFTER the pieces,
    # so there is no leading special position as with BERT.
    n_special = 2
    seq_len = hidden.shape[0]

    # XLNet tokenizers pad on the left by default, which would put padding
    # (not the sentence) at the front. Rotate the padding to the end so that
    # piece k sits at position k.
    n_pad = max(seq_len - len(subwords) - n_special, 0)
    if n_pad and getattr(xlnet_tokenizer, "padding_side", "right") == "left":
        hidden = torch.cat((hidden[n_pad:], hidden[:n_pad]), dim=0)

    # If the encoder input was truncated, later pieces have no hidden state.
    n_pieces = max(min(len(subwords), seq_len - n_special), 0)

    # Group piece positions into words as half-open [start, end) spans.
    spans = []
    for k in range(n_pieces):
        if spans and not subwords[k].startswith('▁'):
            spans[-1][1] = k + 1
        else:
            spans.append([k, k + 1])

    parts = [hidden[start:end].mean(dim=0, keepdim=True) for start, end in spans]
    parts.append(hidden[n_pieces:])
    # Single concatenation instead of rebuilding the tensor per merged piece.
    merged = torch.cat(parts, dim=0)[:max_length]

    if merged.shape[0] < max_length:
        # Heavily fragmented text can leave fewer than `max_length` rows;
        # zero-fill so every sentence has the same shape.
        filler = merged.new_zeros((max_length - merged.shape[0], merged.shape[1]))
        merged = torch.cat((merged, filler), dim=0)
    return merged.unsqueeze(0)

In [None]:
def binary_focal_loss(gamma=2., alpha=.25):
    """
    Build a binary focal loss for use with Keras.
      FL(p_t) = -alpha_t * (1 - p_t)**gamma * log(p_t)
      p_t is the predicted probability where the label is 1 and
      (1 - probability) elsewhere; alpha_t is `alpha` where the label is 1
      and (1 - alpha) elsewhere.
    References:
        https://arxiv.org/pdf/1708.02002.pdf
    Usage:
     model.compile(loss=[binary_focal_loss(alpha=.25, gamma=2)], metrics=["accuracy"], optimizer=adam)
    """

    def binary_focal_loss_fixed(y_true, y_pred):
        """
        :param y_true: A tensor of the same shape as `y_pred`
        :param y_pred:  A tensor resulting from a sigmoid
        :return: Scalar tensor: per-sample losses summed over axis 1, then averaged.
        """
        labels = tf.cast(y_true, tf.float32)
        is_positive = tf.equal(labels, 1)

        # Keep predictions strictly inside (0, 1) so the logarithm stays finite.
        eps = K.epsilon()
        probs = tf.clip_by_value(y_pred, eps, 1.0 - eps)

        # Probability assigned to the true class, and the matching class weight.
        p_t = tf.where(is_positive, probs, 1 - probs)
        alpha_pos = tf.ones_like(labels) * alpha
        alpha_t = tf.where(is_positive, alpha_pos, 1 - alpha_pos)

        # Modulating factor times cross entropy.
        focal = alpha_t * tf.pow(1 - p_t, gamma) * (-tf.math.log(p_t))
        return tf.reduce_mean(tf.reduce_sum(focal, axis=1))

    return binary_focal_loss_fixed

# Two small sample frames (first ten rows each) for trying out the vectorizer.
# Explicit copies: adding a column to the bare result of `head()` triggers
# pandas' chained-assignment warning.
df_s_train = df.head(10).copy()
df_s_test = df.head(10).copy()

# NOTE(review): `generate_conc_vector` is not defined in the visible part of
# this file — confirm it exists before running this cell.
# Each frame gets its vectors from its own sentences.
df_s_train['vector'] = df_s_train.apply(lambda row: generate_conc_vector(row['sentence']), axis=1)
df_s_test['vector'] = df_s_test.apply(lambda row: generate_conc_vector(row['sentence']), axis=1)

In [None]:
def _embed_sentences(sentences, embed_fn):
    """Embed every sentence with `embed_fn` and stack the results along axis 0."""
    batches = []
    for sentence in tqdm(sentences):
        vector = embed_fn(sentence)
        if isinstance(vector, torch.Tensor):
            # Hand plain arrays to TensorFlow, detached from any autograd graph.
            vector = vector.detach().numpy()
        batches.append(vector)
    # One concatenation at the end instead of re-copying the growing batch
    # once per sentence.
    return tf.concat(batches, axis=0)


def _build_model(input_shape):
    """Create and compile the RNN/BiLSTM classifier for inputs of `input_shape`."""
    model = Sequential()
    model.add(Input(shape=input_shape, name='Conc-Vector'))
    model.add(Dropout(0.1))
    model.add(SimpleRNN(512, activation='relu', return_sequences=True))
    model.add(Dropout(0.1))
    model.add(Bidirectional(LSTM(256)))
    model.add(Dense(1, activation='sigmoid'))

    metrics = [
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
    ]
    model.compile(optimizer='adam',
                  loss=[binary_focal_loss(alpha=.8, gamma=2)],
                  metrics=metrics)
    return model


def evaluation(train_df, test_df):
    """Train the ensemble-embedding classifier on one split and score it.

    Sentences are cut with `truncate_sentence`, embedded with GloVe, BERT
    and XLNet, and the three embeddings are concatenated along the feature
    axis. A fresh model is fitted on the training part for 7 epochs (with
    the test part passed as validation data) and then scored on the test part.

    Args:
        train_df: frame with `sentence` and `label` columns used for fitting.
        test_df: frame with `sentence` and `label` columns used for scoring.
            Neither frame is modified.

    Returns:
        Tuple `(f1, acc, mcc)`: macro F1, accuracy and Matthews correlation
        of the rounded predictions on the test part.
    """
    # Truncate into new arrays rather than overwriting the callers' column:
    # the frames are usually slices of a larger frame.
    x_train = np.asarray(train_df.sentence.apply(truncate_sentence))
    x_test = np.asarray(test_df.sentence.apply(truncate_sentence))
    y_train = np.asarray(train_df.label)
    y_test = np.asarray(test_df.label)

    print("-- starting bert --")
    bert_vectors = _embed_sentences(x_train, conc_BERT_subwords)
    bert_vectors_test = _embed_sentences(x_test, conc_BERT_subwords)

    print("-- starting Xl-NET --")
    XLNet_vectors = _embed_sentences(x_train, conc_XLNet_subwords)
    XLNet_vectors_test = _embed_sentences(x_test, conc_XLNet_subwords)

    print("-- starting glove --")
    glove_vectors = _embed_sentences(x_train, create_GLOVE_vector)
    glove_vectors_test = _embed_sentences(x_test, create_GLOVE_vector)

    print("-- concat vectors --")
    # Join the three embeddings per position along the feature axis.
    conc_vector = tf.concat([glove_vectors, bert_vectors, XLNet_vectors], axis=2)
    print(conc_vector.shape)
    conc_vector_test = tf.concat(
        [glove_vectors_test, bert_vectors_test, XLNet_vectors_test], axis=2)

    print("-- create model --")
    # Take the input shape from the data so the model follows the embedding
    # sizes instead of a hard-coded feature width.
    model = _build_model(tuple(conc_vector.shape[1:]))

    model.fit(conc_vector, y_train, epochs=7, validation_data=(conc_vector_test, y_test))
    # Flatten the (n, 1) sigmoid output to the 1-D label shape sklearn expects.
    y_preds = np.round(model.predict(conc_vector_test)).ravel()

    f1 = f1_score(y_test, y_preds, average="macro")
    acc = accuracy_score(y_test, y_preds)
    mcc = matthews_corrcoef(y_test, y_preds)
    return f1, acc, mcc
    

### load data and evaluate

In [None]:
# Read the labeled corpus again and rebuild the per-category sub-frames.
df = pd.read_csv("../data/labeled/combined.csv")
electronics, pet, baby, sports = (
    df.groupby(df.category).get_group(label)
    for label in ("Electronics", "Pet supplies", "Baby", "Sport outdoors")
)

In [None]:
kf = KFold(n_splits = 5, shuffle = True, random_state = 2)
data = []

# Run the same 5-fold protocol for every category, in this order; each fold
# contributes one result row tagged with the category name.
category_frames = [
    ("baby", baby),
    ("pet", pet),
    ("sports", sports),
    ("electronics", electronics),
]

for category_name, data_df in category_frames:
    for train_index, test_index in kf.split(data_df):
        # Independent copies, so nothing done to a fold can leak back into
        # the category frame or trigger pandas' chained-assignment warning.
        train_df = data_df.iloc[train_index].copy()
        test_df = data_df.iloc[test_index].copy()
        f1, acc, mcc = evaluation(train_df, test_df)
        data.append([category_name, f1, acc, mcc])

df_result = pd.DataFrame(data, columns = ['category', 'f1-score', 'accuracy', 'matthews-corr'])

In [None]:
# Mean of each score column per category, across that category's folds.
df_result.groupby("category").mean()

In [None]:
# Write the per-fold scores to CSV, without the index column.
df_result.to_csv('../results/ensemble-REE.csv', index=False)