# Imports

In [None]:
import numpy as np
import pandas as pd
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification, BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import keras.backend as K
import re
import string
import transformers
import sklearn
import sys
from sklearn.model_selection import train_test_split
import pickle
import json

# Data Importation

In [None]:
# The train/test/val split is produced beforehand by the "NLP Set Splitter"
# notebook; this cell only loads the three resulting CSV files.
train, test, val = [
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv', 'val.csv')
]

# Text Cleaning

In [None]:
def clean_text(text, lower=False, remove_numbers=True):
    """
    Convert raw text to text that can be better utilized by our encoder

    Punctuation and symbols are stripped first, digits are optionally removed,
    and finally every run of whitespace is collapsed to a single space.
    :param text: Raw text data from survey (must be a str)
    :param lower: Whether to convert all text to lower-case
    :param remove_numbers: Whether to delete all numeric values from the text
    :return: modified text to be fed into encoder
    """
    # convert to lowercase if desired
    if lower:
        text = text.lower()

    # remove garbage characters: anything that is neither a word character
    # nor whitespace (underscores survive because \w matches them)
    text = re.sub(r'[^\w\s]', '', text)

    # remove numbers if desired
    if remove_numbers:
        text = re.sub(r'\d+', '', text)

    # Collapse the whitespace runs left behind by the removals above.
    # The pattern used to be r']\s+', which needs a literal ']' that the
    # punctuation strip has already deleted, so it could never match.
    return re.sub(r'\s+', ' ', text).strip()
    

# Jaccard Loss Function

In [None]:
def jaccard_loss(y_true, y_pred):
    """
    Compute the soft Jaccard loss for model training
    :param y_true: Ground truth tensor. It is multiplied element-wise with the
       class probabilities, so it presumably has one 0/1 column per class
       (the layout train_mods builds) -- TODO confirm.
    :param y_pred: Prediction tensor. A softmax is always applied to it over
       the last axis, so raw logits are expected.
    :return: tensor holding 1 - soft Jaccard index, reduced over axis 1.
    """
    # class probabilities and targets, both as float32
    probabilities = tf.cast(tf.nn.softmax(y_pred, axis=-1), tf.float32)
    targets = tf.cast(y_true, tf.float32)

    # soft intersection and union, summed over axis 1
    overlap = tf.reduce_sum(targets * probabilities, axis=1)
    combined = tf.reduce_sum(targets + probabilities, axis=1) - overlap

    # the small constant guards against division by zero
    return 1 - overlap / (combined + 1e-10)

# Training Function

In [1]:
def _encode_texts(texts, tokenizer, max_length=512):
    """Clean and tokenize every text; return (input_ids, attention_masks) as arrays."""
    input_ids = []
    attention_masks = []
    for sentence in texts:
        encoded = tokenizer.encode_plus(
            clean_text(sentence),
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids), np.array(attention_masks)


def _multi_hot_labels(labels, num_classes):
    """Turn comma-separated label strings into an (n_rows, num_classes) 0/1 int matrix."""
    parsed = [list(map(int, str(item).split(','))) for item in labels]
    encoded = np.zeros((len(parsed), num_classes), dtype=int)
    for row, row_labels in enumerate(parsed):
        for label in row_labels:
            # Column index == label id. Label 0 and ids outside the class
            # range are ignored, so column 0 always stays empty.
            if 1 <= label < num_classes:
                encoded[row, label] = 1
    return encoded


def train_mods(train, valid, num_classes, learning_rate, opt, model, class_weights):
    """
    Perform downstream training of BERT or DistilBERT using selected hyperparameters
    :param train: set of training data, with a 'text' column and a 'labels'
       column of comma-separated integer label ids
    :param valid: set of validation data, same layout as train
    :param num_classes: number of output classes; also the width of the label matrices
    :param learning_rate: learning rate to be used by the optimizer for training
    :param opt: optimizer to be used for training ("ADAM", "SGD" or "RMSPROP")
    :param model: NLP model to be used ("BERT" or "DISTILBERT")
    :param class_weights: dictionary of class weights, passed to fit() as class_weight
    :return: tuple (history, x_val, y_val, mask_val, mod): the Keras training
       history, the validation token ids, the multi-hot validation label
       matrix, the validation attention masks, and the trained model
    :raises ValueError: if model or opt is not one of the supported names
    """
    architectures = {
        "BERT": ("bert-base-uncased", BertTokenizer, TFBertForSequenceClassification),
        "DISTILBERT": ("distilbert-base-uncased", DistilBertTokenizer, TFDistilBertForSequenceClassification),
    }
    optimizers = {
        "ADAM": tf.keras.optimizers.Adam,
        "SGD": tf.keras.optimizers.SGD,
        "RMSPROP": tf.keras.optimizers.RMSprop,
    }

    # Reject unknown names before any weights are downloaded or data is encoded;
    # previously they fell through and surfaced later as a NameError.
    if model not in architectures:
        raise ValueError(f"Unsupported model {model!r}; expected one of {sorted(architectures)}")
    if opt not in optimizers:
        raise ValueError(f"Unsupported optimizer {opt!r}; expected one of {sorted(optimizers)}")

    # create model instantiations based off what was passed in
    model_name, tokenizer_cls, model_cls = architectures[model]
    tokenizer = tokenizer_cls.from_pretrained(model_name)
    mod = model_cls.from_pretrained(model_name, num_labels=num_classes)

    # numerical representation of the text plus multi-hot label matrices
    x_train, mask_train = _encode_texts(train['text'], tokenizer)
    y_train = _multi_hot_labels(train['labels'], num_classes)

    x_val, mask_val = _encode_texts(valid['text'], tokenizer)
    y_val = _multi_hot_labels(valid['labels'], num_classes)

    # creating our optimizer based on passed in parameters
    optimizer = optimizers[opt](learning_rate=learning_rate)
    mod.compile(optimizer=optimizer, loss=jaccard_loss)

    # early stopping in case the model just really isn't working
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)

    # actual training step
    # NOTE(review): confirm how Keras applies class_weight to multi-hot targets.
    history = mod.fit(
        [x_train, mask_train],
        y_train,
        batch_size=8,
        validation_data=([x_val, mask_val], y_val),
        callbacks=[early_stopping],
        epochs=250,
        class_weight=class_weights,
    )
    return (history, x_val, y_val, mask_val, mod)

# Jaccard Metric

In [None]:
def jaccard_similarity(setA, setB):
    """
    Compute the Jaccard similarity for two sets of labels
    :param setA: true labels as a comma-separated string, optionally wrapped in braces
    :param setB: predicted labels in the same format
    :return: Jaccard Similarity Score (|A & B| / |A | B|), or 0 when both sets are empty
    :raises ValueError: if a non-empty entry cannot be read as a number
    """
    def _parse(labels):
        # Go through float so that "3" and "3.0" both map to 3, and skip
        # empty entries so that "" parses to the empty set instead of crashing.
        tokens = (token.strip() for token in labels.strip("{}").split(','))
        return {int(float(token)) for token in tokens if token}

    true_labels = _parse(setA)
    predicted_labels = _parse(setB)

    # jaccard calculations
    union = len(true_labels | predicted_labels)
    if union == 0:
        return 0
    return len(true_labels & predicted_labels) / union


# Testing Function

In [None]:
def model_testing(mod, x_test, y_test, mask_test, num_classes):
    """
    Check how the models are performing on validation data or test data
    :param mod: the tensorflow model we're interested in testing
    :param x_test: numerical representation of text data we're testing
    :param y_test: 2-D matrix of true labels, one column per class (non-zero = label present)
    :param mask_test: encoded masks of the text data, necessary for BERT to operate
    :param num_classes: number of factors used in that category during hand-labelling;
       only the first num_classes columns of y_test are read
    :return: list of mean Jaccard Similarity Scores, one per probability cutoff
       0.01 .. 0.99 in ascending order. Cutoffs at which no class of any
       response is accepted are skipped, so the list can be shorter than 99.
    """
    # find probabilities of outputs (the whole set goes through in one forward pass)
    outputs = mod(x_test, attention_mask=mask_test)
    probs = np.asarray(tf.nn.softmax(outputs.logits, axis=-1))

    # The true label sets do not depend on the cutoff, so build them once.
    y_true = np.asarray(y_test)[:, :num_classes]
    true_sets = [set(np.flatnonzero(row).tolist()) for row in y_true]

    avg_jac = []
    for cutoff in (i / 100 for i in range(1, 100)):
        accepted = probs > cutoff
        if not accepted.any():
            # nothing was predicted for any response at this cutoff
            continue

        scores = []
        for true_labels, accepted_row in zip(true_sets, accepted):
            # A response with no accepted class is scored as if class 0 had
            # been predicted for it.
            predicted = set(np.flatnonzero(accepted_row).tolist()) or {0}
            union = len(true_labels | predicted)
            scores.append(len(true_labels & predicted) / union if union else 0)

        avg_jac.append(sum(scores) / len(scores))
    return avg_jac

# Cleaning Function

In [None]:
def _collect_labelled_responses(full_df, num_labs, text_columns):
    """Stack the labelled answers of several survey questions into one text/labels frame."""
    rows = []
    for text_col in text_columns:
        label_col = text_col + '_labels'
        subset = full_df[[text_col, label_col]].iloc[0:num_labs]
        # explode only has an effect when a cell holds a list of labels
        subset = subset.explode(label_col, ignore_index=True)
        # A label of 0 marks a non-response. Compare as text so that the
        # filter also works when the column holds integers instead of strings.
        is_response = subset[label_col].astype(str).str.strip() != '0'
        rows.extend(subset[is_response].values.tolist())
    # Passing the column names here keeps an empty result well-formed.
    return pd.DataFrame(rows, columns=['text', 'labels'])


def cleaning_func(num_labs, full_df):
    """
    Split datasets into positive, negative, and other sets. Also removes non-responses
    :param num_labs: number of leading rows of full_df to use
    :param full_df: the dataset dataframe to be split up; each question column
       must be accompanied by a '<question>_labels' column
    :return: positive responses, negative responses, other responses, each in their
       own dataframe with the columns 'text' and 'labels'
    """
    # Positive set
    pos_train = _collect_labelled_responses(full_df, num_labs, [
        'factors_would_attract_affiliate',
        'factors_did_attract_affiliate',
    ])

    # Negative set
    neg_train = _collect_labelled_responses(full_df, num_labs, [
        'changes_keep',
        'factors_would_deter_affiliate',
        'drawbacks_affiliate',
        'factors_prevent_satisfaction',
    ])

    # Other set (additional comments come first, then the evaluation factors)
    other_train = _collect_labelled_responses(full_df, num_labs, [
        'additional_comments',
        'factors_consider_during_eval',
    ])

    return (pos_train, neg_train, other_train)

# Class Weight Dicts


In [None]:
# Weights calculated from the occurrence of each factor in the training set:
# every class 1..N gets <fixed numerator> / <count for that factor>, while
# class 0 keeps a weight of 1.
pos_class_weights = {
    0: 1,
    **{
        label: 267 / count
        for label, count in enumerate((42, 57, 60, 31, 10, 23, 9, 19, 13, 3), start=1)
    },
}

neg_class_weights = {
    0: 1,
    **{
        label: 821 / count
        for label, count in enumerate(
            (101, 59, 91, 133, 65, 42, 102, 109, 26, 26, 38, 12, 9, 8), start=1
        )
    },
}

other_class_weights = {
    0: 1,
    **{
        label: 678 / count
        for label, count in enumerate(
            (69, 71, 32, 90, 70, 92, 41, 56, 98, 18, 35, 6), start=1
        )
    },
}


# Splitting Data

In [None]:
# Break each split into its positive / negative / other response sets.
# The first argument is the number of leading rows taken from that split.
pos_train, neg_train, other_train = cleaning_func(num_labs=250, full_df=train)
pos_val, neg_val, other_val = cleaning_func(num_labs=50, full_df=val)
pos_test, neg_test, other_test = cleaning_func(num_labs=50, full_df=test)




# Model Training and Validation

In [None]:
outputs = {}
# every score list is zero-padded to this length so the lists can share a DataFrame
max_len = 100

# `num_labs` is used as a suffix for every saved model and result file, but it
# was never assigned anywhere in this file, so this cell raised a NameError.
# NOTE(review): 250 mirrors the row count passed to cleaning_func for the
# training split -- confirm this is the intended suffix.
num_labs = 250

# creating and training a model in each category using our best model hyperparameters found in our search
# makes a pickle file of each model's performance at accepted probability cutoffs from 0.01 to 0.99

pos_1 = "weightBERTADAM0.00001" + str(num_labs)
mod_name = pos_1
pos1_hist, pos1_x_val, pos1_y_val, pos1_mask_val, pos1_mod = train_mods(pos_train, pos_val, 11, 0.00001, 'ADAM', 'BERT', pos_class_weights)
pos1_mod.save(mod_name)
pos1_jac = model_testing(pos1_mod, pos1_x_val, pos1_y_val, pos1_mask_val, 11)
pos1_jac.extend([0] * (max_len - len(pos1_jac)))

pos_2 = "weightDISTILBERTADAM0.00001" + str(num_labs)
mod_name = pos_2
pos2_hist, pos2_x_val, pos2_y_val, pos2_mask_val, pos2_mod = train_mods(pos_train, pos_val, 11, 0.00001, 'ADAM', 'DISTILBERT', pos_class_weights)
pos2_mod.save(mod_name)
pos2_jac = model_testing(pos2_mod, pos2_x_val, pos2_y_val, pos2_mask_val, 11)
pos2_jac.extend([0] * (max_len - len(pos2_jac)))

# pos_1 was trained and scored but its scores were dropped from the results;
# it is now reported alongside pos_2, matching the other two categories.
pos_dat = {pos_1: pos1_jac, pos_2: pos2_jac}
pos_output = pd.DataFrame(pos_dat)
outputs["Positive"] = pos_output

neg_1 = "weightDISTILBERTRMSPROP0.00005" + str(num_labs)
mod_name = neg_1
neg1_hist, neg1_x_val, neg1_y_val, neg1_mask_val, neg1_mod = train_mods(neg_train, neg_val, 15, 0.00005, 'RMSPROP', 'DISTILBERT', neg_class_weights)
neg1_mod.save(mod_name)
neg1_jac = model_testing(neg1_mod, neg1_x_val, neg1_y_val, neg1_mask_val, 15)
neg1_jac.extend([0] * (max_len - len(neg1_jac)))

neg_2 = "weightDISTILBERTRMSPROP0.00002" + str(num_labs)
mod_name = neg_2
# The learning rate now matches the model name (0.00002). The call used to
# pass 0.00005, which made neg_2 a repeat of neg_1 saved under a misleading name.
neg2_hist, neg2_x_val, neg2_y_val, neg2_mask_val, neg2_mod = train_mods(neg_train, neg_val, 15, 0.00002, 'RMSPROP', 'DISTILBERT', neg_class_weights)
neg2_mod.save(mod_name)
neg2_jac = model_testing(neg2_mod, neg2_x_val, neg2_y_val, neg2_mask_val, 15)
neg2_jac.extend([0] * (max_len - len(neg2_jac)))

neg_dat = {neg_1: neg1_jac, neg_2: neg2_jac}
neg_output = pd.DataFrame(neg_dat)

outputs["Negative"] = neg_output

other_1 = "weightBERTRMSPROP0.0001" + str(num_labs)
mod_name = other_1
other1_hist, other1_x_val, other1_y_val, other1_mask_val, other1_mod = train_mods(other_train, other_val, 13, 0.0001, 'RMSPROP', 'BERT', other_class_weights)
other1_mod.save(mod_name)
other1_jac = model_testing(other1_mod, other1_x_val, other1_y_val, other1_mask_val, 13)
other1_jac.extend([0] * (max_len - len(other1_jac)))

other_2 = "weightBERTRMSPROP0.00005" + str(num_labs)
mod_name = other_2
other2_hist, other2_x_val, other2_y_val, other2_mask_val, other2_mod = train_mods(other_train, other_val, 13, 0.00005, 'RMSPROP', 'BERT', other_class_weights)
other2_mod.save(mod_name)
other2_jac = model_testing(other2_mod, other2_x_val, other2_y_val, other2_mask_val, 13)
other2_jac.extend([0] * (max_len - len(other2_jac)))

other_dat = {other_1: other1_jac, other_2: other2_jac}
other_output = pd.DataFrame(other_dat)

outputs["Other"] = other_output

pkl_name = 'extended_results_weighted_pos_2' + str(num_labs) + '.pkl'
# NOTE(review): json_name is built but nothing is written to it in this cell;
# the training histories (*_hist) are currently not persisted.
json_name = 'model_histores' + str(num_labs) + '.json'
with open(pkl_name, 'wb') as handle:
    pickle.dump(outputs, handle, protocol=pickle.HIGHEST_PROTOCOL)
