# Imports

In [None]:
import pandas as pd
import tensorflow as tf
import pickle
import numpy as np
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification, BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Jaccard Loss

In [None]:
def jaccard_loss(y_true, y_pred):
    """
    Soft Jaccard (intersection over union) loss for model training
    :param y_true: Ground truth tensor. It is cast to float32 and combined element-wise
        with the class probabilities, so it presumably has the same shape as y_pred - TODO confirm.
    :param y_pred: Raw model outputs. A softmax over the last axis is always applied here,
        so values that are already probabilities would be normalised a second time.
    :return: Tensor holding 1 - Jaccard similarity, with the sums taken over axis 1.
    """
    # turn the model outputs into per-class probabilities, then make both operands float32
    probabilities = tf.cast(tf.nn.softmax(y_pred, axis=-1), tf.float32)
    targets = tf.cast(y_true, tf.float32)

    # soft intersection and union of targets and probabilities
    overlap = tf.reduce_sum(targets * probabilities, axis=1)
    combined = tf.reduce_sum(targets + probabilities, axis=1) - overlap

    # the tiny constant keeps the division defined when the union is zero
    similarity = overlap / (combined + 1e-10)
    return 1 - similarity

# Mods and Data

In [None]:
#load the best saved model of every category and the unlabelled data
#jaccard_loss is handed over as a custom object under the name 'jaccard_loss' for each load
custom_losses = {'jaccard_loss': jaccard_loss}
pos_mod = tf.keras.models.load_model('weight_pos_mod_2', custom_objects=custom_losses)
neg_mod = tf.keras.models.load_model('weight_neg_mod', custom_objects=custom_losses)
other_mod = tf.keras.models.load_model('weight_other_mod', custom_objects=custom_losses)
unlab = pd.read_excel('Data.xlsx')


# Cleaning Function

In [None]:
def splitting_func(num_labs, full_df):
    """
    Cut the dataset into its positive, negative, and other question columns, dropping the leading rows
    :param num_labs: how many rows at the top of the dataframe to skip (the already labelled responses)
    :param full_df: the dataset dataframe to be split up
    :return: positive responses, negative responses, other responses, each in their own dataframe
    """
    #question columns of each category, in the order positive, negative, other
    question_groups = (
        ['factors_would_attract_affiliate', 'factors_did_attract_affiliate'],
        ['changes_keep', 'factors_would_deter_affiliate', 'drawbacks_affiliate', 'factors_prevent_satisfaction'],
        ['factors_consider_during_eval', 'additional_comments'],
    )

    #select each group of columns, then keep only the rows from position num_labs onwards
    pos_full, neg_full, other_full = (
        full_df[columns].iloc[num_labs:] for columns in question_groups
    )
    return pos_full, neg_full, other_full

# Call to Cleaning, Splitting up into Categories

In [None]:
#skip the first 250 rows of the dataset and split the remaining rows by category
pos_full, neg_full, other_full = splitting_func(num_labs=250, full_df=unlab)

# Pre-Processing our Data for Prediction

In [None]:
def bonus_func(column, tokenizer, max_length=512):
    """
    Encode one column of responses into ids and masks
    :param column: the column of our dataset that we want to encode, should be applied to all columns
    :param tokenizer: which tokenizer needs to be used to ensure the models receive necessary information;
        it must provide encode_plus returning 'input_ids' and 'attention_mask'
    :param max_length: length every response is truncated/padded to (defaults to 512)
    :return: input id array and attention mask array, one row per element of column
    """
    input_ids = []
    attention_masks = []

    for element in column:
        #a missing response is encoded as an empty string so that it still produces a row
        #and the outputs stay lined up one-to-one with the rows of the dataframe
        text = "" if pd.isna(element) else element
        encoded = tokenizer.encode_plus(
            text,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    input_ids = np.array(input_ids)
    attention_masks = np.array(attention_masks)
    return (input_ids, attention_masks)
    

In [None]:
def pre_proc(model, df):
    """
    Pick the tokenizer that belongs to the given model family and encode every column of the dataset with it
    :param model: name of the model being used currently, either "BERT" or "DISTILBERT"
    :param df: the dataset we want encoded in that model's fashion
    :return: two lists with one entry per column of df: the input id arrays and the corresponding attention mask arrays
    :raises ValueError: if model is not one of the supported names
    """
    if model == "BERT":
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    elif model == "DISTILBERT":
        tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    else:
        #fail straight away instead of hitting an unbound tokenizer further down
        raise ValueError(
            "Unsupported model name " + repr(model) + "; expected 'BERT' or 'DISTILBERT'"
        )

    input_ids = []
    attention_masks = []

    #encode each question column separately so the outputs can be indexed per question
    for column in df:
        column_ids, column_masks = bonus_func(df[column], tokenizer)
        input_ids.append(column_ids)
        attention_masks.append(column_masks)

    return (input_ids, attention_masks)

# Calls to Pre-processing

In [None]:
#Encode every category with the tokenizer of the model family used for it.
#If a category ever mixes model families across its questions, these calls have to be split up further.
pos_ids, pos_masks = pre_proc(model='DISTILBERT', df=pos_full)
neg_ids, neg_masks = pre_proc(model='DISTILBERT', df=neg_full)
other_ids, other_masks = pre_proc(model='BERT', df=other_full)

# Let's Get a Lil Funky (necessary comment at the top of the first block in this section)

In [None]:
#This is where we have the models make predictions on the data. BERT models require the token type ids in order to function
#DistilBERT models do not, so the positive inputs only carry the ids and the attention masks.
#Each category has a block like this one. Results are dumped into a pickle file for each category,
#containing a dictionary of model outputs for each question.
#
#pos_ids / pos_masks hold one array per positive question column. The output of predict is used as a
#dictionary: the first question keeps its keys unchanged, question k (k >= 2) gets the prefix 'dictk_'
#so the merged dictionary has no clashing keys and keeps the questions in column order.
combined_dict = {}
for question_no, (question_ids, question_masks) in enumerate(zip(pos_ids, pos_masks), start=1):
    inputs = {
        'input_ids': tf.convert_to_tensor(question_ids),
        'attention_mask': tf.convert_to_tensor(question_masks),
    }
    output = pos_mod.predict(inputs)

    prefix = '' if question_no == 1 else 'dict' + str(question_no) + '_'
    combined_dict.update({prefix + key: value for key, value in output.items()})

with open('best_pos_dict.pkl', 'wb') as f:
    pickle.dump(combined_dict, f)


In [None]:
#Predictions of the negative model for every negative question column.
#The negative data was encoded for a DistilBERT model, so no token type ids are passed in.
#
#neg_ids / neg_masks hold one array per negative question column. The output of predict is used as a
#dictionary: the first question keeps its keys unchanged, question k (k >= 2) gets the prefix 'dictk_'
#so the merged dictionary has no clashing keys and keeps the questions in column order.
combined_dict = {}
for question_no, (question_ids, question_masks) in enumerate(zip(neg_ids, neg_masks), start=1):
    inputs = {
        'input_ids': tf.convert_to_tensor(question_ids),
        'attention_mask': tf.convert_to_tensor(question_masks),
    }
    output = neg_mod.predict(inputs)

    prefix = '' if question_no == 1 else 'dict' + str(question_no) + '_'
    combined_dict.update({prefix + key: value for key, value in output.items()})

with open('best_neg_dict.pkl', 'wb') as f:
    pickle.dump(combined_dict, f)

In [None]:
#Predictions of the other model for every other question column.
#The other data was encoded for a BERT model, so all-zero token type ids
#(same shape as the input ids) are passed in alongside the ids and attention masks.
#
#other_ids / other_masks hold one array per other question column. The output of predict is used as a
#dictionary: the first question keeps its keys unchanged, question k (k >= 2) gets the prefix 'dictk_'
#so the merged dictionary has no clashing keys and keeps the questions in column order.
combined_dict = {}
for question_no, (question_ids, question_masks) in enumerate(zip(other_ids, other_masks), start=1):
    tensor_input_ids = tf.convert_to_tensor(question_ids)
    inputs = {
        'input_ids': tensor_input_ids,
        'attention_mask': tf.convert_to_tensor(question_masks),
        'token_type_ids': tf.zeros_like(tensor_input_ids, dtype=tf.int32),
    }
    output = other_mod.predict(inputs)

    prefix = '' if question_no == 1 else 'dict' + str(question_no) + '_'
    combined_dict.update({prefix + key: value for key, value in output.items()})

with open('best_other_dict.pkl', 'wb') as f:
    pickle.dump(combined_dict, f)

In [None]:
#loading in our full dataset fresh as well as the pickle files created above
test = pd.read_excel('Data.xlsx')
pos_files = ['best_pos_dict.pkl']
neg_files = ['best_neg_dict.pkl']
other_files = ['best_other_dict.pkl']

#hand-made corrections for specific "other" responses, keyed by the pickle key of the question:
#each entry is a (response_index, label) pair added on top of the thresholded predictions.
#These are tied to the current data and will likely need to change.
other_fixes = {
    'logits': [(3483, 0)],
    'dict2_logits': [(1196, 0), (3454, 0)],
}


def _label_column(logits, cutoff, target_index, forced_labels=()):
    """
    Turn the model outputs of one question into a column of predicted label lists
    :param logits: model outputs for one question, one row per response; softmax is applied over the last axis
    :param cutoff: a label is predicted when its probability is strictly greater than this value
    :param target_index: index of the full dataset the column is going to be assigned to
    :param forced_labels: (response_index, label) pairs added by hand; pairs outside the response range are ignored
    :return: Series of label lists, indexed by the rows of target_index the responses belong to
    :raises ValueError: if there are more responses than rows in the full dataset
    """
    n_responses = np.shape(logits)[0]

    #the models only saw full_df.iloc[num_labs:], i.e. the tail of the dataset, so response k
    #belongs to row (number of rows - number of responses + k) of the full dataset
    first_row = len(target_index) - n_responses
    if first_row < 0:
        raise ValueError("more predicted responses than rows in the dataset")

    probs = tf.nn.softmax(logits, axis=-1)
    inds_above = tf.where(probs > cutoff)
    preds_df = pd.DataFrame(inds_above.numpy(), columns=['response_index', 'predicted_label'])

    extra_rows = [pair for pair in forced_labels if 0 <= pair[0] < n_responses]
    if extra_rows:
        extra_df = pd.DataFrame(extra_rows, columns=['response_index', 'predicted_label'])
        preds_df = pd.concat([preds_df, extra_df], ignore_index=True)

    #groupby sorts by response_index and keeps the label order inside each response
    grouped = preds_df.groupby('response_index')['predicted_label'].agg(list)

    #key the lists by dataset row instead of by group position; a response without any label above
    #the cutoff is simply absent here and ends up as an empty cell rather than shifting later rows up
    grouped.index = target_index[np.asarray(grouped.index, dtype=int) + first_row]
    return grouped


def _add_prediction_columns(files, column_stem, cutoff, fixes=None, show=False):
    """
    Read the pickled outputs of one category and append one prediction column per question to test
    :param files: pickle files written by the prediction blocks of this notebook
    :param column_stem: name prefix of the new columns; a running number starting at 0 is appended
    :param cutoff: probability cutoff of the category
    :param fixes: optional dictionary mapping a pickle key to hand-made (response_index, label) pairs
    :param show: print each column before it is stored
    :return: None, the columns are added to test in place
    """
    fixes = fixes or {}
    question_no = 0
    for dat in files:
        #NOTE: pickle can run arbitrary code while loading, only open files this notebook wrote itself
        with open(dat, 'rb') as file:
            cur_dict = pickle.load(file)
        for key, logs in cur_dict.items():
            column = _label_column(logs, cutoff, test.index, fixes.get(key, ()))
            if show:
                print(column)
            #assigning a Series aligns on the index, which now holds the dataset rows
            test[column_stem + str(question_no)] = column
            question_no = question_no + 1


#positive questions, cutoff found through testing
print("POSITIVE")
_add_prediction_columns(pos_files, "pos_preds", .13)

#negative questions, cutoff found through testing
print("NEGATIVE")
_add_prediction_columns(neg_files, "neg_preds", .30)

pd.set_option('display.max_rows', 5000)

#other questions, cutoff found through testing, including the hand-made corrections
print("OTHER")
_add_prediction_columns(other_files, "other_preds", .25, fixes=other_fixes, show=True)


In [None]:
#Write the dataset together with the new prediction columns to a new excel file, leaving out the row index
test.to_excel('full_obs.xlsx', index=False)