# Loading Libraries

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

import pandas as pd
import numpy as np

import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime

import matplotlib.pyplot as plt
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import f1_score,confusion_matrix,classification_report,accuracy_score

import logging
logging.basicConfig(stream=sys.stdout, level=logging.ERROR)



pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 1000)

## Utils functions 

In [5]:
def create_examples_prediction(df):
    """Creates examples for the training and dev sets."""
    examples = []
    for index, row in df.iterrows():
        
        #labels = row[LABEL_HOT_VECTOR].strip('][').split(', ')
        #labels = [float(x) for x in labels]
        labels = list(row[label_list_text])
        examples.append(labels)
        
    return pd.DataFrame(examples)

In [6]:
def f(x):
    n = 2  # index of the second proability to get labeled 

    index = np.argsort(x.values.flatten().tolist())[-n:][0]
    print(f"index is {index}")
    label  = label_list_text[index]
    print(f"label is {label}")
    
    return label
    

In [7]:
def get_test_experiment_df(test):
    test_predictions = [x[0]['probabilities'] for x in zip(getListPrediction(in_sentences=list(test[DATA_COLUMN])))]
    test_live_labels = np.array(test_predictions).argmax(axis=1)
    test['Predicted label'] = [label_list_text[x] for x in test_live_labels] # appending the labels to the dataframe
    
    probabilities_df_live = pd.DataFrame(test_predictions) # creating a proabilities dataset
    probabilities_df_live.columns = [x + " Predicted"for x in label_list_text] # naming the columns
    probabilities_df_live['Predicted label 2'] = probabilities_df_live.apply(lambda x:f(x),axis=1)
    
    #print(test)
    #label_df = create_examples_prediction(test)
    #label_df.columns = label_list_text
    #label_df['label 2'] = label_df.apply(lambda x:f(x),axis=1)

    test.reset_index(inplace=True,drop=True) # resetting index 

    experiment_df = pd.concat([test,probabilities_df_live],axis=1, ignore_index=False)
    experiment_df = experiment_df.reindex(sorted(experiment_df.columns), axis=1)
    return test,experiment_df

In [8]:
def getListPrediction(in_sentences):
    #1
    input_examples = [InputExample(guid="", text_a = x, text_b = None, labels = [0]*len(label_list)) for x in in_sentences] # here, "" is just a dummy label
    
    #2
    input_features = convert_examples_to_features(input_examples, MAX_SEQ_LENGTH, tokenizer)
    
    #3
    predict_input_fn = input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)
    
    print(input_features[0].input_ids)
    #4
    predictions = estimator.predict(input_fn=predict_input_fn,yield_single_examples=True)
    
    return predictions

In [9]:
is_normalize_active=False

def get_confusion_matrix(y_test,predicted,labels):
    class_names=labels
    # plotting confusion matrix
    np.set_printoptions(precision=2)

    # Plot non-normalized confusion matrix
    plot_confusion_matrix(y_test, predicted, classes=class_names,
                        title='Confusion matrix, without normalization')

    # Plot normalized confusion matrix
    plot_confusion_matrix(y_test, predicted, classes=class_names, normalize=True,
                        title='Normalized confusion matrix')
    plt.show()
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes =classes
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        #print("Normalized confusion matrix")
    else:
        test =1
        #print('Confusion matrix, without normalization')

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    #ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    #fig.tight_layout()
    return ax


# Loading the data

In [10]:
def data_prep_bert(df,test_size):
    
    #print("Filling missing values")
    #df[DATA_COLUMN] = df[DATA_COLUMN].fillna('_NA_')
    
    print("Splitting dataframe with shape {} into training and test datasets".format(df.shape))
    X_train, X_test  = train_test_split(df, test_size=test_size, random_state=2018,stratify = df[LABEL_COLUMN_RAW])

    return X_train, X_test


In [11]:
def open_dataset(NAME,mapping_index,excluded_categories):
    df = pd.read_csv(PATH+NAME+'.csv',sep =',')
    
    #df[LABEL_COLUMN_RAW] = df[LABEL_COLUMN_RAW].fillna("Other")

    
    df = df[df['is_stressor'] == 1]
    df = df[df[LABEL_COLUMN_RAW] != 'Not Stressful']
    #df.columns = [LABEL_COLUMN_RAW,'Severity',DATA_COLUMN,'Source']
    
    if excluded_categories is not None:
        for category in excluded_categories:

            df = df[df[LABEL_COLUMN_RAW] !=category]

    label_list=[]
    label_list_final =[]
    if(mapping_index is None):
        df[LABEL_COLUMN_RAW] = df[LABEL_COLUMN_RAW].astype('category')
        df[LABEL_COLUMN], mapping_index = pd.Series(df[LABEL_COLUMN_RAW]).factorize() #uses pandas factorize() to convert to numerical index
        
  
    else:
        df[LABEL_COLUMN] = df[LABEL_COLUMN_RAW].apply(lambda x: mapping_index.get_loc(x))
    
    label_list_final = [None] * len(mapping_index.categories)
    label_list_number = [None] * len(mapping_index.categories)

    for index,ele in enumerate(list(mapping_index.categories)):
        lindex = mapping_index.get_loc(ele)
        label_list_number[lindex] = lindex
        label_list_final[lindex] = ele
    
    frequency_dict = df[LABEL_COLUMN_RAW].value_counts().to_dict()
    df["class_freq"] = df[LABEL_COLUMN_RAW].apply(lambda x: frequency_dict[x])
    
    
    return df,mapping_index,label_list_number,label_list_final
    

# Require user changes > Start Here 

### Experiment Name

In [12]:
PATH = './datasets/'
TODAY_DATE = "27_04_2020/"
EXPERIMENT_NAME = 'main_turk_analysis_of_5_turkers_popbots_test_live_10votes'
EXPERIMENTS_PATH = PATH + 'experiments/'+TODAY_DATE+EXPERIMENT_NAME
if not os.path.exists(PATH + 'experiments/'+TODAY_DATE):
    os.mkdir(PATH + 'experiments/'+TODAY_DATE)
if not os.path.exists(EXPERIMENTS_PATH):
    os.mkdir(EXPERIMENTS_PATH)

### Model Hyperparameters

In [13]:
# Compute train and warmup steps from batch size
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Warmup is a period of time where hte learning rate 
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 1000
SAVE_SUMMARY_STEPS = 100

# We'll set sequences to be at most 32 tokens long.
MAX_SEQ_LENGTH = 32


OUTPUT_DIR = './models/'+EXPERIMENT_NAME+ '/' #_01_04_2020/

##use downloaded model, change path accordingly
BERT_VOCAB= './bert_model/uncased_L-12_H-768_A-12/vocab.txt'
BERT_INIT_CHKPNT = './bert_model/uncased_L-12_H-768_A-12/bert_model.ckpt'
BERT_CONFIG = './bert_model/uncased_L-12_H-768_A-12/bert_config.json'


In [14]:
DATASET_NAME = '2020-04-28-Main-turk-aggregation-5-turkers'

DATA_COLUMN = 'Input.text'
LABEL_COLUMN_RAW = 'labels'#'Answer.Label'
LABEL_COLUMN = 'label_numeric'

MTURK_NAME = 'mTurk_synthetic'
LIVE_NAME = 'popbots_live'

LABEL_HOT_VECTOR = 'label_conf'

#dataset,mapping_index,label_list, label_list_text = open_dataset('mturk900balanced',None)

EXCLUDED_CATEGORIES = None #['Other'] #None # # if nothing to exclude put None, THIS ALWAYS MUST BE A LIST 
mapping_dict = {'Other': 0, 'Everyday Decision Making': 1, 'Work': 2, 'Social Relationships': 3, 'Financial Problem': 4, 'Emotional Turmoil': 5, 'Health, Fatigue, or Physical Pain': 6, 'School': 7, 'Family Issues': 8}#,'Not Stressful':9}
mapping_index = pd.CategoricalIndex([key for key,value in mapping_dict.items()])

dataset,mapping_index,label_list, label_list_text = open_dataset(DATASET_NAME,mapping_index,EXCLUDED_CATEGORIES)

#dataset = dataset[dataset['is_stressor'] == 1]

test_on_mturk_and_popbots_live = True # include live data in training + include mturk in testing


if test_on_mturk_and_popbots_live:
    
    mturk = dataset[dataset['Source']== MTURK_NAME]
    live = dataset[dataset['Source']== LIVE_NAME]
    live = live.sample(frac=1).reset_index(drop=True) # shuffle live
    
    PERCENTAGE_LIVE_TEST = 70
    
    TEST_PERCENTAGE = len(live)/((100/PERCENTAGE_LIVE_TEST)*len(mturk))  # given to set the percentage of mturk used as test set to have 50/50
    
    print(f"Test percentage is {TEST_PERCENTAGE}")

    train,test = data_prep_bert(mturk,TEST_PERCENTAGE) # test size from mturk 
    
    train = train.append(live.loc[0:int((1-(PERCENTAGE_LIVE_TEST/100))*len(live))]) # taking 1/2 of that dataset for training
    
    test = test.append(live.loc[int(len(live)*(1-(PERCENTAGE_LIVE_TEST/100))):int(len(live))]) # taking 1/2 of live dataset for testing
else:
    # or taking live only for testing
    train,test = dataset[dataset['Source']== MTURK_NAME],dataset[dataset['Source']== LIVE_NAME] 

#train = train[train['is_stressor'] == 1] # remove only non stressor from train

#print(f"Dataset has {len(dataset)} training examples")
print(f"Normal label list is {label_list}")
print(f"The labels text is {label_list_text}")

#Export train test to csv
#train.to_csv(PATH+'900_CSV_SPLITTED/train.csv')
#test.to_csv(PATH+'900_CSV_SPLITTED/test.csv')

Test percentage is 0.050670241286863274
Splitting dataframe with shape (1865, 21) into training and test datasets
Normal label list is [0, 1, 2, 3, 4, 5, 6, 7, 8]
The labels text is ['Other', 'Everyday Decision Making', 'Work', 'Social Relationships', 'Financial Problem', 'Emotional Turmoil', 'Health, Fatigue, or Physical Pain', 'School', 'Family Issues']


In [31]:

df_columns = ['category', 'nb_sentence','distinct_word_nb','distinc_word_per_sentence']
count_results = pd.DataFrame(columns = df_columns)

for category in label_list_text:
    
    category_df = dataset[dataset[LABEL_COLUMN_RAW] == category]
    category_df[DATA_COLUMN].str.lower().str.split()
    results = set()
    category_df[DATA_COLUMN].str.lower().str.split().apply(results.update)
    count_results = count_results.append({'category':category,'nb_sentence':len(category_df),'distinct_word_nb':len(list(results)),'distinc_word_per_sentence':len(list(results))/len(category_df)}, ignore_index=True)



In [32]:
count_results.sort_values(by=['distinc_word_per_sentence'])

Unnamed: 0,category,nb_sentence,distinct_word_nb,distinc_word_per_sentence
2,Work,709,1330,1.875882
4,Financial Problem,339,754,2.224189
7,School,158,409,2.588608
8,Family Issues,235,758,3.225532
6,"Health, Fatigue, or Physical Pain",129,434,3.364341
1,Everyday Decision Making,116,415,3.577586
3,Social Relationships,127,470,3.700787
0,Other,111,436,3.927928
5,Emotional Turmoil,76,338,4.447368


### Train set and test set analysis

In [407]:
def print_dataset_info(train,test):
    print(f"Train size {len(train)} with {len(train[train['Source']== LIVE_NAME])} from Popbots and {len(train[train['Source']== MTURK_NAME])} from mturk")
    print(f"Test size {len(test)} with {len(test[test['Source']== LIVE_NAME])} from Popbots and {len(test[test['Source']== MTURK_NAME])} from mturk")
    
    print('\nTraining distribution:')
    print(pd.pivot_table(train[[LABEL_COLUMN_RAW, 'Source']],index=[LABEL_COLUMN_RAW, 'Source'],columns=None, aggfunc=len)) #.to_clipboard(excel=True)
          
    print('\nTesting distribution:')
    print(pd.pivot_table(test[[LABEL_COLUMN_RAW, 'Source']],index=[LABEL_COLUMN_RAW, 'Source'],columns=None, aggfunc=len)) #.to_clipboard(excel=True)

In [408]:
len(test)

135

In [409]:
train = train.sample(frac=1).reset_index(drop=True) #reshuffle everything
test = test.sample(frac=1).reset_index(drop=True)

In [410]:
print('\nAll dataset distribution:')
print(pd.pivot_table(dataset[[LABEL_COLUMN_RAW, 'Source']],index=[LABEL_COLUMN_RAW, 'Source'],columns=None, aggfunc=len)) #.to_clipboard(excel=T


All dataset distribution:
labels                             Source         
Emotional Turmoil                  mTurk_synthetic     70
                                   popbots_live         6
Everyday Decision Making           mTurk_synthetic     99
                                   popbots_live        17
Family Issues                      mTurk_synthetic    225
                                   popbots_live        10
Financial Problem                  mTurk_synthetic    337
                                   popbots_live         2
Health, Fatigue, or Physical Pain  mTurk_synthetic    109
                                   popbots_live        20
Other                              mTurk_synthetic     97
                                   popbots_live        14
School                             mTurk_synthetic    145
                                   popbots_live        13
Social Relationships               mTurk_synthetic    113
                                   popbots_live     

In [420]:
print_dataset_info(train,test)

Train size 1811 with 41 from Popbots and 1770 from mturk
Test size 190 with 95 from Popbots and 95 from mturk

Training distribution:
labels                             Source         
Emotional Turmoil                  mTurk_synthetic     66
                                   popbots_live         2
Everyday Decision Making           mTurk_synthetic     94
                                   popbots_live         4
Family Issues                      mTurk_synthetic    214
                                   popbots_live         2
Financial Problem                  mTurk_synthetic    320
Health, Fatigue, or Physical Pain  mTurk_synthetic    103
                                   popbots_live         6
Other                              mTurk_synthetic     92
                                   popbots_live         5
School                             mTurk_synthetic    138
                                   popbots_live         3
Social Relationships               mTurk_synthetic    107
   

### Step to reduce the most dominant categories and balance the dataset

In [323]:
sampling_cutoff = 100 # all the categories which had less than 100 example won't be sampled down
total_training_size = 1501

REVERSE_FREQ = 'Max_reverse_sampling_chance'
train[REVERSE_FREQ] = train['class_freq'].apply(lambda x: (max(train['class_freq'])/x)) 

sampling_boolean = (train['Source'] != LIVE_NAME) & (train['class_freq'].astype(float) > sampling_cutoff) 


train_to_be_balanced = train[sampling_boolean]
train_not_resampled = train[~sampling_boolean]

train_temp = train_to_be_balanced.sample(n=(total_training_size-len(train_not_resampled)), weights=REVERSE_FREQ, random_state=2020)
train = pd.concat([train_temp,train_not_resampled])

In [324]:
print_dataset_info(train,test)

Train size 1501 with 0 from Popbots and 1501 from mturk
Test size 135 with 135 from Popbots and 0 from mturk

Training distribution:
labels                             Source         
Emotional Turmoil                  mTurk_synthetic     70
Everyday Decision Making           mTurk_synthetic     99
Family Issues                      mTurk_synthetic    209
Financial Problem                  mTurk_synthetic    273
Health, Fatigue, or Physical Pain  mTurk_synthetic    109
Other                              mTurk_synthetic     97
School                             mTurk_synthetic    140
Social Relationships               mTurk_synthetic    113
Work                               mTurk_synthetic    391
dtype: int64

Testing distribution:
labels                             Source      
Emotional Turmoil                  popbots_live     6
Everyday Decision Making           popbots_live    17
Family Issues                      popbots_live    10
Financial Problem                  popbots_live 

In [325]:
mapping_index

CategoricalIndex(['Other', 'Everyday Decision Making', 'Work',
                  'Social Relationships', 'Financial Problem',
                  'Emotional Turmoil', 'Health, Fatigue, or Physical Pain',
                  'School', 'Family Issues'],
                 categories=['Emotional Turmoil', 'Everyday Decision Making', 'Family Issues', 'Financial Problem', 'Health, Fatigue, or Physical Pain', 'Other', 'School', 'Social Relationships', ...], ordered=False, dtype='category')

In [326]:
train.to_csv(EXPERIMENTS_PATH+'/TRAIN_'+DATASET_NAME+'.csv')
test.to_csv(EXPERIMENTS_PATH+'/TEST_'+DATASET_NAME+'.csv')