# Licences / Notes

In [1]:
# Copyright 2019 Google Inc.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [2]:
#Adapted by Thierry Lincoln in November,2019 from this Colab notebook:
#https://colab.research.google.com/github/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb.
#Changes include:

# - Reading our stressor data and parsing it properly
# - reconfiguring the last layer to include N neurons corresponding to N categories
# - correcting the probability output so that it follows [0,1] proper pattern 
# - better analysis with confusion matrix
# - exporting to pb format for tensorflow serving api

Intro:

If you’ve been following Natural Language Processing over the past year, you’ve probably heard of BERT: Bidirectional Encoder Representations from Transformers. It’s a neural network architecture designed by Google researchers that’s totally transformed what’s state-of-the-art for NLP tasks, like text classification, translation, summarization, and question answering.

Now that BERT's been added to [TF Hub](https://www.tensorflow.org/hub) as a loadable module, it's easy(ish) to add into existing TensorFlow text pipelines. In an existing pipeline, BERT can replace text embedding layers like ELMo and GloVe. Alternatively, [finetuning](http://wiki.fast.ai/index.php/Fine_tuning) BERT can provide both an accuracy boost and faster training time in many cases.

Some code was adapted from [this colab notebook](https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb). Let's get started!

# Loading Libraries

In [3]:
import os  # imported here because this cell runs before the main import cell

# Select the CUDA 10.0 libraries and expose only GPU 0 to this kernel.
# NOTE(review): LD_LIBRARY_PATH set from inside a running process is normally
# only seen by child processes; confirm it is also exported in the shell that
# launches the kernel (see the commented `export` lines in a later cell).
os.environ['LD_LIBRARY_PATH'] = '/usr/local/cuda-10.0/lib64:/usr/local/cuda-10.0/lib'
#os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0" #(or "1" or "2")

In [4]:
import sys
print(sys.executable)

/commuter/thierrylincoln/Tf1.1_py36/bin/python3.6


In [5]:
#export LD_LIBRARY_PATH=/usr/local/cuda-10.0/lib64:/usr/local/cuda-10.0/lib
#export CUDA_VISIBLE_DEVICES=0

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime

import matplotlib.pyplot as plt
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import f1_score,confusion_matrix,classification_report,accuracy_score

import logging
logging.basicConfig(stream=sys.stdout, level=logging.ERROR)



pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 1000)

In [7]:
print(tf.__version__) #needs to be version 1.15.0, version 2.0 doesn't work with this notebook

1.15.0


In [8]:

# TF1 session configuration object; the optional JIT / GPU tweaks below are disabled.
# NOTE(review): `config` is not used in the visible part of the notebook - confirm
# it is handed to the estimator's run configuration further down.
config = tf.ConfigProto()
#config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
#config.gpu_options.visible_device_list="0"



from tensorflow.python.client import device_lib

# Last expression of the cell: displays the devices TensorFlow reports.
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 4531308141647339126, name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 409628635060813175
 physical_device_desc: "device: XLA_CPU device", name: "/device:XLA_GPU:0"
 device_type: "XLA_GPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 8222147382854925159
 physical_device_desc: "device: XLA_GPU device", name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 133431296
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 15294584257083659011
 physical_device_desc: "device: 0, name: TITAN X (Pascal), pci bus id: 0000:03:00.0, compute capability: 6.1"]

In addition to the standard libraries we imported above, we'll need to install BERT's python package.

In [9]:
#!pip install bert-tensorflow

In [10]:
import bert
from bert import run_classifier_with_tfhub
from bert import optimization
from bert import tokenization
from bert import modeling 
import numpy as np






Below, we'll set an output directory location to store our model output and checkpoints. This can be a local directory, in which case you'd set OUTPUT_DIR to the name of the directory you'd like to create. 


Set DO_DELETE to rewrite the OUTPUT_DIR if it exists. Otherwise, Tensorflow will load existing model checkpoints from that directory (if they exist).

## Utils functions 

In [11]:
def create_examples_prediction(df):
    """Collect each row's per-category label values into a new DataFrame.

    For every row of ``df`` the columns named in the global ``label_list_text``
    are read, in that order, and become one row of the result. The returned
    frame is built from plain lists, so it carries default integer row and
    column labels rather than those of ``df``.
    """
    label_rows = [list(record[label_list_text]) for _, record in df.iterrows()]
    return pd.DataFrame(label_rows)

In [12]:
def f(x):
    """Return the category name holding the second-highest value in ``x``.

    ``x`` is one row of per-category scores whose positions line up with the
    global ``label_list_text``. The selected position and label are printed
    before the label is returned.
    """
    rank = 2  # 1 would be the highest score, 2 is the runner-up

    scores = x.values.flatten().tolist()
    ascending_positions = np.argsort(scores)
    # The last `rank` positions are the top scores; the first of them is the runner-up.
    index = ascending_positions[-rank:][0]
    print(f"index is {index}")

    label = label_list_text[index]
    print(f"label is {label}")
    return label
    

In [13]:
# Columns (and their order) of the exported experiment table;
# get_test_experiment_df reindexes its result to exactly this list.
final_columns = ["sOrder","Input.text","is_stressor","is_stressor_conf","top_label","second_label","Branch", "Above SD-THRESHOLD","SD-THRESHOLD","SD","Other","Everyday Decision Making","Work","Social Relationships","Financial Problem","Health, Fatigue, or Physical Pain","Emotional Turmoil","Family Issues","School","avg_severity","median_severity","SD_severity","Votes","Source"]

In [14]:
def get_test_experiment_df(test):
    """Run the model on ``test`` and build the per-example experiment table.

    Args:
        test: DataFrame holding the sentences in column ``DATA_COLUMN``.
            It is modified in place: the predicted top label is written to
            ``LABEL_COLUMN_RAW`` and the index is reset to 0..n-1.

    Returns:
        Tuple ``(test, experiment_df)``. ``experiment_df`` combines the columns
        of ``test`` that the prediction frame does not provide with one
        probability column per category plus ``second_label``, restricted to
        and ordered by ``final_columns``.
    """
    predictions = getListPrediction(in_sentences=list(test[DATA_COLUMN]))
    test_predictions = [prediction['probabilities'] for prediction in predictions]
    test_live_labels = np.array(test_predictions).argmax(axis=1)
    test[LABEL_COLUMN_RAW] = [label_list_text[x] for x in test_live_labels] # appending the labels to the dataframe

    probabilities_df_live = pd.DataFrame(test_predictions) # one row of probabilities per sentence
    probabilities_df_live.columns = list(label_list_text) # naming the columns
    probabilities_df_live['second_label'] = probabilities_df_live.apply(f, axis=1)

    test.reset_index(inplace=True, drop=True) # align row labels with the prediction frame

    # Columns present in both frames are taken from the predictions, so drop them from `test`.
    predicted_columns = set(probabilities_df_live.columns)
    kept_columns = [column for column in test.columns if column not in predicted_columns]

    experiment_df = pd.concat([test[kept_columns], probabilities_df_live], axis=1, ignore_index=False)

    # reindex drops every column outside final_columns and creates the absent
    # ones filled with NaN, so no separate "missing columns" step is needed.
    experiment_df = experiment_df.reindex(columns=final_columns)

    return test, experiment_df

In [15]:
def getListPrediction(in_sentences):
    """Run the global ``estimator`` on raw sentences.

    Args:
        in_sentences: iterable of sentence strings.

    Returns:
        Whatever ``estimator.predict`` returns for these inputs; callers in
        this notebook iterate it and read the ``'probabilities'`` entry of
        each item.
    """
    # 1. Wrap every sentence in an InputExample with an all-zero dummy label vector.
    input_examples = [InputExample(guid="", text_a=x, text_b=None, labels=[0]*len(label_list)) for x in in_sentences]

    # 2. Tokenize and pad to MAX_SEQ_LENGTH.
    input_features = convert_examples_to_features(input_examples, MAX_SEQ_LENGTH, tokenizer)

    # 3. Build the prediction input function.
    predict_input_fn = input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)

    # Debug aid: token ids of the first example. Guarded so that an empty
    # sentence list no longer raises IndexError here.
    if input_features:
        print(input_features[0].input_ids)

    # 4. Predict.
    predictions = estimator.predict(input_fn=predict_input_fn, yield_single_examples=True)

    return predictions

In [16]:
is_normalize_active=False

def get_confusion_matrix(y_test,predicted,labels):
    """Draw the raw and the normalized confusion matrix, then show both figures.

    ``labels`` is forwarded to plot_confusion_matrix as the tick labels.
    """
    np.set_printoptions(precision=2)

    # (normalize flag, figure title) for the two plots, in display order
    variants = (
        (False, 'Confusion matrix, without normalization'),
        (True, 'Normalized confusion matrix'),
    )
    for normalize, title in variants:
        plot_confusion_matrix(y_test, predicted, classes=labels,
                              normalize=normalize, title=title)
    plt.show()
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """Plot the confusion matrix of ``y_pred`` against ``y_true``.

    Args:
        y_true: true labels.
        y_pred: predicted labels.
        classes: tick labels used for both axes.
        normalize: when True, every row is divided by its row total so each
            cell shows the fraction of that true class. Rows whose total is
            zero are shown as all zeros instead of NaN.
        title: figure title; a default depending on ``normalize`` is used
            when empty.
        cmap: matplotlib colormap for the cells.

    Returns:
        The matplotlib Axes the matrix was drawn on.
    """
    if not title:
        title = 'Normalized confusion matrix' if normalize else 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # NOTE(review): the matrix only has rows/columns for labels that occur in
    # y_true or y_pred; `classes` is assumed to have the same length and order
    # as those rows - confirm against the callers.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide only where the row total is non-zero; the remaining cells keep
        # the 0.0 of `out`, which also keeps `thresh` below finite.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    fig, ax = plt.subplots()
    ax.imshow(cm, interpolation='nearest', cmap=cmap)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate every cell; text colour flips on dark cells for readability.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    return ax


# Loading the data

In [17]:
def data_prep_bert(df,test_size):
    """Split ``df`` into a train frame and a test frame.

    The split is stratified on the ``LABEL_COLUMN_RAW`` column and uses a
    fixed ``random_state`` of 2018; ``test_size`` is forwarded unchanged to
    ``train_test_split``. Returns ``(train_frame, test_frame)``.
    """
    print("Splitting dataframe with shape {} into training and test datasets".format(df.shape))

    stratify_labels = df[LABEL_COLUMN_RAW]
    train_frame, test_frame = train_test_split(
        df,
        test_size=test_size,
        random_state=2018,
        stratify=stratify_labels,
    )
    return train_frame, test_frame


In [18]:
def open_dataset(NAME,mapping_index,excluded_categories):
    """Load ``PATH + NAME + '.csv'`` and attach numeric labels.

    Only rows with ``is_stressor == 1`` whose ``LABEL_COLUMN_RAW`` is neither
    'Not Stressful' nor one of ``excluded_categories`` are kept.

    Args:
        NAME: file name without the '.csv' extension, relative to ``PATH``.
        mapping_index: index whose positions define the numeric label of each
            category, or None to derive one from the data with ``factorize``.
        excluded_categories: iterable of category names to drop, or None.

    Returns:
        Tuple ``(df, mapping_index, label_list_number, label_list_final)``:
        the filtered frame with the added ``LABEL_COLUMN`` and ``class_freq``
        columns, the mapping index that was used, the numeric labels, and the
        category names ordered by numeric label.
    """
    df = pd.read_csv(PATH+NAME+'.csv',sep =',')

    keep = (df['is_stressor'] == 1) & (df[LABEL_COLUMN_RAW] != 'Not Stressful')
    if excluded_categories is not None:
        for category in excluded_categories:
            keep &= df[LABEL_COLUMN_RAW] != category

    # Copy so the column assignments below write to an independent frame
    # rather than to a slice of the raw CSV frame (avoids SettingWithCopyWarning).
    df = df[keep].copy()

    if mapping_index is None:
        df[LABEL_COLUMN_RAW] = df[LABEL_COLUMN_RAW].astype('category')
        df[LABEL_COLUMN], mapping_index = pd.Series(df[LABEL_COLUMN_RAW]).factorize() #uses pandas factorize() to convert to numerical index
    else:
        df[LABEL_COLUMN] = df[LABEL_COLUMN_RAW].apply(lambda x: mapping_index.get_loc(x))

    category_count = len(mapping_index.categories)
    label_list_final = [None] * category_count
    label_list_number = [None] * category_count

    # Place every category at the position it occupies in mapping_index.
    for ele in mapping_index.categories:
        lindex = mapping_index.get_loc(ele)
        label_list_number[lindex] = lindex
        label_list_final[lindex] = ele

    # Number of kept rows sharing each row's raw label.
    frequency_dict = df[LABEL_COLUMN_RAW].value_counts().to_dict()
    df["class_freq"] = df[LABEL_COLUMN_RAW].apply(lambda x: frequency_dict[x])

    return df,mapping_index,label_list_number,label_list_final
    

# Require user changes > Start Here 

### Experiment Name

In [19]:
PATH = './datasets/'
TODAY_DATE = "27_07_2020/"
EXPERIMENT_NAME = 'newly_labelled'
EXPERIMENTS_PATH = PATH + 'experiments/'+TODAY_DATE+EXPERIMENT_NAME
# makedirs creates every missing parent directory (including PATH/experiments/)
# and exist_ok=True makes re-running this cell a no-op.
os.makedirs(EXPERIMENTS_PATH, exist_ok=True)

### Model Hyperparameters

In [20]:
# Compute train and warmup steps from batch size
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Warmup is a period of time where hte learning rate 
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 1000
SAVE_SUMMARY_STEPS = 100

# We'll set sequences to be at most 32 tokens long.
MAX_SEQ_LENGTH = 32


OUTPUT_DIR = './models/'+ TODAY_DATE+EXPERIMENT_NAME+'/' #_01_04_2020/

##use downloaded model, change path accordingly
BERT_VOCAB= './bert_model/uncased_L-12_H-768_A-12/vocab.txt'
BERT_INIT_CHKPNT = './bert_model/uncased_L-12_H-768_A-12/bert_model.ckpt'
BERT_CONFIG = './bert_model/uncased_L-12_H-768_A-12/bert_config.json'


In [26]:
# CSV file (without extension) read from PATH by open_dataset.
DATASET_NAME = '2020-06-20-MainTurkAggregation-5-Turkers_v0'

DATA_COLUMN = 'Input.text'  # column holding the sentence text
LABEL_COLUMN_RAW = 'top_label'  # column holding the category name (previously 'Answer.Label')
LABEL_COLUMN = 'label_numeric'  # column created by open_dataset with the numeric label

# Values of the 'Source' column used to select rows below.
MTURK_NAME = 'mTurk_synthetic'
LIVE_NAME = 'popbots_live'
INQUIRE_NAME = 'Inquire'
MTURK_COVID_NAME = 'mTurk_synthetic_covid'

LABEL_HOT_VECTOR = 'label_conf'



#dataset,mapping_index,label_list, label_list_text = open_dataset('mturk900balanced',None)

EXCLUDED_CATEGORIES = None # e.g. ['Other']; use None when nothing is excluded, otherwise this MUST be a list
# Only the key order of mapping_dict is used below: the position of a category
# in the resulting index becomes its numeric label. 'Not Stressful' is left out.
mapping_dict = {'Other': 0, 'Everyday Decision Making': 1, 'Work': 2, 'Social Relationships': 3, 'Financial Problem': 4, 'Emotional Turmoil': 5, 'Health, Fatigue, or Physical Pain': 6, 'School': 7, 'Family Issues': 8}#,'Not Stressful':9}
mapping_index = pd.CategoricalIndex([key for key,value in mapping_dict.items()])

dataset,mapping_index,label_list, label_list_text = open_dataset(DATASET_NAME,mapping_index,EXCLUDED_CATEGORIES)

# open_dataset filters on is_stressor == 1 as well; this repeats that filter.
dataset = dataset[dataset['is_stressor'] == 1]
#dataset = dataset[dataset['is_covid'] == 0]

# True: part of the live data goes into training and part of the mturk data into testing.
# False: train on the non-live sources and test on the live data only.
test_on_mturk_and_popbots_live = False


if test_on_mturk_and_popbots_live:

    mturk = dataset[(dataset['Source']== MTURK_NAME) | (dataset['Source']== MTURK_COVID_NAME)]
    live = dataset[dataset['Source']== LIVE_NAME]
    live = live.sample(frac=1, random_state=2018).reset_index(drop=True) # shuffle live (seeded so the split is reproducible)
    # NOTE(review): a value above 100 sends every live row to the test set and
    # none to training; the "1/2" wording of the original comments suggests 50
    # may have been intended - confirm.
    PERCENTAGE_LIVE_TEST = 300

    # Fraction of mturk used for testing, chosen so the mturk test rows match
    # PERCENTAGE_LIVE_TEST percent of the live row count.
    TEST_PERCENTAGE = len(live)/((100/PERCENTAGE_LIVE_TEST)*len(mturk))

    print(f"Test percentage is {TEST_PERCENTAGE}")

    train,test = data_prep_bert(mturk,TEST_PERCENTAGE) # test size from mturk

    # Position-based split of the shuffled live rows. iloc slices are
    # half-open, so no live row lands in both train and test (the previous
    # label-based .loc slices included the boundary row on both sides).
    # max(0, ...) keeps percentages above 100 from turning into a negative offset.
    live_train_count = max(0, int((1-(PERCENTAGE_LIVE_TEST/100))*len(live)))
    train = pd.concat([train, live.iloc[:live_train_count]])
    #train = pd.concat([train, dataset[dataset['Source'] == INQUIRE_NAME]])
    test = pd.concat([test, live.iloc[live_train_count:]])
else:
    train_sources = ['Inquire','InquireComposited','mTurk_synthetic','mTurk_synthetic_covid','mTurk_synthetic_covid_2']
    # or taking live only for testing
    train = dataset[dataset['Source'].isin(train_sources)]
    test = dataset[dataset['Source']== LIVE_NAME]

train = train[train['is_stressor'] == 1] # keep only stressor rows in train

#print(f"Dataset has {len(dataset)} training examples")
print(f"Normal label list is {label_list}")
print(f"The labels text is {label_list_text}")

#Export train test to csv
#train.to_csv(PATH+'900_CSV_SPLITTED/train.csv')
#test.to_csv(PATH+'900_CSV_SPLITTED/test.csv')

Normal label list is [0, 1, 2, 3, 4, 5, 6, 7, 8]
The labels text is ['Other', 'Everyday Decision Making', 'Work', 'Social Relationships', 'Financial Problem', 'Emotional Turmoil', 'Health, Fatigue, or Physical Pain', 'School', 'Family Issues']


### Train set and test set analysis

In [27]:
def print_dataset_info(train,test):
    """Print sizes, per-source counts and label-by-source tables of both splits."""

    def count_from(frame, source_name):
        # number of rows of `frame` whose 'Source' equals `source_name`
        return len(frame[frame['Source'] == source_name])

    def label_source_counts(frame):
        # row counts per (label, source) pair
        return pd.pivot_table(frame[[LABEL_COLUMN_RAW, 'Source']],index=[LABEL_COLUMN_RAW, 'Source'],columns=None, aggfunc=len)

    print(f"Train size {len(train)} with {count_from(train, LIVE_NAME)} from Popbots and {count_from(train, MTURK_NAME)} from mturk")
    print(f"Test size {len(test)} with {count_from(test, LIVE_NAME)} from Popbots and {count_from(test, MTURK_NAME)} from mturk")

    print('\nTraining distribution:')
    print(label_source_counts(train))

    print('\nTesting distribution:')
    print(label_source_counts(test))

In [28]:
# Shuffle both splits; the fixed seed makes the row order reproducible across runs.
train = train.sample(frac=1, random_state=2018).reset_index(drop=True)
test = test.sample(frac=1, random_state=2018).reset_index(drop=True)

In [29]:
# Row counts per (label, source) pair over the whole filtered dataset.
print('\nAll dataset distribution:')
print(pd.pivot_table(dataset[[LABEL_COLUMN_RAW, 'Source']],index=[LABEL_COLUMN_RAW, 'Source'],columns=None, aggfunc=len))


All dataset distribution:
top_label                          Source                 
Emotional Turmoil                  Inquire                    380
                                   InquireComposited          106
                                   mTurk_synthetic             68
                                   mTurk_synthetic_covid       91
                                   mTurk_synthetic_covid_2     20
                                   popbots_live                 6
Everyday Decision Making           Inquire                    132
                                   InquireComposited           60
                                   mTurk_synthetic             90
                                   mTurk_synthetic_covid       36
                                   mTurk_synthetic_covid_2      6
                                   popbots_live                14
Family Issues                      Inquire                    407
                                   InquireComposited    

In [30]:
print_dataset_info(train,test)

Train size 6378 with 0 from Popbots and 1922 from mturk
Test size 141 with 141 from Popbots and 0 from mturk

Training distribution:
top_label                          Source                 
Emotional Turmoil                  Inquire                    380
                                   InquireComposited          106
                                   mTurk_synthetic             68
                                   mTurk_synthetic_covid       91
                                   mTurk_synthetic_covid_2     20
Everyday Decision Making           Inquire                    132
                                   InquireComposited           60
                                   mTurk_synthetic             90
                                   mTurk_synthetic_covid       36
                                   mTurk_synthetic_covid_2      6
Family Issues                      Inquire                    407
                                   InquireComposited           28
                

### Step to reduce the most dominant categories and balance the dataset

sampling_cutoff = 108 # categories with at most this many training examples are not sampled down
sampling_number = 107 # rows drawn per numeric label from the rows selected for down-sampling

# Number of training rows sharing each row's raw label.
frequency_dict = train[LABEL_COLUMN_RAW].value_counts().to_dict()
train["class_freq"] = train[LABEL_COLUMN_RAW].apply(lambda x: frequency_dict[x])

# Down-sample only non-live rows whose category is more frequent than the cutoff.
sampling_boolean = (train['Source'] != LIVE_NAME) & (train['class_freq'].astype(float) > sampling_cutoff) 
train_to_be_balanced = train[sampling_boolean]
train_not_resampled = train[~sampling_boolean]
# Draw sampling_number rows per label, then shuffle the result.
train_temp = train_to_be_balanced.groupby(LABEL_COLUMN).apply(lambda x: x.sample(sampling_number)).reset_index(drop=True).sample(frac=1).reset_index(drop=True)
train = pd.concat([train_temp,train_not_resampled])

In [25]:
sampling_cutoff = 250 # categories with at most this many examples are never sampled down
total_training_size = 1501 # size of the training set after down-sampling

REVERSE_FREQ = 'Max_reverse_sampling_chance'
# Sampling weight = (largest class frequency) / (own class frequency), so rows
# of rarer categories are more likely to be drawn. The maximum is computed once
# instead of once per row.
max_class_freq = train['class_freq'].max()
train[REVERSE_FREQ] = train['class_freq'].apply(lambda x: (max_class_freq/x))

# Down-sample only non-live rows whose category is more frequent than the cutoff.
sampling_boolean = (train['Source'] != LIVE_NAME) & (train['class_freq'].astype(float) > sampling_cutoff)


train_to_be_balanced = train[sampling_boolean]
train_not_resampled = train[~sampling_boolean]

# Draw just enough weighted rows to reach total_training_size together with the untouched rows.
train_temp = train_to_be_balanced.sample(n=(total_training_size-len(train_not_resampled)), weights=REVERSE_FREQ, random_state=2020)
train = pd.concat([train_temp,train_not_resampled])



In [150]:
print_dataset_info(train,test)

Train size 1501 with 127 from Popbots and 758 from mturk
Test size 30 with 15 from Popbots and 14 from mturk

Training distribution:
top_label                          Source               
Emotional Turmoil                  Inquire                   51
                                   mTurk_synthetic           67
                                   mTurk_synthetic_covid     91
                                   popbots_live               4
Everyday Decision Making           Inquire                    7
                                   mTurk_synthetic           89
                                   mTurk_synthetic_covid     36
                                   popbots_live              12
Family Issues                      Inquire                    7
                                   mTurk_synthetic           79
                                   mTurk_synthetic_covid     32
                                   popbots_live              10
Financial Problem                  mTurk_s

In [105]:
mapping_index

CategoricalIndex(['Other', 'Everyday Decision Making', 'Work',
                  'Social Relationships', 'Financial Problem',
                  'Emotional Turmoil', 'Health, Fatigue, or Physical Pain',
                  'School', 'Family Issues'],
                 categories=['Emotional Turmoil', 'Everyday Decision Making', 'Family Issues', 'Financial Problem', 'Health, Fatigue, or Physical Pain', 'Other', 'School', 'Social Relationships', ...], ordered=False, dtype='category')

In [23]:
# Reshuffle the (re-balanced) training rows; seeded so the order is reproducible.
train = train.sample(frac=1, random_state=2018).reset_index(drop=True)

In [194]:
# index=False: the row index is not data; writing it adds an extra
# "Unnamed: 0" column every time these files are read back and saved again.
train.to_csv(EXPERIMENTS_PATH+'/TRAIN_'+DATASET_NAME+'.csv', index=False)
test.to_csv(EXPERIMENTS_PATH+'/TEST_'+DATASET_NAME+'.csv', index=False)

In [22]:
# Replace train/test with splits saved by an earlier experiment.
# NOTE(review): these paths point at the 18_06_2020/test_covid experiment and
# the 2020-06-08 dataset, not at EXPERIMENTS_PATH / DATASET_NAME defined in
# this notebook - confirm this override is intended before running the cell.
train = pd.read_csv('./datasets/experiments/18_06_2020/test_covid/TRAIN_2020-06-08-MainTurkAggregation-5-Turkers_v0.csv')
test = pd.read_csv('./datasets/experiments/18_06_2020/test_covid/TEST_2020-06-08-MainTurkAggregation-5-Turkers_v0.csv')

In [51]:
train

Unnamed: 0.1,Unnamed: 0,sID,Input.text,is_stressor,is_stressor_conf,is_covid,is_covid_conf,top_label,second_label,avg_severity,median_severity,SD_severity,Votes,Work,"Health, Fatigue, or Physical Pain",Other,Emotional Turmoil,Everyday Decision Making,Family Issues,Social Relationships,Financial Problem,School,Source,class_freq
0,2787,2268,o have a lot on my plate right now with work and school,1,100.00%,0,100.00%,Work,School,3.0,3,0.63,9,0.4,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.4,mTurk_synthetic,3195
1,2549,928,i have been stressed about my financial situation because we currently have so much debt,1,100.00%,0,100.00%,Financial Problem,Emotional Turmoil,6.6,8,3.07,6,0.0,0.0,0.0,0.3,0.0,0.0,0.0,0.7,0.0,mTurk_synthetic,3195
2,1996,2440,"Seriously, What's NOT stressing me out? #covid19#homeschooling#notreadyforthis",1,100.00%,1,100.00%,School,Emotional Turmoil,5.6,5,1.62,9,0.0,0.2,0.0,0.3,0.1,0.0,0.1,0.0,0.3,mTurk_synthetic_covid,676
3,1608,2042,my mother in law is annoying me so much,1,100.00%,0,100.00%,Family Issues,Emotional Turmoil,3.6,4,0.49,6,0.0,0.0,0.0,0.2,0.0,0.7,0.1,0.0,0.0,mTurk_synthetic,3195
4,1106,128,boss has been frustrating me!,1,100.00%,0,100.00%,Work,,4.6,4,3.20,5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,mTurk_synthetic,3195
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2879,1625,1484,I've been really stressed about the covid-19 issues that are going on,1,100.00%,1,100.00%,Emotional Turmoil,Other,4.8,4,1.47,8,0.2,0.1,0.2,0.4,0.1,0.0,0.0,0.0,0.0,mTurk_synthetic_covid,676
2880,1057,447,"Hey, I am in the middle of final exams. Just found out if I fail this exam, I fail all.",1,100.00%,0,60.00%,School,Social Relationships,6.4,6,2.06,6,0.2,0.0,0.0,0.0,0.0,0.0,0.2,0.1,0.5,mTurk_synthetic_covid,3195
2881,135,1755,my appointment with the dental surgeon has been stressing me out recently.,1,100.00%,0,100.00%,"Health, Fatigue, or Physical Pain",Other,3.4,3,1.02,5,0.0,0.8,0.2,0.0,0.0,0.0,0.0,0.0,0.0,mTurk_synthetic,3195
2882,413,2655,the lack of leadership at work.,1,100.00%,0,100.00%,Work,Social Relationships,5.0,5,2.45,7,0.6,0.0,0.0,0.2,0.0,0.0,0.2,0.0,0.0,mTurk_synthetic,3195


# Require user changes > STOP Here 

# Data Preprocessing

In [31]:
class InputExample:
    """One raw example for sequence classification, before tokenization."""

    def __init__(self, guid, text_a, text_b=None, labels=None):
        """Store the fields of the example unchanged.

        Args:
            guid: Identifier of the example.
            text_a: Untokenized text of the first sequence; the only text
                needed for single-sequence tasks.
            text_b: (Optional) untokenized text of the second sequence, for
                sequence-pair tasks.
            labels: (Optional) label values of the example, one per category.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.labels = text_b, labels


class InputFeatures:
    """Model-ready features of one example."""

    def __init__(self, input_ids, input_mask, segment_ids, label_ids, is_real_example=True):
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids = segment_ids
        # The label vector is stored wrapped in a 1-tuple, under the singular
        # attribute name `label_id`.
        # NOTE(review): the wrapping looks like the result of a stray trailing
        # comma; it is kept because code outside this view may depend on the
        # extra nesting - confirm before unwrapping.
        self.label_id = (label_ids,)
        self.is_real_example = is_real_example



In [32]:
def create_examples(df, labels_available=True):
    """Build one InputExample per row of ``df``.

    Args:
        df: DataFrame with the sentence text in column ``DATA_COLUMN`` and,
            when ``labels_available`` is True, one column per name in the
            global ``label_list_text``.
        labels_available: when False, every example gets an all-zero label
            vector with one entry per category.

    Returns:
        List of InputExample objects in row order; ``guid`` is always None.
    """
    examples = []
    for _, row in df.iterrows():
        text_a = row[DATA_COLUMN]
        if labels_available:
            labels = list(row[label_list_text])
        else:
            # One zero per category, so the dummy vector always has the same
            # length as real label vectors (it was hard-coded to six entries).
            labels = [0] * len(label_list_text)
        examples.append(
            InputExample(guid=None, text_a=text_a, labels=labels))
    return examples

In [33]:
def convert_examples_to_features(examples,  max_seq_length, tokenizer):
    """Convert InputExample objects into padded InputFeatures.

    Args:
        examples: iterable of objects with ``text_a``, ``text_b``, ``labels``
            and ``guid`` attributes.
        max_seq_length: length every id/mask/segment list is padded to; longer
            inputs are truncated to leave room for the special tokens.
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.

    Returns:
        List with one InputFeatures per example, in input order. Each label
        is converted with ``float``.
    """
    # The per-example log lines are only built when INFO records would
    # actually be emitted by the root logger.
    log_examples = logging.getLogger().isEnabledFor(logging.INFO)

    features = []
    for example in examples:
        # NOTE(review): prints every sentence that is converted; looks like a
        # leftover debugging aid - consider removing.
        print(example.text_a)
        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. For classification tasks, the first
        # vector (corresponding to [CLS]) is used as the "sentence vector".
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        labels_ids = [float(label) for label in example.labels]

        if log_examples:
            logging.info("*** Example ***")
            logging.info("guid: %s" % (example.guid))
            logging.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logging.info(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logging.info("label: %s (id = %s)" % (example.labels, labels_ids))

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_ids=labels_ids))
    return features

In [34]:
class PaddingInputExample(object):
    """Fake example so the number of input examples is a multiple of the batch size.

    When running eval/predict on the TPU, we need to pad the number of examples
    to be a multiple of the batch size, because the TPU requires a fixed batch
    size. The alternative is to drop the last batch, which is bad because it means
    the entire output data won't be generated.
    We use this class instead of `None` because treating `None` as padding
    batches could cause silent errors.
    """
    
    
def convert_single_example(ex_index, example, max_seq_length,
                           tokenizer):
    """Turn one `InputExample` into one fixed-length `InputFeatures`.

    Args:
        ex_index: Position of the example in its collection (not used here).
        example: Object exposing `text_a`, `text_b` and `labels`, or a
            `PaddingInputExample` placeholder.
        max_seq_length: Exact length of every returned id/mask/segment list.
        tokenizer: Object providing `tokenize(text)` and
            `convert_tokens_to_ids(tokens)`.

    Returns:
        An `InputFeatures` whose `input_ids`, `input_mask` and `segment_ids`
        all have `max_seq_length` entries. Placeholders yield all-zero
        features with `label_ids=0` and `is_real_example=False`.
    """
    if isinstance(example, PaddingInputExample):
        return InputFeatures(
            input_ids=[0] * max_seq_length,
            input_mask=[0] * max_seq_length,
            segment_ids=[0] * max_seq_length,
            label_ids=0,
            is_real_example=False)

    first_tokens = tokenizer.tokenize(example.text_a)
    second_tokens = tokenizer.tokenize(example.text_b) if example.text_b else None

    if second_tokens:
        # Reserve three slots for [CLS], [SEP], [SEP]; both lists are trimmed
        # in place.
        _truncate_seq_pair(first_tokens, second_tokens, max_seq_length - 3)
    else:
        # Reserve two slots for [CLS] and [SEP].
        budget = max_seq_length - 2
        if len(first_tokens) > budget:
            first_tokens = first_tokens[0:budget]

    # BERT layout:
    #   pair:   [CLS] a ... [SEP] b ... [SEP]   segments 0 ... 0 1 ... 1
    #   single: [CLS] a ... [SEP]               segments 0 ... 0
    # The [CLS] position is what the classifier head consumes as the
    # "sentence vector".
    tokens = ["[CLS]"] + list(first_tokens) + ["[SEP]"]
    segment_ids = [0] * len(tokens)

    if second_tokens:
        tokens += list(second_tokens) + ["[SEP]"]
        segment_ids += [1] * (len(second_tokens) + 1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # 1 marks a real token, 0 marks padding; only real tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Right-pad all three lists with zeros up to the requested length.
    missing = max_seq_length - len(input_ids)
    if missing > 0:
        input_ids.extend([0] * missing)
        input_mask.extend([0] * missing)
        segment_ids.extend([0] * missing)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    labels_ids = [int(label) for label in example.labels]

    return InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=labels_ids,
        is_real_example=True)


def file_based_convert_examples_to_features(
        examples, max_seq_length, tokenizer, output_file):
    """Convert a set of `InputExample`s to a TFRecord file.

    Each example is converted with `convert_single_example` and written as
    one serialized `tf.train.Example` holding the int64 features
    `input_ids`, `input_mask`, `segment_ids`, `is_real_example` and
    `label_ids`.

    Args:
        examples: Iterable of examples accepted by `convert_single_example`.
        max_seq_length: Sequence length passed through to the converter.
        tokenizer: Tokenizer passed through to the converter.
        output_file: Path of the TFRecord file to write.

    The writer is always closed, even if converting or serializing an
    example raises, so no file handle is leaked.
    """

    def create_int_feature(values):
        """Wrap an iterable of ints in an int64 `tf.train.Feature`."""
        return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

    writer = tf.python_io.TFRecordWriter(output_file)
    try:
        for (ex_index, example) in enumerate(examples):
            feature = convert_single_example(ex_index, example,
                                             max_seq_length, tokenizer)

            features = collections.OrderedDict()
            features["input_ids"] = create_int_feature(feature.input_ids)
            features["input_mask"] = create_int_feature(feature.input_mask)
            features["segment_ids"] = create_int_feature(feature.segment_ids)
            features["is_real_example"] = create_int_feature(
                [int(feature.is_real_example)])
            if isinstance(feature.label_ids, list):
                label_ids = feature.label_ids
            else:
                # Anything that is not a list is unwrapped to its first element.
                # NOTE(review): presumably InputFeatures may hold the label list
                # wrapped in another container (e.g. a 1-tuple) -- confirm
                # against the InputFeatures definition.
                label_ids = feature.label_ids[0]
            features["label_ids"] = create_int_feature(label_ids)

            tf_example = tf.train.Example(features=tf.train.Features(feature=features))
            writer.write(tf_example.SerializeToString())
    finally:
        writer.close()


def file_based_input_fn_builder(input_file, seq_length, is_training,
                                drop_remainder, num_labels=6):
    """Creates an `input_fn` closure that reads features from a TFRecord file.

    Args:
        input_file: Path (or paths) handed to `tf.data.TFRecordDataset`.
        seq_length: Fixed length of `input_ids`, `input_mask` and
            `segment_ids` in every record.
        is_training: When true the dataset is repeated and shuffled.
        drop_remainder: Forwarded to `map_and_batch`; drops a final partial
            batch when true.
        num_labels: Number of entries in each record's `label_ids` vector.
            Defaults to 6, the value that used to be hard-coded; pass the
            real label count (e.g. `len(label_list)`) when it differs,
            otherwise parsing the records fails on the shape mismatch.

    Returns:
        A function `input_fn(params)` that reads `params["batch_size"]` and
        returns the batched dataset.
    """

    name_to_features = {
        "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
        "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "label_ids": tf.FixedLenFeature([num_labels], tf.int64),
        "is_real_example": tf.FixedLenFeature([], tf.int64),
    }

    def _decode_record(record, name_to_features):
        """Decodes a record to a TensorFlow example."""
        example = tf.parse_single_example(record, name_to_features)

        # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
        # So cast all int64 to int32.
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.to_int32(t)
            example[name] = t

        return example

    def input_fn(params):
        """The actual input function."""
        batch_size = params["batch_size"]

        # For training, we want a lot of parallel reading and shuffling.
        # For eval, we want no shuffling and parallel reading doesn't matter.
        d = tf.data.TFRecordDataset(input_file)
        if is_training:
            d = d.repeat()
            d = d.shuffle(buffer_size=100)

        d = d.apply(
            tf.contrib.data.map_and_batch(
                lambda record: _decode_record(record, name_to_features),
                batch_size=batch_size,
                drop_remainder=drop_remainder))

        return d

    return input_fn


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

For us, our input data is the 'sentence' column and our label is the 'polarity' column 

# Data Preprocessing
We'll need to transform our data into a format BERT understands. This involves two steps. First, we create  `InputExample`'s using the constructor provided in the BERT library.

- `text_a` is the text we want to classify, which in this case, is the `Request` field in our Dataframe. 
- `text_b` is used if we're training a model to understand the relationship between sentences (i.e. is `text_b` a translation of `text_a`? Is `text_b` an answer to the question asked by `text_a`?). This doesn't apply to our task, so we can leave `text_b` blank.
- `label` is the label for our example, i.e. True, False

In [28]:
# The string below is the original, now-disabled approach (building examples with BERT's run_classifier.InputExample); it is never executed.
"""
train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

#test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(guid=None, 
                                                                   text_a = x[DATA_COLUMN], 
                                                                text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)
                                                                   
"""

# Build the training examples from the `train` frame with the notebook's own
# `create_examples` helper.
train_InputExamples = create_examples(train)


Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library):


1. Lowercase our text (if we're using a BERT lowercase model)
2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
4. Map our words to indexes using a vocab file that BERT provides
5. Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert))
6. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))

Happily, we don't have to worry about most of these details.




To start, we'll need to load a vocabulary file and lowercasing information directly from the BERT tf hub module:

In [29]:
# Check that lowercasing (True) is consistent with the checkpoint at
# BERT_INIT_CHKPNT, then build a lowercasing tokenizer from the BERT vocab file.
tokenization.validate_case_matches_checkpoint(True,BERT_INIT_CHKPNT)
tokenizer = tokenization.FullTokenizer(
      vocab_file=BERT_VOCAB, do_lower_case=True)





Great--we just checked that the BERT checkpoint we're using expects lowercase data (which is why we pass `do_lower_case=True`) and we also loaded BERT's vocab file. We also created a tokenizer, which breaks words into word pieces:

In [30]:
# Sanity check: show how the tokenizer lowercases the text and splits it into word pieces.
tokenizer.tokenize("This here's an example of using the BERT tokenizer")

['this',
 'here',
 "'",
 's',
 'an',
 'example',
 'of',
 'using',
 'the',
 'bert',
 'token',
 '##izer']

Using our tokenizer, we'll call our own `convert_examples_to_features` function (used here in place of `run_classifier.convert_examples_to_features`) on our InputExamples to convert them into features BERT understands.

In [31]:
# Convert the training InputExamples into InputFeatures with the notebook's own
# `convert_examples_to_features`; the test features are built later, in the
# evaluation section.
# The two commented-out calls are the original bert.run_classifier versions.
#train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
#test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

train_features = convert_examples_to_features(examples=train_InputExamples,  max_seq_length=MAX_SEQ_LENGTH, tokenizer=tokenizer)

i moved in with my boyfriend too quickly and i realize i don't really like him now.
not being creative at all.
i have been dating two guys that i really like and have to make a choice between them.
I have been feeling stressed about the COVID-19 situation happening worldwide.
i cannot get anyone at the main office at work to answer my questions.
work's been really hard. im not making enough
my car had recently started running hot and i don't know what to do.
Work has been pretty crazy lately
The corona virus and quarantine
Our family is quarantined due to the COVID-19.
schoolwork has been stressing me out. the final is next week!
Several layoffs at work
i have a really big test coming up next week.
Worried about not being able to find a job after college
losing pay at my job
my partner smokes cigarettes and can't quit. i feel sad for him.
dealing with my lawyer
i have been very stressed out by all the bad drivers near my house.
Covid 19
i'm tired of stephanie telling me she is going to

I cant stand not having work to do
oh just some stuff
money problem is big problem
i am trying to get full custody of my child and am undergoing a custody evaluation.
the amount of work i have to do in nursing school and hospital clinicals
i'm really stressed about school and all the work that i have to do this week.
my life is falling apart.
the coronavirus
the corono virus - wanting to leave the house but don't want to die
ugh let me tell you
That is so crazy
not having enough money to save, eat, and necessities.
my dad injured his hand about a month, 6 weeks ago
works cutting back everyones hours its driving me crazy this time of year i need $$
trying to divorce as soon as possible
this house remodel. it's a lot to take on alone
Well you know my father who has dementia has entered a new stage with this disease.
it really feels like my family is a mess sometimes
hope to pay my everday bills
I've been experiencing a lot of migraines lately.
i'm worried about applying for residencies
I

i've been very stressed about work and things with my partner are not going well.
i have a bunch of patients that suck right now and because of it i have a bad attitude.
my sister-in-law has been diagnosed with ovarian cancer
my income is not proper and its difficult for me to run a normal life with specific need
i am concerned about work slowing down and losing hours
i need to have some car repairs done, but, have no time to make an appointment.
Fear about the virus and my job.
the thing that stressed me out the most has been my follow up doctor's appointment
tryiong to figure out how to make more money to support my family
I must write a 5 page paper and it due Monday 6 of April, which I need help on it.
I would reply to the friend and ask them to tell me about the situation as I listened.
my co-worker is on vacation and i have to do her job this week.
not haveing enought too pay every day bills only 905 rent is 860 a month i pay ultuties
we have been struggling paying our bills so m

hi dr..i just so stressful due to my project work.because the dead line is over .
i just can't seem to make enough money for this family.
my family in the holidays. absolutely excruciating time of year full of drama.
money, as always.  always worried about money.
i have been stressing out about the upcoming hospitalization.
i am stressed about getting my application decisions
taking care of three kids, has been very stressful.
I have been worried about when things will be normal again.  I don't know when I'll have
Been stressed at work.
i would tell her
i have been conducting a job search
Coronavirus has been stressing me out due to being in furlough
not enough money to pay the rent
wow. thanks for asking. i have been stressing out about my student loan.
I have been stressing out about trying to get my son to do his homework.
a lot to do at work right now
I would use the letter listed on the bottom and type out the information to them.
I don't know how I'm going to get all my work done

# Creating a model

Now that we've prepared our data, let's focus on building a model. `create_model` does just this below. First, it builds the BERT computation graph with `modeling.BertModel`. Next, it creates a single new layer that will be trained to adapt BERT to our classification task. This strategy of using a mostly trained model is called [fine-tuning](http://wiki.fast.ai/index.php/Fine_tuning).

To understand the `pooled output` vs `sequence output`, refer to https://www.kaggle.com/questions-and-answers/86510

In [32]:
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
    """Builds BERT with a sigmoid (multi-label) classification head.

    Args:
        bert_config: Configuration object passed to `modeling.BertModel`.
        is_training: Enables training behaviour in BERT and dropout on the
            pooled output.
        input_ids: Token id tensor fed to BERT.
        input_mask: Attention mask tensor fed to BERT.
        segment_ids: Token type id tensor fed to BERT.
        labels: Multi-hot label tensor; cast to float32 before the loss.
        num_labels: Number of independent output units.
        use_one_hot_embeddings: Forwarded to `modeling.BertModel`.

    Returns:
        Tuple `(loss, per_example_loss, logits, probabilities)` where `loss`
        is the mean of the element-wise sigmoid cross-entropy and
        `probabilities` is `sigmoid(logits)`.
    """
    bert = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # Classify on the whole segment via the pooled output; the token-level
    # alternative would be bert.get_sequence_output().
    pooled = bert.get_pooled_output()
    hidden_size = pooled.shape[-1].value

    head_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    head_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # keep_prob=0.9 means 10% dropout.
            pooled = tf.nn.dropout(pooled, keep_prob=0.9)

        logits = tf.nn.bias_add(
            tf.matmul(pooled, head_weights, transpose_b=True),
            head_bias)

        # Sigmoid rather than softmax: every label is scored independently
        # (multi-label case).
        probabilities = tf.nn.sigmoid(logits)

        float_labels = tf.cast(labels, tf.float32)
        tf.logging.info("num_labels:{};logits:{};labels:{}".format(num_labels, logits, float_labels))
        per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=float_labels, logits=logits)
        loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits, probabilities)


Next we'll wrap our model function in a `model_fn_builder` function that adapts our model to work for training, evaluation, and prediction.

In [33]:

def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, use_tpu,
                     use_one_hot_embeddings):
    """Returns a `model_fn` closure that produces `tf.estimator.EstimatorSpec`s.

    Args:
        bert_config: BERT configuration forwarded to `create_model`.
        num_labels: Number of output labels; also the number of per-label
            AUC metrics reported in EVAL mode.
        init_checkpoint: Optional checkpoint path used to initialise the
            trainable variables; skipped when falsy.
        learning_rate: Forwarded to `optimization.create_optimizer`.
        num_train_steps: Forwarded to `optimization.create_optimizer`.
        num_warmup_steps: Forwarded to `optimization.create_optimizer`.
        use_tpu: Selects deferred (scaffold-based) checkpoint initialisation
            and is forwarded to `optimization.create_optimizer`.
        use_one_hot_embeddings: Forwarded to `create_model`.

    Returns:
        A function `model_fn(features, labels, mode, params)`.
    """

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """Builds the graph for one mode and returns its `EstimatorSpec`.

        The labels are read from `features["label_ids"]`; the `labels` and
        `params` arguments are not used.
        """

        #tf.logging.info("*** Features ***")
        #for name in sorted(features.keys()):
        #    tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]
        # NOTE(review): is_real_example is computed and handed to metric_fn,
        # but metric_fn never reads it, so padding examples are not masked out.
        is_real_example = None
        if "is_real_example" in features:
             is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
        else:
             is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        (total_loss, per_example_loss, logits, probabilities) = create_model(
            bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
            num_labels, use_one_hot_embeddings)

        tvars = tf.trainable_variables()
        initialized_variable_names = {}
        scaffold_fn = None
        if init_checkpoint:
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            if use_tpu:

                # On TPU the checkpoint load is deferred into a scaffold factory.
                def tpu_scaffold():
                    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        tf.logging.info("**** Trainable Variables ****")
        # The per-variable log line is commented out, so this loop currently
        # only computes init_string and has no visible effect.
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            #tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,init_string)

        # NOTE(review): every spec below passes scaffold=scaffold_fn. That is
        # None when use_tpu is False; when use_tpu is True it is a function,
        # while tf.estimator.EstimatorSpec presumably expects a
        # tf.train.Scaffold instance -- confirm before enabling use_tpu.
        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:

            train_op = optimization.create_optimizer(
                total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)

            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                scaffold=scaffold_fn)
        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(per_example_loss, label_ids, probabilities, is_real_example):
                """Builds the eval metrics: one AUC per label plus two mean losses."""

                # Despite the name, logits_split holds the sigmoid
                # probabilities, split into one slice per label.
                logits_split = tf.split(probabilities, num_labels, axis=-1)
                label_ids_split = tf.split(label_ids, num_labels, axis=-1)
                
                normalize_logits = tf.nn.l2_normalize(logits_split,0)        
                normalize_labels = tf.nn.l2_normalize(label_ids_split,0)
                
                cos_similarity_loss=tf.losses.cosine_distance(normalize_logits, normalize_labels, dim=0)
                
                # metrics change to auc of every class
                # Keys are the label indices as strings ("0", "1", ...).
                eval_dict = {}
                for j, logits in enumerate(logits_split):
                    label_id_ = tf.cast(label_ids_split[j], dtype=tf.int32)
                    current_auc, update_op_auc = tf.metrics.auc(label_id_, logits)
                    eval_dict[str(j)] = (current_auc, update_op_auc)
                eval_dict['eval_loss'] = tf.metrics.mean(values=per_example_loss)
                eval_dict['cosine_loss'] = tf.metrics.mean(values=cos_similarity_loss)
                return eval_dict

                ## original eval metrics
                # predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
                # accuracy = tf.metrics.accuracy(
                #     labels=label_ids, predictions=predictions, weights=is_real_example)
                # loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example)
                # return {
                #     "eval_accuracy": accuracy,
                #     "eval_loss": loss,
                # }

            eval_metrics = metric_fn(per_example_loss, label_ids, probabilities, is_real_example)
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metric_ops=eval_metrics,
                scaffold=scaffold_fn)
        else:
            # Any other mode (prediction): expose the per-label probabilities.
            print("mode:", mode,"probabilities:", probabilities)
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                predictions={"probabilities": probabilities},
                scaffold=scaffold_fn)
        return output_spec

    return model_fn

In [34]:
# In-memory input pipeline; the training and evaluation cells of this notebook
# build their input functions with it.
def input_fn_builder(features, seq_length, is_training, drop_remainder):
    """Creates an `input_fn` closure that serves `features` from memory.

    Args:
        features: Sequence of feature objects exposing `input_ids`,
            `input_mask`, `segment_ids` and `label_id`.
        seq_length: Length of each id/mask/segment row.
        is_training: When true the dataset is repeated and shuffled.
        drop_remainder: Forwarded to `Dataset.batch`.

    Returns:
        A function `input_fn(params)` that reads `params["batch_size"]` and
        returns the batched dataset. The label tensor is float32 with one
        column per entry of the module-level `label_list`.
    """
    ids_rows, mask_rows, segment_rows, label_rows = [], [], [], []
    for item in features:
        ids_rows.append(item.input_ids)
        mask_rows.append(item.input_mask)
        segment_rows.append(item.segment_ids)
        label_rows.append(item.label_id)

    def input_fn(params):
        """The actual input function."""
        batch_size = params["batch_size"]
        num_examples = len(features)
        token_shape = [num_examples, seq_length]

        # Demo-scale only: everything is materialised as constants in the
        # graph. Dataset.from_generator() is avoided because it relies on
        # tf.py_func, which is not TPU compatible; TFRecords are the scalable
        # route.
        tensors = {
            "input_ids":
                tf.constant(ids_rows, shape=token_shape, dtype=tf.int32),
            "input_mask":
                tf.constant(mask_rows, shape=token_shape, dtype=tf.int32),
            "segment_ids":
                tf.constant(segment_rows, shape=token_shape, dtype=tf.int32),
            "label_ids":
                tf.constant(label_rows, shape=[num_examples,len(label_list)], dtype=tf.float32),
        }
        dataset = tf.data.Dataset.from_tensor_slices(tensors)

        if is_training:
            dataset = dataset.repeat().shuffle(buffer_size=100)

        return dataset.batch(batch_size=batch_size, drop_remainder=drop_remainder)

    return input_fn


In [35]:
# Compute the number of train and warmup steps:
# total steps = (examples / batch size) * epochs, truncated to an int, and
# warmup steps = WARMUP_PROPORTION of the total, also truncated.
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

In [36]:
# Specify the output directory and how often summaries and checkpoints are saved.
# keep_checkpoint_max=1 limits the number of checkpoint files kept on disk.
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    keep_checkpoint_max=1,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

In [37]:
# Load the BERT configuration and build the model_fn with one output per entry
# of label_list, initialised from BERT_INIT_CHKPNT and running without TPU.
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)
model_fn = model_fn_builder(
  bert_config=bert_config,
  num_labels= len(label_list),
  init_checkpoint=BERT_INIT_CHKPNT,
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  use_tpu=False,
  use_one_hot_embeddings=False)

# BATCH_SIZE reaches the input functions through params["batch_size"].
estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})

INFO:tensorflow:Using config: {'_model_dir': './models/19_06_2020/covid/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 1, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fddb4439978>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Using config: {'_model_dir': './models/19_06_2020/covid/', '_tf_ran

Next we create an input builder function that takes our training feature set (`train_features`) and produces a generator. This is a pretty standard design pattern for working with Tensorflow [Estimators](https://www.tensorflow.org/guide/estimators).

In [38]:
# Create an input function for training; with is_training=True the builder
# repeats and shuffles the data. drop_remainder = True is needed when using
# TPUs; it is False here.
train_input_fn = input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False)

# Training the model

In [39]:
import shutil

# Remove any previous model directory so this run starts from a clean
# OUTPUT_DIR instead of reusing files left there by an earlier run.
try:
    shutil.rmtree(OUTPUT_DIR)  # removes the model
except OSError:
    # Only filesystem errors are tolerated (typically the directory does not
    # exist yet). Anything else, such as an undefined OUTPUT_DIR or a keyboard
    # interrupt, now propagates instead of being silently swallowed.
    print('Failed to remove')

Failed to remove


In [40]:
# Train with max_steps=num_train_steps and report the elapsed wall-clock time.
print(f'Beginning Training!')
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)

Beginning Training!
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Calling model_fn.






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * htt

Instructions for updating:
Use standard file APIs to delete files with this prefix.
INFO:tensorflow:Loss for final step: 0.2052415.
INFO:tensorflow:Loss for final step: 0.2052415.
Training took time  0:01:43.178566


# Evaluating the model on Test Set

In [41]:
#test = test[test['is_stressor']==0]

# Build the test examples and features the same way as for training, then wrap
# them in an input function; is_training=False means no repeat and no shuffle.
test_InputExamples = create_examples(test)
test_features = convert_examples_to_features(examples=test_InputExamples,  max_seq_length=MAX_SEQ_LENGTH, tokenizer=tokenizer)
test_input_fn = input_fn_builder(
    features=test_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)

this project that im working on
i haven't been able to find work and things are going bad with school.
Being quarantined in my home because of a widespread infectious disease...
tires
i had to deal with a customer today
i am stressed out financially, i am worried about paying my bills.
making a decision on preschool
i have been sressing about not getting enough hits on turk
i wont be able to see william
i have urgent some work unable come on time
traffic!
Having to pay my student loans
my presentation that's coming up
i am struggling with feeling valuable in the workplace.
life has been all around stressful lately
feeling lonely
not enough money for the bills i have to pay!
Work was stressful today
I am stressed about the pandemic global lockdown
i'm dealing with health issues and am not sure of what is going to happen.
my financial situation is not getting better.
I am so stressed about my finances or rather, lack thereof.
i am having roommate issues.
going to work tomorrow
my product

In [42]:
# Evaluate on the test set. The result holds one AUC per label (keys "0", "1",
# ...) plus 'eval_loss' and 'cosine_loss', as defined in model_fn's metric_fn.
estimator.evaluate(input_fn=test_input_fn, steps=None)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:num_labels:9;logits:Tensor("loss/BiasAdd:0", shape=(?, 9), dtype=float32);labels:Tensor("IteratorGetNext:2", shape=(?, 9), dtype=float32)
INFO:tensorflow:num_labels:9;logits:Tensor("loss/BiasAdd:0", shape=(?, 9), dtype=float32);labels:Tensor("IteratorGetNext:2", shape=(?, 9), dtype=float32)
INFO:tensorflow:**** Trainable Variables ****
INFO:tensorflow:**** Trainable Variables ****
Instructions for updating:
dim is deprecated, use axis instead
Instructions for updating:
dim is deprecated, use axis instead
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-06-19T09:52:49Z
INFO:tensorflow:Starting evaluation at 2020-06-19T09:52:49Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow

{'0': 0.9489992,
 '1': 0.99350643,
 '2': 0.9285808,
 '3': 0.99999994,
 '4': 0.97821826,
 '5': 1.0,
 '6': 0.9821299,
 '7': 0.9807331,
 '8': 0.99242425,
 'cosine_loss': 0.14985459,
 'eval_loss': 0.22919266,
 'loss': 0.22965172,
 'global_step': 270}

In [43]:
# Collect the per-label probabilities for every example of the test set.
test_probabilities = [
    prediction['probabilities']
    for prediction in estimator.predict(test_input_fn, yield_single_examples=True)
]

# Take the argmax to keep only the highest-scoring category per example.
test_final_labels = np.array(test_probabilities).argmax(axis=1)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:num_labels:9;logits:Tensor("loss/BiasAdd:0", shape=(?, 9), dtype=float32);labels:Tensor("IteratorGetNext:2", shape=(?, 9), dtype=float32)
INFO:tensorflow:num_labels:9;logits:Tensor("loss/BiasAdd:0", shape=(?, 9), dtype=float32);labels:Tensor("IteratorGetNext:2", shape=(?, 9), dtype=float32)
INFO:tensorflow:**** Trainable Variables ****
INFO:tensorflow:**** Trainable Variables ****
mode: infer probabilities: Tensor("loss/Sigmoid:0", shape=(?, 9), dtype=float32)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./models/19_06_2020/covid/model.ckpt-270
INFO:tensorflow:Restoring parameters from ./models/19_06_2020/covid/model.ckpt-270
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensor

### Classification Report

In [49]:
# Encode each raw text label as its position in label_list_text. The
# classification-report and confusion-matrix cells below read this column,
# so this cell has to run before them.
test[LABEL_COLUMN] = test[LABEL_COLUMN_RAW].apply(
    lambda raw_label: label_list_text.index(raw_label)
)

In [47]:
# Build the scikit-learn classification report as a DataFrame: output_dict=True
# returns nested dicts, and the transpose puts one report entry per row.
# zero_division=0 reports 0 instead of warning when a class has no predictions.
report = pd.DataFrame(
    classification_report(
        list(test[LABEL_COLUMN]),
        list(test_final_labels),
        zero_division=0,
        output_dict=True,
    )
).T
# Bare last expression so the notebook renders the frame with its rich display.
report

KeyError: 'label_numeric'

### Confusion Matrix

In [None]:
# Compare the encoded true labels with the argmax predictions; label_list_text
# is passed as the category names.
get_confusion_matrix(
    y_test=test[LABEL_COLUMN],
    predicted=test_final_labels,
    labels=label_list_text,
)

In [130]:
# Debug check of the container type of `test` (a pandas DataFrame in the saved output).
type(test) 

pandas.core.frame.DataFrame

### Exporting test set with probabilities

In [149]:
# Rebind `test` and create `experiment_df` from the helper's two return values.
# NOTE(review): `test` is overwritten with a value derived from itself, so
# re-running this cell feeds the already-processed frame back into the helper —
# confirm get_test_experiment_df is safe to apply twice.
test, experiment_df = get_test_experiment_df(test)

i am trying to interact with a person who at times is in a bad mood
incredibly tired and i have to be alert until noon today
too many interruptions all day
i need to find a job
tires
classes
this person i work with is driving me crazy.
i have to prep for interviews
productivity
relationships
i'm stressing out about sticking to a work out routine
knowing what to do this weekend
too much work time not enough me time
figuring out what classes i want to take
college
a professor is driving me crazy.
eating well
commuting to work
i'm tired and i don't want to go to work
don't know what to cook for dinner
not understanding these instructions.
i embarrassed myself today
long day
my stomach hurts
i cant help william
my date
i'm tired
i'm not sleeping well
feeling lonely
2 of my foster kittens died
some health decisions
money
my 13 year old hates school and tries everything to avoid going
just travel stuff
i want to finish my paper
i feel lonely
body pain
i'll probably need to work all night
my 

In [150]:
# Display the frame returned by get_test_experiment_df (in the saved output:
# input text, top/second label and one score column per category).
experiment_df

Unnamed: 0,sOrder,Input.text,is_stressor,is_stressor_conf,top_label,second_label,Branch,Above SD-THRESHOLD,SD-THRESHOLD,SD,Other,Everyday Decision Making,Work,Social Relationships,Financial Problem,"Health, Fatigue, or Physical Pain",Emotional Turmoil,Family Issues,School,avg_severity,median_severity,SD_severity,Votes,Source
0,,i am trying to interact with a person who at times is in a bad mood,1,80.00%,Social Relationships,Emotional Turmoil,,,,,0.17216,0.09962,0.067464,0.383982,0.035997,0.069842,0.239306,0.169968,0.055694,2.4,2.0,1.85,5.0,popbots_live
1,,incredibly tired and i have to be alert until noon today,1,100.00%,"Health, Fatigue, or Physical Pain",Emotional Turmoil,,,,,0.131237,0.127069,0.063471,0.067016,0.091299,0.556308,0.133757,0.061309,0.053284,3.4,3.0,2.06,7.0,popbots_live
2,,too many interruptions all day,1,100.00%,Other,Everyday Decision Making,,,,,0.26751,0.215089,0.145852,0.115015,0.02408,0.067879,0.116862,0.072608,0.065142,2.0,2.0,1.1,6.0,popbots_live
3,,i need to find a job,1,80.00%,Work,Financial Problem,,,,,0.046525,0.045402,0.652512,0.046338,0.187315,0.029143,0.054039,0.025342,0.039947,4.0,5.0,2.1,6.0,popbots_live
4,,tires,1,60.00%,Other,Everyday Decision Making,,,,,0.299477,0.132096,0.047311,0.066064,0.096157,0.066994,0.072429,0.09296,0.057447,2.2,1.0,2.64,6.0,popbots_live
5,,classes,1,60.00%,School,Work,,,,,0.084667,0.069088,0.104103,0.064755,0.06525,0.03975,0.076972,0.061338,0.716767,1.0,1.0,0.89,5.0,popbots_live
6,,this person i work with is driving me crazy.,1,100.00%,Work,Social Relationships,,,,,0.059071,0.040662,0.529639,0.139698,0.045665,0.031272,0.109528,0.061438,0.045778,4.0,4.0,1.1,7.0,popbots_live
7,,i have to prep for interviews,1,100.00%,School,Work,,,,,0.079274,0.059947,0.255912,0.052082,0.046506,0.029214,0.060064,0.029441,0.420505,3.8,3.0,2.04,6.0,popbots_live
8,,productivity,1,80.00%,Work,Financial Problem,,,,,0.06401,0.06094,0.620512,0.049966,0.084656,0.026754,0.056028,0.030279,0.04343,4.0,3.0,3.03,5.0,popbots_live
9,,relationships,1,80.00%,Social Relationships,Emotional Turmoil,,,,,0.127226,0.06132,0.065116,0.368747,0.052363,0.054046,0.208713,0.174764,0.054854,2.2,2.0,1.33,6.0,popbots_live


In [151]:
# Write the test-set experiment frame to CSV under EXPERIMENTS_PATH.
experiment_df.to_csv(EXPERIMENTS_PATH+'/test_with_probabilities.csv') 

In [337]:
# Display experiment_df again.
# NOTE(review): the saved output for this cell (In [337]) has a different column
# layout than the In [150] display of the same variable, so it appears to come
# from a different run — re-run the notebook top to bottom to refresh it.
experiment_df

Unnamed: 0,Emotional Turmoil,Emotional Turmoil Predicted,Everyday Decision Making,Everyday Decision Making Predicted,Family Issues,Family Issues Predicted,Financial Problem,Financial Problem Predicted,"Health, Fatigue, or Physical Pain","Health, Fatigue, or Physical Pain Predicted",Input.text,Not Stressful,Other,Other Predicted,Predicted label,Predicted label 2,SD_severity,School,School Predicted,Social Relationships,Social Relationships Predicted,Source,Votes,Work,Work Predicted,avg_severity,class_freq,is_stressor,is_stressor_conf,label_numeric,labels,median_severity
0,0.0,0.114931,0.0,0.098465,0.0,0.080458,0.0,0.11006,1.0,0.504092,worrying about some pain in the ribs,0.0,0.0,0.086327,"Health, Fatigue, or Physical Pain",Emotional Turmoil,1.2,0.0,0.059007,0.0,0.061672,popbots_live,5,0.0,0.08179,5.6,129,1,100.00%,6,"Health, Fatigue, or Physical Pain",5
1,0.1,0.114033,0.0,0.128241,0.0,0.076831,0.0,0.094373,0.9,0.500694,poor sleep even if i go to bed on time,0.0,0.0,0.096524,"Health, Fatigue, or Physical Pain",Everyday Decision Making,1.17,0.0,0.07319,0.0,0.06022,popbots_live,6,0.0,0.080524,3.8,129,1,100.00%,6,"Health, Fatigue, or Physical Pain",4
2,0.0,0.055825,0.0,0.060044,0.0,0.031902,0.0,0.029481,0.0,0.041416,im felling pretty ok actually altho there seems to be lots of fault in the software that we're developing for research,0.4,0.0,0.05326,Work,School,1.1,0.2,0.084433,0.0,0.060102,popbots_live,7,0.4,0.274923,1.0,709,1,60.00%,2,Work,1
3,0.1,0.158815,0.0,0.133426,0.1,0.136412,0.0,0.059933,0.0,0.070172,i wont be able to see william,0.4,0.0,0.143647,Social Relationships,Emotional Turmoil,2.65,0.0,0.025021,0.4,0.198908,popbots_live,6,0.0,0.043268,2.6,127,1,60.00%,3,Social Relationships,2
4,0.2,0.045586,0.0,0.029603,0.0,0.022491,0.0,0.068351,0.0,0.020658,my job and career,0.0,0.0,0.024649,Work,Financial Problem,2.56,0.1,0.03177,0.0,0.049875,popbots_live,6,0.7,0.718007,4.2,709,1,100.00%,2,Work,4
5,0.1,0.049114,0.1,0.039015,0.0,0.016896,0.0,0.060511,0.0,0.039299,i'm worried i'm not working hard enough,0.0,0.0,0.031198,Work,Financial Problem,1.02,0.0,0.021771,0.0,0.043759,popbots_live,7,0.8,0.713766,3.4,709,1,100.00%,2,Work,3
6,0.0,0.048315,0.0,0.037417,0.0,0.020295,0.0,0.059854,0.0,0.026324,getting everything done at work,0.0,0.0,0.028738,Work,Financial Problem,1.26,0.0,0.02655,0.0,0.05046,popbots_live,5,1.0,0.789989,4.0,709,1,100.00%,2,Work,3
7,0.0,0.09878,0.0,0.062322,0.6,0.558648,0.0,0.074756,0.3,0.096021,my daughter has been having panic attacks,0.0,0.0,0.07035,Family Issues,Social Relationships,1.2,0.0,0.061349,0.1,0.106905,popbots_live,8,0.0,0.027614,7.6,235,1,100.00%,8,Family Issues,7
8,0.0,0.04941,0.4,0.040245,0.0,0.019267,0.0,0.051543,0.2,0.025123,deadlines,0.0,0.0,0.031007,Work,Social Relationships,2.42,0.0,0.029755,0.0,0.053417,popbots_live,5,0.4,0.770819,3.6,709,1,100.00%,2,Work,3
9,0.0,0.047017,0.0,0.040386,0.0,0.018149,0.0,0.062107,0.2,0.028208,i'll probably need to work all night,0.0,0.0,0.029352,Work,Financial Problem,1.41,0.1,0.026541,0.0,0.045167,popbots_live,8,0.7,0.768633,5.0,709,1,100.00%,2,Work,5


### RUN ALL CELLS ABOVE ON HERE

In [151]:
# NOTE(review): disabled inspection of misclassified rows. The saved output shows
# KeyError: 'Answer.Label', and neither 'Predicted' nor 'Answer.Label' appears in
# the experiment_df columns displayed above — update the column names before
# re-enabling this line.
#experiment_df[experiment_df['Predicted'] != experiment_df['Answer.Label']].head(10) # change head(n) to see more  

KeyError: 'Answer.Label'

# Exporting the model in pb format

In [41]:
def export_model(dir_path):
    """Export the notebook-level `estimator` as a SavedModel under `dir_path`.

    The serving signature exposes a single receiver tensor, `input_ids`: an
    int32 tensor of shape [1, MAX_SEQ_LEN], i.e. one example per request.
    The remaining model features are derived from it inside the graph.

    Args:
        dir_path: Base directory passed to `estimator.export_saved_model`.

    Returns:
        The value returned by `estimator.export_saved_model` (per the
        TensorFlow documentation, the path of the exported directory).
    """
    def serving_input_receiver_fn():
        """Build the serving inputs: raw `input_ids` in, full feature dict out."""
        input_ids = tf.placeholder(dtype=tf.int32,
                                   shape=[1, MAX_SEQ_LEN])
        receiver_tensors = {"input_ids": input_ids}
        features = {
            # Placeholder that is NOT exposed as a receiver tensor, so clients
            # never feed it. NOTE(review): presumably present only because the
            # model_fn reads features["label_ids"] — confirm.
            "label_ids": tf.placeholder(tf.int32, [None], name='label_ids'),
            "input_ids": input_ids,
            # 1 wherever the token id is non-zero, 0 wherever it equals 0
            # (assumes id 0 is padding — TODO confirm against the tokenizer).
            "input_mask": 1 - tf.cast(tf.equal(input_ids, 0), dtype=tf.int32),
            # All-zero segment ids for the single input sequence.
            "segment_ids": tf.zeros(dtype=tf.int32,
                                    shape=[1, MAX_SEQ_LEN]),
        }
        return tf.estimator.export.ServingInputReceiver(features, receiver_tensors)

    # Private estimator flag switched off before exporting.
    # NOTE(review): presumably skips a TPU-specific export — confirm it is still needed.
    estimator._export_to_tpu = False
    return estimator.export_saved_model(dir_path, serving_input_receiver_fn)

In [114]:
# Export the estimator under ./tfmode/pbformat/. The cell defining export_model
# must be run first: the saved output below is a NameError from a run where it was not.
export_model('./tfmode/pbformat/')

NameError: name 'export_model' is not defined

## Getting analysis for another dataset

In [152]:
# Read the full dataset and keep only the rows whose 'Source' equals LIVE_NAME.
test_all_live = pd.read_csv(PATH + DATASET_NAME + '.csv').loc[
    lambda frame: frame['Source'] == LIVE_NAME
]

# Run the same helper used for the test split on the live-source rows.
test_all_live, experiment_df_live = get_test_experiment_df(test_all_live)

2 of my foster kittens died
i made a mistake
traffic!
feeling lonely
i feel lonely
my foster kitten is dying.
i embarrassed myself today
being unproductive
i'm worried that i'm wasting food
too many interruptions all day
my lease is ending soon
moving
time management
so much to do
a lot on my to do list
i need to figure out storage
productivity
packing for my trips
i have some tasks that i keep putting off
i'm worried about completeting all my tasks today
feeling overwhelmed
my kids spent too much money at the event today
my 13 year old hates school and tries everything to avoid going
my daughter has been having panic attacks
big trip home
my son and sleep
my child
making a decision on preschool
my daughter sarah
my marriage
money
i have a friend coming over and i'm stressed about a big expense
my stomach hurts
worrying about some pain in the ribs
getting ebola.
poor sleep even if i go to bed on time
i'm not sleeping well
i'm tired
i have a terrible headache
deciding if i should work o

In [153]:
# Write the live-source experiment frame to CSV under EXPERIMENTS_PATH.
# NOTE(review): this is the same file name used by the earlier test-set export
# (EXPERIMENTS_PATH + '/test_with_probabilities.csv'), so running both cells
# leaves only the live-source data on disk — confirm this is intended or use a
# distinct file name.
experiment_df_live.to_csv(EXPERIMENTS_PATH+'/test_with_probabilities.csv') 

In [154]:
# Display the experiment frame built from the live-source rows.
experiment_df_live

Unnamed: 0,sOrder,Input.text,is_stressor,is_stressor_conf,top_label,second_label,Branch,Above SD-THRESHOLD,SD-THRESHOLD,SD,Other,Everyday Decision Making,Work,Social Relationships,Financial Problem,"Health, Fatigue, or Physical Pain",Emotional Turmoil,Family Issues,School,avg_severity,median_severity,SD_severity,Votes,Source
0,,2 of my foster kittens died,1,100.00%,Family Issues,Social Relationships,,,,,0.099461,0.059411,0.035715,0.161233,0.095379,0.059143,0.127384,0.546898,0.048009,8.2,8.0,1.6,6.0,popbots_live
1,,i made a mistake,1,100.00%,Other,Emotional Turmoil,,,,,0.211172,0.090374,0.045234,0.117001,0.050715,0.057799,0.117945,0.087511,0.042616,4.0,4.0,0.89,5.0,popbots_live
2,,traffic!,1,100.00%,Other,Everyday Decision Making,,,,,0.33563,0.148663,0.099182,0.078876,0.053527,0.068947,0.081893,0.073878,0.072303,4.4,4.0,1.62,6.0,popbots_live
3,,feeling lonely,1,100.00%,Social Relationships,Family Issues,,,,,0.168729,0.081911,0.040129,0.234337,0.044177,0.054235,0.169007,0.169644,0.040048,3.8,4.0,1.6,6.0,popbots_live
4,,i feel lonely,1,100.00%,Social Relationships,Family Issues,,,,,0.159239,0.079622,0.038107,0.246978,0.048148,0.056321,0.177447,0.199345,0.041478,5.8,6.0,2.32,9.0,popbots_live
5,,my foster kitten is dying.,1,100.00%,Family Issues,Social Relationships,,,,,0.093452,0.062245,0.036733,0.156576,0.092347,0.075203,0.127673,0.561083,0.048084,7.0,7.0,0.63,8.0,popbots_live
6,,i embarrassed myself today,1,100.00%,Other,"Health, Fatigue, or Physical Pain",,,,,0.204243,0.111206,0.03287,0.124865,0.036189,0.191693,0.148444,0.071333,0.046289,2.0,2.0,0.63,5.0,popbots_live
7,,being unproductive,1,100.00%,Other,Everyday Decision Making,,,,,0.369817,0.207506,0.08452,0.135191,0.035513,0.081452,0.132535,0.091416,0.060687,2.6,3.0,1.02,5.0,popbots_live
8,,i'm worried that i'm wasting food,1,100.00%,"Health, Fatigue, or Physical Pain",Everyday Decision Making,,,,,0.151771,0.157974,0.054371,0.059482,0.102166,0.516383,0.121924,0.066598,0.052469,1.6,1.0,0.8,5.0,popbots_live
9,,too many interruptions all day,1,100.00%,Other,Everyday Decision Making,,,,,0.26751,0.215089,0.145852,0.115015,0.02408,0.067879,0.116862,0.072608,0.065142,2.0,2.0,1.1,6.0,popbots_live
