In [1]:
import os
from bs4 import BeautifulSoup 
import re

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import f1_score, classification_report
from sklearn.utils import class_weight
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import HashingVectorizer

from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model

import warnings
warnings.filterwarnings('ignore')

  from pandas.core.computation.check import NUMEXPR_INSTALLED
 The versions of TensorFlow you are currently using is 2.8.4 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


## Helper Functions for Data

In [2]:
def extract_filenames(path):
    files = [] 
    for filename in os.listdir(path):
        if not filename.endswith('.xml'):
            continue
        filepath = os.path.join(path, filename)
        files.append(filepath)
    return files

def extract_train_chunks():
    dataframe_collection = {} 
    for ctr in range(1,11):
        positive_file_path = "../dataset/2018 train/positive_examples/chunk"+str(ctr)
        negative_file_path = "../dataset/2018 train/negative_examples/chunk"+str(ctr)
        positive_files = extract_filenames(positive_file_path)
        negative_files = extract_filenames(negative_file_path)
        files = positive_files + negative_files
        data_list = []
        for file in files:
            if 'positive' in file:
                label = 1
            elif 'negative' in file:
                label = 0
            fd = open(file,'r')
            data = fd.read()
            soup = BeautifulSoup(data,'xml')
            subject_id = soup.find('ID')
            writings = soup.find_all('WRITING')
            title = ''
            text = ''
            for writing in writings:
                title = title + writing.find('TITLE').get_text() + ' '
                text = text + writing.find('TEXT').get_text() + ' '
                row = [subject_id.get_text(), title, text, label]
            data_list.append(row)
        chunk_name = 'chunk'+str(ctr)
        dataframe_collection[chunk_name] = pd.DataFrame(data_list, columns = ['subject_id', 'title', 'text', 'label'])
    return dataframe_collection


def extract_test_chunks():
    dataframe_collection = {} 
    for ctr in range(1,11):
        file_path = "../dataset/2018 test/chunk"+str(ctr)
        files = extract_filenames(file_path)
        data_list = []
        for file in files:
            fd = open(file,'r')
            data = fd.read()
            soup = BeautifulSoup(data,'xml')
            subject_id = soup.find('ID')
            writings = soup.find_all('WRITING')
            title = ''
            text = ''
            for writing in writings:
                title = title + writing.find('TITLE').get_text() + ' '
                text = text + writing.find('TEXT').get_text() + ' '
                row = [subject_id.get_text(), title, text]
            data_list.append(row)
        chunk_name = 'chunk'+str(ctr)
        dataframe_collection[chunk_name] = pd.DataFrame(data_list, columns = ['subject_id', 'title', 'text'])
    return dataframe_collection

def stemSentence(sentence):
    lemmatizer=WordNetLemmatizer()
    token_words=word_tokenize(sentence)
    token_words
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(lemmatizer.lemmatize(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

# Preprocess, encode data (word embeddings) for every chunk
def preprocess_data(df):
#   TITLE CLEAN
    df['title_clean'] = df['title'].loc[df['title'] ==  ' [removed] '] = ' '
    df['title_clean'] = df['title'].str.lower()
    df['title_clean'] = df['title_clean'].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
    # remove numbers
    df['title_clean'] = df['title_clean'].apply(lambda elem: re.sub(r"\d+", "", elem))
    # remove duplicate spaces
    df['title_clean'] = df['title_clean'].apply(lambda elem: re.sub(' +', ' ', elem))
    # remove stop words
    df['title_clean'] = df['title_clean'].apply(lambda elem: remove_stopwords(elem))
    df['title_clean'] = df['title_clean'].apply(lambda elem: stemSentence(elem))
    
#   TEXT CLEAN
    df['text_clean'] = df['text'].loc[df['title'] ==  ' [removed] '] = ' '
    df['text_clean'] = df['text'].str.lower()
    df['text_clean'] = df['text_clean'].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
    # remove numbers
    df['text_clean'] = df['text_clean'].apply(lambda elem: re.sub(r"\d+", "", elem))
    # remove duplicate spaces
    df['text_clean'] = df['text_clean'].apply(lambda elem: re.sub(' +', ' ', elem))
    # remove stop words
    df['text_clean'] = df['text_clean'].apply(lambda elem: remove_stopwords(elem))
    df['text_clean'] = df['text_clean'].apply(lambda elem: stemSentence(elem))
    
    df['final_text'] = df['title_clean'] + df['text_clean']
    
    final_dataset = pd.DataFrame(df['subject_id'])
    final_dataset['text'] = df['title_clean'] + ' ' + df['final_text']
    if 'label' in df.columns:
        final_dataset['label'] = df['label']
    
    return final_dataset

## Helper Functions for Training

In [32]:
# Function for Bert Model creation
def build_bert_model():
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3', name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4', trainable=True)
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dense(100, name='features')(net)
    net = tf.keras.layers.Dense(50)(net)
    net = tf.keras.layers.Dense(10)(net)
    net = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(net)
    model = tf.keras.Model(text_input, net)
    
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    metrics = tf.metrics.BinaryAccuracy()
    model.compile(optimizer='Adagrad',
                       loss=loss,
                       metrics=metrics)

    return model

def train(train_df_collection, model):
    metrics_list = []
    
    # Use only first 9 chunks for training and 10th chunk for validation 
    for chunk in range (len(train_df_collection)-1):
#     for chunk in range (2):
        chunk_name = 'chunk'+str(chunk+1)
        print('Training ' + chunk_name + '...')
        df = preprocess_data(train_df_collection[chunk_name])
        X_train = df['text']
        Y_train = df['label']
        
        model.fit(X_train, Y_train, class_weight={0:0.5,1:3.8}, epochs=5)
#         model.fit(X_train, Y_train, epochs=1)
        
        predictions_probs = model.predict(X_train)
        predictions = np.where(predictions_probs > 0.5, 1, 0)
     
        score = f1_score(Y_train, predictions, average='weighted')
        metrics_list.append(score)
        
        print ('F1 Score :',f1_score(Y_train, predictions, average=None))
    metrics_df = pd.DataFrame(metrics_list, columns=['F1_score'])
    
    return model

def validate(df, model):
    
    df = preprocess_data(df)
    X_val = df['text']
    Y_val = df['label']
    
    predictions_probs = model.predict(X_val)
    predictions = np.where(predictions_probs > 0.5, 1, 0)

    print()
    print(classification_report(Y_val, predictions, target_names=['Non-Anorexic', 'Anorexic']))
    
    
def test_model(test_chunk_collection, test_labels, model):
    
    for chunk in range(1,11):
        chunk_name = 'chunk'+str(chunk)
        chunk_df = test_chunk_collection[chunk_name]

        # preprocess, vectorize and predict
        clean_df = preprocess_data(chunk_df)

        X_test = clean_df.text
        
        chunk_probs = model.predict(X_test)
        chunk_pred = np.where(chunk_probs > 0.5, 1, 0)


        # Save prediction
        pred_df = pd.DataFrame(chunk_pred, columns=['pred'])
        pred_df.pred = pred_df.pred.astype('int')

        # save predictions to dataframe
        chunks_pred_df = pd.DataFrame(clean_df['subject_id'])
        chunks_pred_df['pred'] = pred_df['pred'].values

    # Map chunk predictions with truth labels
    test_pred_list = []
    for sub in chunks_pred_df['subject_id']:
        value = test_labels.loc[test_labels['subject_id']==sub]['label'].values[0]
        value_list = [sub, value]
        test_pred_list.append(value_list)
    final_test_pred = pd.DataFrame(test_pred_list, columns=['subject_id', 'label'])    
    
    # Print classification report
    print(classification_report(final_test_pred['label'], chunks_pred_df['pred']))

## Extract Data

In [4]:
# Train Data - Chunks 1 - 9
train_dataframe_collection = extract_train_chunks()

# Validation Data - Chunk 10
val_df = train_dataframe_collection['chunk10']

# Test Data
test_dataframe_collection = extract_test_chunks()
test_truth_labels = pd.read_csv('../dataset/2018 test/risk-golden-truth-test.csv')

### Class weights

In [5]:
class_weights = compute_class_weight('balanced', classes=np.array([0,1]), y=val_df['label'])
print((class_weights))

[0.57575758 3.8       ]


## BERT Classifier

In [33]:
bert_clf = build_bert_model()

In [34]:
bert_clf.summary()

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 preprocessing (KerasLayer)     {'input_mask': (Non  0           ['text[0][0]']                   
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128)}                                                

In [35]:
bert_online = train(train_dataframe_collection, bert_clf)

Training chunk1...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
F1 Score : [0.         0.23255814]
Training chunk2...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
F1 Score : [0.93023256 0.60869565]
Training chunk3...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
F1 Score : [0.70588235 0.4       ]
Training chunk4...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
F1 Score : [0.91358025 0.6557377 ]
Training chunk5...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
F1 Score : [0.95275591 0.76      ]
Training chunk6...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
F1 Score : [0.94071146 0.70588235]
Training chunk7...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
F1 Score : [0.93172691 0.69090909]
Training chunk8...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
F1 Score : [0.98069498 0.88888889]
Training chunk9...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
F1 Score : [0.9609375  0.79166667]


In [38]:
validate(val_df, bert_clf)


              precision    recall  f1-score   support

Non-Anorexic       0.95      0.94      0.94       132
    Anorexic       0.62      0.65      0.63        20

    accuracy                           0.90       152
   macro avg       0.78      0.79      0.79       152
weighted avg       0.90      0.90      0.90       152



In [39]:
test_model(test_dataframe_collection, test_truth_labels, bert_clf)

              precision    recall  f1-score   support

           0       0.95      0.95      0.95       279
           1       0.67      0.63      0.65        41

    accuracy                           0.91       320
   macro avg       0.81      0.79      0.80       320
weighted avg       0.91      0.91      0.91       320

