In [2]:
import os
from bs4 import BeautifulSoup 
import re

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import f1_score, classification_report
from sklearn.utils import class_weight
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import HashingVectorizer

from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model

import warnings
warnings.filterwarnings('ignore')

## Helper Functions for Data

In [3]:
def extract_filenames(path):
    files = [] 
    for filename in os.listdir(path):
        if not filename.endswith('.xml'):
            continue
        filepath = os.path.join(path, filename)
        files.append(filepath)
    return files

def extract_train_chunks():
    dataframe_collection = {} 
    for ctr in range(1,11):
        positive_file_path = "/Users/rajithamuthukrishnan/Desktop/uOttawa/Project_CSI6900/dataset/erisk 2019/eRisk2019_T1/training data - t1/2018 train/positive_examples/chunk"+str(ctr)
        negative_file_path = "/Users/rajithamuthukrishnan/Desktop/uOttawa/Project_CSI6900/dataset/erisk 2019/eRisk2019_T1/training data - t1/2018 train/negative_examples/chunk"+str(ctr)
        positive_files = extract_filenames(positive_file_path)
        negative_files = extract_filenames(negative_file_path)
        files = positive_files + negative_files
        data_list = []
        for file in files:
            if 'positive' in file:
                label = 1
            elif 'negative' in file:
                label = 0
            fd = open(file,'r')
            data = fd.read()
            soup = BeautifulSoup(data,'xml')
            subject_id = soup.find('ID')
            writings = soup.find_all('WRITING')
            title = ''
            text = ''
            for writing in writings:
                title = title + writing.find('TITLE').get_text() + ' '
                text = text + writing.find('TEXT').get_text() + ' '
                row = [subject_id.get_text(), title, text, label]
            data_list.append(row)
        chunk_name = 'chunk'+str(ctr)
        dataframe_collection[chunk_name] = pd.DataFrame(data_list, columns = ['subject_id', 'title', 'text', 'label'])
    return dataframe_collection


def extract_test_chunks():
    dataframe_collection = {} 
    for ctr in range(1,11):
        file_path = "/Users/rajithamuthukrishnan/Desktop/uOttawa/Project_CSI6900/dataset/erisk 2019/eRisk2019_T1/training data - t1/2018 test/chunk"+str(ctr)
        files = extract_filenames(file_path)
        data_list = []
        for file in files:
            fd = open(file,'r')
            data = fd.read()
            soup = BeautifulSoup(data,'xml')
            subject_id = soup.find('ID')
            writings = soup.find_all('WRITING')
            title = ''
            text = ''
            for writing in writings:
                title = title + writing.find('TITLE').get_text() + ' '
                text = text + writing.find('TEXT').get_text() + ' '
                row = [subject_id.get_text(), title, text]
            data_list.append(row)
        chunk_name = 'chunk'+str(ctr)
        dataframe_collection[chunk_name] = pd.DataFrame(data_list, columns = ['subject_id', 'title', 'text'])
    return dataframe_collection

def stemSentence(sentence):
    lemmatizer=WordNetLemmatizer()
    token_words=word_tokenize(sentence)
    token_words
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(lemmatizer.lemmatize(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

# Preprocess, encode data (word embeddings) for every chunk
def preprocess_data(df):
#   TITLE CLEAN
    df['title_clean'] = df['title'].loc[df['title'] ==  ' [removed] '] = ' '
    df['title_clean'] = df['title'].str.lower()
    df['title_clean'] = df['title_clean'].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
    # remove numbers
    df['title_clean'] = df['title_clean'].apply(lambda elem: re.sub(r"\d+", "", elem))
    # remove duplicate spaces
    df['title_clean'] = df['title_clean'].apply(lambda elem: re.sub(' +', ' ', elem))
    # remove stop words
    df['title_clean'] = df['title_clean'].apply(lambda elem: remove_stopwords(elem))
    df['title_clean'] = df['title_clean'].apply(lambda elem: stemSentence(elem))
    
#   TEXT CLEAN
    df['text_clean'] = df['text'].loc[df['title'] ==  ' [removed] '] = ' '
    df['text_clean'] = df['text'].str.lower()
    df['text_clean'] = df['text_clean'].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
    # remove numbers
    df['text_clean'] = df['text_clean'].apply(lambda elem: re.sub(r"\d+", "", elem))
    # remove duplicate spaces
    df['text_clean'] = df['text_clean'].apply(lambda elem: re.sub(' +', ' ', elem))
    # remove stop words
    df['text_clean'] = df['text_clean'].apply(lambda elem: remove_stopwords(elem))
    df['text_clean'] = df['text_clean'].apply(lambda elem: stemSentence(elem))
    
    df['final_text'] = df['title_clean'] + df['text_clean']
    
    final_dataset = pd.DataFrame(df['subject_id'])
    final_dataset['text'] = df['title_clean'] + ' ' + df['final_text']
    if 'label' in df.columns:
        final_dataset['label'] = df['label']
    
    return final_dataset

## Helper Functions for Training

In [4]:
# Function for Hashing Vectorizer
def vectorize_data(text_input, vectorizer, vec_type):

    if vec_type == 'Hash':
        data_vectorized = vectorizer.transform(text_input)

    elif vec_type == 'BERT':
        data_vectorized = vectorizer.predict(text_input)
        
    return data_vectorized

# Function for model training with Hashing Vectorizer
def train(train_df_collection, model):
    vectorizer = HashingVectorizer()
    metrics_list = []
        
    # Use only first 9 chunks for training and 10th chunk for validation     
    for chunk in range (len(train_df_collection) - 1):
        chunk_name = 'chunk'+str(chunk+1)
        print('Training ' + chunk_name + '...')
        df = preprocess_data(train_df_collection[chunk_name])
        
        train_input = df['text']
        train_label = df['label']
        
        vectorizer.partial_fit(train_input)
        
        X_train = vectorize_data(train_input, vectorizer, vec_type='Hash')
        Y_train = train_label
            
        if 'SGDClassifier' in str(type(model)):
            if chunk_name == 'chunk1':
                model.fit(X_train, Y_train)
            else:
                model.partial_fit(X_train, Y_train)
        if 'LogisticRegression' in str(type(model)):
            model.fit(X_train, Y_train)

        predictions = model.predict(X_train)
            
        score = f1_score(Y_train, predictions, average='weighted')
        metrics_list.append(score)
        
        print ('F1 Score :',f1_score(Y_train, predictions, average=None))
        
    metrics_df = pd.DataFrame(metrics_list, columns=['F1_score'])
    return vectorizer
    

# Function for Bert Model creation
def build_bert_model():
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3', name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4', trainable=True)
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dense(100, name='features')(net)
    net = tf.keras.layers.Dense(75)(net)
    net = tf.keras.layers.Dense(50)(net)
    net = tf.keras.layers.Dense(25)(net)
    net = tf.keras.layers.Dense(10)(net)
    net = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(net)
    model = tf.keras.Model(text_input, net)
    
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    metrics = tf.metrics.BinaryAccuracy()

    init_lr = 3e-5
    optimizer = optimization.create_optimizer(init_lr=init_lr,
                                              num_train_steps=1000,
                                              num_warmup_steps=100,
                                              optimizer_type='adamw')
    model.compile(optimizer=optimizer,
                       loss=loss,
                       metrics=metrics)

    return model

def train_bert(train_df_collection, model):
    metrics_list = []
    bert_model = build_bert_model()
    
    # Use only first 9 chunks for training and 10th chunk for validation 
    for chunk in range (len(train_df_collection)-1):
        chunk_name = 'chunk'+str(chunk+1)
        print('Training ' + chunk_name + '...')
        df = preprocess_data(train_df_collection[chunk_name])
        train_input = df['text']
        train_label = df['label']
        
        bert_model.fit(train_input, train_label, class_weight={0:0.5,1:3.8}, epochs=5, verbose=0)
        
        vectorizer = Model(bert_model.input, outputs=bert_model.get_layer('features').output)
        
        X_train = vectorize_data(train_input, vectorizer, vec_type='BERT')
        Y_train = train_label     
                
        if 'SGDClassifier' in str(type(model)):
            if chunk_name == 'chunk1':
                model.fit(X_train, Y_train)
            else:
                model.partial_fit(X_train, Y_train)
        if 'LogisticRegression' in str(type(model)):
            model.fit(X_train, Y_train)

        predictions = model.predict(X_train)
            
        score = f1_score(Y_train, predictions, average='weighted')
        metrics_list.append(score)
        
        print ('F1 Score :',f1_score(Y_train, predictions, average=None))
    metrics_df = pd.DataFrame(metrics_list, columns=['F1_score'])
    
    return vectorizer

def validate(df, model, vectorizer, vec_type):
    
    df = preprocess_data(df)
    val_input = df['text']
    val_label = df['label']
    
    X_val = vectorize_data(val_input, vectorizer, vec_type=vec_type)
    Y_val = val_label
    
    predictions = model.predict(X_val)

    print()
    print(classification_report(Y_val, predictions, target_names=['Non-Anorexic', 'Anorexic']))
    
    
def test_model(test_chunk_collection, test_labels, model, vectorizer, vec_type):
    
    for chunk in range(1,11):
        chunk_name = 'chunk'+str(chunk)
        chunk_df = test_chunk_collection[chunk_name]

        # preprocess, vectorize and predict
        clean_df = preprocess_data(chunk_df)

        X_test = vectorize_data(clean_df.text, vectorizer, vec_type=vec_type)

        chunk_pred = model.predict(X_test)

        # Save prediction
        pred_df = pd.DataFrame(chunk_pred, columns=['pred'])
        pred_df.pred = pred_df.pred.astype('int')

        # save predictions to dataframe
        chunks_pred_df = pd.DataFrame(clean_df['subject_id'])
        chunks_pred_df['pred'] = pred_df['pred'].values

    # Map chunk predictions with truth labels
    test_pred_list = []
    for sub in chunks_pred_df['subject_id']:
        value = test_labels.loc[test_labels['subject_id']==sub]['label'].values[0]
        value_list = [sub, value]
        test_pred_list.append(value_list)
    final_test_pred = pd.DataFrame(test_pred_list, columns=['subject_id', 'label'])    
    
    # Print classification report
    print(classification_report(final_test_pred['label'], chunks_pred_df['pred']))

## Extract Data

In [5]:
# Train Data - Chunks 1 - 9
train_dataframe_collection = extract_train_chunks()

# Validation Data - Chunk 10
val_df = train_dataframe_collection['chunk10']

# Test Data
test_dataframe_collection = extract_test_chunks()
test_truth_labels = pd.read_csv('/Users/rajithamuthukrishnan/Desktop/uOttawa/Project_CSI6900/dataset/erisk 2019/eRisk2019_T1/training data - t1/2018 test/risk-golden-truth-test.csv')

### Class weights

In [6]:
class_weights = compute_class_weight('balanced', classes=np.array([0,1]), y=val_df['label'])
print((class_weights))

[0.57575758 3.8       ]


## Models - Hash Vectorizer

### SGD Classifier

In [7]:
sgd_clf = SGDClassifier(loss='log_loss', class_weight={0:0.57, 1:3.8}, warm_start=True, learning_rate='adaptive', eta0=2)
sgd_vectorizer = train(train_dataframe_collection, sgd_clf)

Training chunk1...
F1 Score : [1. 1.]
Training chunk2...
F1 Score : [0.78899083 0.46511628]
Training chunk3...
F1 Score : [0.97297297 0.84444444]
Training chunk4...
F1 Score : [0.94820717 0.75471698]
Training chunk5...
F1 Score : [0.91358025 0.6557377 ]
Training chunk6...
F1 Score : [0.97276265 0.85106383]
Training chunk7...
F1 Score : [0.96470588 0.81632653]
Training chunk8...
F1 Score : [0.97276265 0.85106383]
Training chunk9...
F1 Score : [0.98069498 0.88888889]


In [8]:
validate(val_df, sgd_clf, sgd_vectorizer, vec_type='Hash')


              precision    recall  f1-score   support

Non-Anorexic       0.98      0.93      0.95       132
    Anorexic       0.65      0.85      0.74        20

    accuracy                           0.92       152
   macro avg       0.82      0.89      0.85       152
weighted avg       0.93      0.92      0.93       152



In [9]:
test_model(test_dataframe_collection, test_truth_labels, sgd_clf, sgd_vectorizer, vec_type='Hash')

Saved predictions to "test_predictions" folder
              precision    recall  f1-score   support

           0       0.97      0.88      0.92       279
           1       0.50      0.80      0.62        41

    accuracy                           0.87       320
   macro avg       0.73      0.84      0.77       320
weighted avg       0.91      0.87      0.88       320



### Logistic Regression

In [10]:
lr_clf = LogisticRegression(solver='lbfgs', class_weight='balanced', warm_start=True)
lr_vectorizer = train(train_dataframe_collection, lr_clf)

Training chunk1...
F1 Score : [0.99619772 0.97560976]
Training chunk2...
F1 Score : [0.98850575 0.93023256]
Training chunk3...
F1 Score : [0.99236641 0.95238095]
Training chunk4...
F1 Score : [1. 1.]
Training chunk5...
F1 Score : [0.99619772 0.97560976]
Training chunk6...
F1 Score : [0.99236641 0.95238095]
Training chunk7...
F1 Score : [1. 1.]
Training chunk8...
F1 Score : [0.99236641 0.95238095]
Training chunk9...
F1 Score : [1. 1.]


In [11]:
validate(val_df, lr_clf, lr_vectorizer, vec_type='Hash')


              precision    recall  f1-score   support

Non-Anorexic       0.94      0.99      0.96       132
    Anorexic       0.92      0.55      0.69        20

    accuracy                           0.93       152
   macro avg       0.93      0.77      0.83       152
weighted avg       0.93      0.93      0.93       152



In [12]:
test_model(test_dataframe_collection, test_truth_labels, lr_clf, lr_vectorizer, vec_type='Hash')

Saved predictions to "test_predictions" folder
              precision    recall  f1-score   support

           0       0.94      0.97      0.95       279
           1       0.74      0.56      0.64        41

    accuracy                           0.92       320
   macro avg       0.84      0.77      0.80       320
weighted avg       0.91      0.92      0.91       320



### SVM

In [13]:
svm_clf = SGDClassifier(loss='hinge', penalty='l2', class_weight={0:0.57, 1:3.8}, warm_start=True)
svm_vectorizer = train(train_dataframe_collection,svm_clf)

Training chunk1...
F1 Score : [1. 1.]
Training chunk2...
F1 Score : [0.936     0.7037037]
Training chunk3...
F1 Score : [0.98084291 0.88372093]
Training chunk4...
F1 Score : [0.96923077 0.81818182]
Training chunk5...
F1 Score : [0.97276265 0.85106383]
Training chunk6...
F1 Score : [0.94820717 0.75471698]
Training chunk7...
F1 Score : [0.94820717 0.75471698]
Training chunk8...
F1 Score : [0.98069498 0.88888889]
Training chunk9...
F1 Score : [0.944      0.74074074]


In [14]:
validate(val_df, svm_clf, svm_vectorizer, vec_type='Hash')


              precision    recall  f1-score   support

Non-Anorexic       0.98      0.86      0.92       132
    Anorexic       0.50      0.90      0.64        20

    accuracy                           0.87       152
   macro avg       0.74      0.88      0.78       152
weighted avg       0.92      0.87      0.88       152



In [15]:
test_model(test_dataframe_collection, test_truth_labels, svm_clf, svm_vectorizer, vec_type='Hash')

Saved predictions to "test_predictions" folder
              precision    recall  f1-score   support

           0       0.97      0.82      0.89       279
           1       0.40      0.83      0.54        41

    accuracy                           0.82       320
   macro avg       0.69      0.83      0.72       320
weighted avg       0.90      0.82      0.85       320



## Models - BERT as Vectorizer

### SGD Classifier

In [16]:
sgd_bert_clf = SGDClassifier(loss='log_loss', class_weight={0:0.57, 1:3.8}, warm_start=True, learning_rate='adaptive', eta0=2)
sgd_bert_vectorizer = train_bert(train_dataframe_collection, sgd_bert_clf)

Training chunk1...
F1 Score : [0.93650794 0.69230769]
Training chunk2...
F1 Score : [0.98084291 0.88372093]
Training chunk3...
F1 Score : [1. 1.]
Training chunk4...
F1 Score : [0.98069498 0.88888889]
Training chunk5...
F1 Score : [0.98867925 0.92307692]
Training chunk6...
F1 Score : [0.99619772 0.97560976]
Training chunk7...
F1 Score : [0.98113208 0.87179487]
Training chunk8...
F1 Score : [0.99242424 0.95      ]
Training chunk9...
F1 Score : [0.98484848 0.9       ]


In [17]:
validate(val_df, sgd_bert_clf, sgd_bert_vectorizer, vec_type='BERT')


              precision    recall  f1-score   support

Non-Anorexic       0.97      0.89      0.92       132
    Anorexic       0.52      0.80      0.63        20

    accuracy                           0.88       152
   macro avg       0.74      0.84      0.78       152
weighted avg       0.91      0.88      0.89       152



In [18]:
test_model(test_dataframe_collection, test_truth_labels, sgd_bert_clf, sgd_bert_vectorizer, vec_type='BERT')

Saved predictions to "test_predictions" folder
              precision    recall  f1-score   support

           0       0.95      0.89      0.92       279
           1       0.47      0.68      0.56        41

    accuracy                           0.86       320
   macro avg       0.71      0.79      0.74       320
weighted avg       0.89      0.86      0.87       320



### Logistic Regression

In [19]:
lr_bert_clf = LogisticRegression(solver='lbfgs', class_weight='balanced', warm_start=True)
lr_bert_vectorizer = train_bert(train_dataframe_collection, lr_bert_clf)

Training chunk1...
F1 Score : [0.94208494 0.66666667]
Training chunk2...
F1 Score : [0.96946565 0.80952381]
Training chunk3...
F1 Score : [0.99236641 0.95238095]
Training chunk4...
F1 Score : [1. 1.]
Training chunk5...
F1 Score : [0.98850575 0.93023256]
Training chunk6...
F1 Score : [1. 1.]
Training chunk7...
F1 Score : [0.99619772 0.97560976]
Training chunk8...
F1 Score : [1. 1.]
Training chunk9...
F1 Score : [0.99242424 0.95      ]


In [20]:
validate(val_df, lr_bert_clf, lr_bert_vectorizer, vec_type='BERT')


              precision    recall  f1-score   support

Non-Anorexic       0.95      0.95      0.95       132
    Anorexic       0.68      0.65      0.67        20

    accuracy                           0.91       152
   macro avg       0.82      0.80      0.81       152
weighted avg       0.91      0.91      0.91       152



In [21]:
test_model(test_dataframe_collection, test_truth_labels, lr_bert_clf, lr_bert_vectorizer, vec_type='BERT')

Saved predictions to "test_predictions" folder
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       279
           1       0.65      0.68      0.67        41

    accuracy                           0.91       320
   macro avg       0.80      0.81      0.81       320
weighted avg       0.91      0.91      0.91       320



### SVM

In [22]:
svm_bert_clf = SGDClassifier(loss='hinge', penalty='l2', class_weight={0:0.57, 1:3.8}, warm_start=True)
svm_bert_vectorizer = train_bert(train_dataframe_collection, svm_bert_clf)

Training chunk1...
F1 Score : [0.29677419 0.26845638]
Training chunk2...
F1 Score : [0.93650794 0.69230769]
Training chunk3...
F1 Score : [0.98069498 0.88888889]
Training chunk4...
F1 Score : [1. 1.]
Training chunk5...
F1 Score : [0.98867925 0.92307692]
Training chunk6...
F1 Score : [0.99619772 0.97560976]
Training chunk7...
F1 Score : [0.99619772 0.97560976]
Training chunk8...
F1 Score : [1. 1.]
Training chunk9...
F1 Score : [1. 1.]


In [23]:
validate(val_df, svm_bert_clf, svm_bert_vectorizer, vec_type='BERT')


              precision    recall  f1-score   support

Non-Anorexic       0.93      0.96      0.95       132
    Anorexic       0.69      0.55      0.61        20

    accuracy                           0.91       152
   macro avg       0.81      0.76      0.78       152
weighted avg       0.90      0.91      0.90       152



In [24]:
test_model(test_dataframe_collection, test_truth_labels, svm_bert_clf, svm_bert_vectorizer, vec_type='BERT')

Saved predictions to "test_predictions" folder
              precision    recall  f1-score   support

           0       0.93      0.97      0.95       279
           1       0.73      0.46      0.57        41

    accuracy                           0.91       320
   macro avg       0.83      0.72      0.76       320
weighted avg       0.90      0.91      0.90       320

