# Exercise 1



In [None]:
# unzip the file
# !unzip train.zip
# !unzip test.zip

In [None]:
# Import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
plt.style.use('ggplot')
import os, json
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import re
import matplotlib
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.regularizers import l1
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding,RNN, SimpleRNN,Dense,SpatialDropout1D
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, classification_report
stop=set(stopwords.words('english'))
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline


In [None]:
# Function used to create the dataframe
def create_dataframe(dir_name):
  """Load every ``*.json`` file found in ``dir_name`` into a two-column DataFrame.

  Each JSON document is expected to expose an ``abstract`` and a ``categories``
  key. Files that cannot be read or parsed, or that lack one of the keys, are
  reported on stdout and skipped.

  Args:
    dir_name: Directory that contains the JSON files.

  Returns:
    pandas.DataFrame with the columns ``abstract`` and ``categories``. The index
    label of a row is the position of its file in the sorted file listing, so
    skipped files leave gaps in the index.
  """
  # os.listdir() order is arbitrary; sorting makes the resulting frame (and the
  # CSV exported from it) reproducible between runs.
  json_files = sorted(name for name in os.listdir(dir_name) if name.endswith('.json'))

  rows = []
  row_labels = []
  for index, js in enumerate(json_files):
    try:
      with open(os.path.join(dir_name, js), encoding='utf-8') as json_file:
        json_text = json.load(json_file)
      row = [json_text['abstract'], json_text['categories']]
    except Exception as e:
      # Best effort: report the offending file and carry on with the rest.
      print('Exception :', js, str(e))
      continue
    rows.append(row)
    row_labels.append(index)

  # Build the frame once from the collected rows; assigning with .loc inside
  # the loop re-allocates the frame for every file.
  return pd.DataFrame(rows, columns=['abstract', 'categories'], index=row_labels)

In [None]:
## Build the train and test dataframes from their JSON directories
train_df, test_df = create_dataframe("train"), create_dataframe("test")

# Only the last expression of a cell is rendered, so the test frame is shown
test_df


In [None]:
# Persist both splits as CSV files (row index omitted)
for frame, csv_name in ((train_df, "train.csv"), (test_df, "test.csv")):
    frame.to_csv(csv_name, index=False)


# Exercise 2

In [None]:
# Re-read both splits from the exported CSV files, keeping the two columns used below
train_df, test_df = (
    pd.read_csv(csv_name, usecols=['abstract', 'categories'])
    for csv_name in ("train.csv", "test.csv")
)
train_df.head()


In [None]:
# info
train_df.info()

In [None]:
# Count the missing values per column in the training set
train_df.isnull().sum()

In [None]:
# Count the missing values per column in the test set
test_df.isnull().sum()

In [None]:
# Remove, in place, every row that has a missing value in any column
for frame in (train_df, test_df):
    frame.dropna(axis=0, how='any', inplace=True)


In [None]:
train_df['categories'].value_counts()

In [None]:
# Class distribution of the training set (number of abstracts per category)
ax = train_df['categories'].value_counts().plot(kind='bar', figsize =(12,6))
# Title and axis labels so the figure can be read on its own
ax.set(title='Training set: abstracts per category', xlabel='Category', ylabel='Number of abstracts');


In [None]:
# Class distribution of the test set (number of abstracts per category)
ax = test_df['categories'].value_counts().plot(kind='bar', figsize =(12,6))
# Title and axis labels so the figure can be read on its own
ax.set(title='Test set: abstracts per category', xlabel='Category', ylabel='Number of abstracts');

In [None]:
# Expose the cleaned frames under the names used by the modelling cells
data = train_df
train_data, test_data = train_df, test_df
# Distinct category labels present in the training set
text_categories = data.categories.unique()

# Report the number of classes and the size of each split
for template, count in (
    ("We have {} unique classes", len(text_categories)),
    ("We have {} training samples", len(train_data.abstract)),
    ("We have {} test samples", len(test_data.abstract)),
):
    print(template.format(count))

What are the top 3 insights generated while doing the data analysis on train set (df_train)?

1. We have 26 unique classes

2. We have 43916 training samples

3. We have 10862 test samples

**What are the top 3 data challenges you observe on train set?**
1. Requirement of a large amount of training data

2. Requirement of powerful computing resources to perform analysis and advanced analytics

3. More training samples are required for the minority categories, such as Skin and Connective Tissue Diseases, Disorders of Environmental Origin, Immune System Diseases and Musculoskeletal Diseases.


# Exercise 3

a. Pick your favourite machine learning algorithm to train a multi-class text classifier using the train set (df_train). The classifier should be able to consider the abstract as input and predict any one of the 26 disease categories 

**Naive Bayes**

In [None]:
# TF-IDF features feeding a multinomial Naive Bayes classifier
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
# Fit on the training abstracts, then label the test abstracts
predicted_categories = model.fit(train_data.abstract, train_data.categories).predict(test_data.abstract)

# Per-class precision / recall / F1 on the test set
nb_report = classification_report(test_data.categories, predicted_categories)
print(nb_report)


**b. Report key metrics on your test set (df_test) and explain your observations**

The Naïve Bayes model reaches 48% accuracy on the test set. Some categories, such as Male Urogenital Diseases, score well (precision 0.86, recall 0.63), whereas others, such as 'Pathological Conditions and Signs and Symptoms', 'Respiratory Tract Diseases' and 'Skin and Connective Tissue Diseases', have a precision of 0.00 and a recall of 0.00 because the dataset is skewed.

**c. Explain the rationale behind choosing the algorithm in 3.a**

Naive Bayes classifiers are a collection of classification algorithms based on Bayes’ Theorem.

Naive Bayes assumes that, given the class, each feature/variable makes an independent and equal contribution to the outcome.

Bayes’ Theorem finds the probability of an event occurring given the probability of another event that has already occurred.

# Exercise 4

**a. Build a text classifier which classifies the abstracts into one of the 26 disease categories using any RNN based architecture and report key metrics on test set. Explain your observations**

In [None]:
##############    Recurrent Neural Network     #######################

class RNN():
  """
  SimpleRNN text classifier on top of frozen pre-trained word embeddings.

  Args:
  embedding_dict : dict mapping a word to its embedding vector (e.g. GloVe)
  train_df : Train Dataframe; needs `text_col` and a `categories` column that
             holds integer class ids
  test_df : Test Dataframe with the same columns
  text_col : Text column name

  Returns (from run_all):
    model : the trained Keras model

  NOTE: this class name shadows the `RNN` layer imported from keras.layers at
  the top of the notebook.
  """
  def __init__(self, embedding_dict, train_df, test_df, text_col):
    """ Inits the Preprocessing """
    self.embedding_dict = embedding_dict
    self.df = train_df
    self.test_df = test_df
    self.text_col = text_col
    # Resolve the stop-word set once; rebuilding it for every token dominates
    # the preprocessing time.
    self._stop_words = stop

  # clean text
  def clean_text(self, text):
    """Lower-case `text`, turn every non-letter into a space and drop stop words.

    Tokens are split on single spaces and re-joined with single spaces, so a
    run of removed characters shows up as repeated spaces in the result.
    """
    text = re.sub('[^a-zA-Z]', ' ', text).lower()
    return ' '.join(w for w in text.split(' ') if w not in self._stop_words)

  # create the corpus GloVe
  def create_corpus(self, df, col_name):
    """Tokenise df[col_name] into lists of lower-cased, alphabetic, non-stop words."""
    corpus = []
    for abstract in tqdm(df[col_name]):
      # Compare the lower-cased token with the stop list so that capitalised
      # stop words ("The") are removed as well.
      words = [word.lower() for word in word_tokenize(abstract)
               if word.isalpha() and word.lower() not in self._stop_words]
      corpus.append(words)
    return corpus

  def _embedding_dim(self):
    """Width of the vectors in embedding_dict (50 when the dict is empty)."""
    for vector in self.embedding_dict.values():
      return len(vector)
    return 50

  def _build_embedding_matrix(self, word_index):
    """Return a (len(word_index) + 1, dim) matrix of pre-trained vectors.

    Row i holds the vector of the word whose tokenizer index is i. Words
    without a pre-trained vector, and row 0, stay all-zero.
    """
    embedding_matrix = np.zeros((len(word_index) + 1, self._embedding_dim()))
    for word, i in word_index.items():
      emb_vec = self.embedding_dict.get(word)
      if emb_vec is not None:
        embedding_matrix[i] = emb_vec
    return embedding_matrix

  def _pad(self, tokenizer_obj, corpus, max_len):
    """Encode a tokenised corpus with tokenizer_obj and pad/truncate to max_len."""
    sequences = tokenizer_obj.texts_to_sequences(corpus)
    return pad_sequences(sequences, maxlen=max_len, truncating='post', padding='post')

  def _build_model(self, embedding_matrix, max_len, num_classes):
    """Assemble and compile the embedding + stacked SimpleRNN classifier."""
    num_words, emb_dim = embedding_matrix.shape

    model = Sequential()
    model.add(Embedding(num_words, emb_dim, embeddings_initializer=Constant(embedding_matrix),
                        input_length=max_len, trainable=False))
    model.add(SpatialDropout1D(0.2))
    model.add(SimpleRNN(64, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
    model.add(SimpleRNN(32, return_sequences=True))
    model.add(SimpleRNN(16))
    model.add(Dense(16, activation='relu'))
    # One output unit per class: a single softmax unit always emits 1.0 and
    # cannot learn anything.
    model.add(Dense(num_classes, activation='softmax'))

    # The labels are integer class ids, hence the sparse variant of the loss.
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=Adam(learning_rate=0.0001), metrics=['acc'])
    return model

  def run_all(self, max_len=10, epochs=10, batch_size=32):
    """Clean the text, train the network and report metrics on the test set.

    Adds a `clean_abstract` column to both dataframes, trains on an 80/20
    train/validation split of the training frame, prints the classification
    report for the test frame and stores the Keras training history in
    `self.history`.

    Args:
      max_len: number of tokens kept per abstract (longer texts are truncated,
               shorter ones zero-padded at the end)
      epochs: number of training epochs
      batch_size: training batch size

    Returns:
      The trained Keras model.
    """
    self.df['clean_abstract'] = self.df[self.text_col].apply(self.clean_text)
    self.test_df['clean_abstract'] = self.test_df[self.text_col].apply(self.clean_text)

    # Fit the vocabulary on the training corpus only.
    train_corpus = self.create_corpus(self.df, 'clean_abstract')
    tokenizer_obj = Tokenizer()
    tokenizer_obj.fit_on_texts(train_corpus)
    abstract_pad = self._pad(tokenizer_obj, train_corpus, max_len)

    # Embedding
    embedding_matrix = self._build_embedding_matrix(tokenizer_obj.word_index)

    y_all = np.asarray(self.df.categories, dtype='int32')
    y_test = np.asarray(self.test_df.categories, dtype='int32')
    # Size the output layer from the largest class id seen in either split, so
    # a sub-sampled training frame does not shrink the label space.
    num_classes = int(np.concatenate([y_all, y_test]).max()) + 1

    # Dataset split
    X_train, X_val, y_train, y_val = train_test_split(abstract_pad, y_all, test_size=.2, random_state=2)

    # Create Model
    model = self._build_model(embedding_matrix, max_len, num_classes)
    model.summary()

    # Fitting The Model
    self.history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
                             validation_data=(X_val, y_val), verbose=1)

    # Encode the test set with the SAME tokenizer: a tokenizer fitted on the
    # test corpus would assign different ids than the ones the embedding layer
    # was trained with.
    test_corpus = self.create_corpus(self.test_df, 'clean_abstract')
    abstract_pad_test = self._pad(tokenizer_obj, test_corpus, max_len)

    # predict() returns one probability per class; the report needs class ids.
    y_predicted = np.argmax(model.predict(abstract_pad_test), axis=1)

    print(classification_report(y_test, y_predicted))

    return model



In [None]:
train_df = pd.read_csv("train.csv", usecols= ['abstract', 'categories'])
test_df = pd.read_csv("test.csv", usecols= ['abstract', 'categories'])

# Drop every row that has a missing value in any column
train_df = train_df.dropna(axis = 0, how = 'any')
test_df = test_df.dropna(axis = 0, how = 'any')

# Category name -> integer class id, shared by the train and the test split
CATEGORY_TO_ID = {
    'Digestive System Diseases':0,
    'Bacterial Infections and Mycoses':1,
    'Wounds and Injuries':2,
    'Animal Diseases':3,
    'Male Urogenital Diseases':4,
    'Endocrine System Diseases':5,
    'Congenital Hereditary and Neonatal Diseases and Abnormalities':6,
    'Hemic and Lymphatic Diseases':7,
    'Neoplasms':8,
    'Chemically-Induced Disorders':9,
    "Stomatognathic Diseases":10,
    "Respiratory Tract Diseases": 11,
    "Parasitic Diseases": 12,
    "Eye Diseases": 13,
    "Pathological Conditions and Signs and Symptoms": 14,
    "Otorhinolaryngologic Diseases": 15,
    "Nutritional and Metabolic Diseases": 16,
    "Cardiovascular Diseases": 17,
    "Female Urogenital Diseases and Pregnancy Complications": 18,
    "Nervous System Diseases": 19,
    "Virus Diseases": 20,
    "Occupational Diseases": 21,
    "Musculoskeletal Diseases": 22,
    "Immune System Diseases": 23,
    "Skin and Connective Tissue Diseases": 24,
    "Disorders of Environmental Origin": 25,
}

# Create the label column. astype() pins an integer dtype and fails loudly if
# a category name is missing from the mapping.
train_df['categories'] = train_df['categories'].replace(CATEGORY_TO_ID).astype('int64')
test_df['categories'] = test_df['categories'].replace(CATEGORY_TO_ID).astype('int64')

# Load the GloVe vectors: one word per line followed by its vector components.
# The encoding is given explicitly so the read does not depend on the
# platform's default encoding.
embedding_dict={}
with open('glove.6B.50d.txt','r', encoding='utf-8') as f:
    for line in f:
        values=line.split()
        embedding_dict[values[0]]=np.asarray(values[1:], dtype='float32')

# Train on the first 100 rows only. copy() gives RNN.run_all an independent
# frame to add its `clean_abstract` column to, instead of a slice of train_df.
train_df = train_df.head(100).copy()

RNNExe = RNN(embedding_dict, train_df, test_df, 'abstract')
model = RNNExe.run_all()

**Observations**

The classification report above shows the RNN model's performance. Training was quite challenging because powerful computing resources for advanced analytics were unavailable, so the smaller GloVe embeddings (6B tokens, 400K vocab, uncased, 50d vectors) were selected.

The Common Crawl embeddings (840B tokens, 2.2M vocab, cased, 300d vectors, 2.03 GB download: glove.840B.300d.zip) should help to increase the accuracy, precision and recall.


**b. Build a text classifier which classifies the abstracts into one of the 26 disease categories using any Transformer architecture and report key metrics on test set. Explain your observations**

**BERT** - Bidirectional Encoder Representations from Transformers
The main breakthrough of the BERT model is that it scans text in a bidirectional way like left-to-right and right-to-left sequences when looking at texts during training.
There are two general types of BERT: BERT (base) and BERT (large). The difference is in configurable parameters: base-110 million parameter, large-345 million.


Note: the BERT Base model was selected because of the limited computing resources available (Colab).

In [None]:
### Install the required library
# ! pip install tensorflow_text

In [None]:
# import BERT model
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.metrics import confusion_matrix, classification_report
from matplotlib import pyplot as plt
import seaborn as sn

# Load the BERT preprocessing and encoder layers from TF-Hub
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

# Load ALBERT - A Lite BERT model (fetched over HTTPS, like the other handles)
albert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/albert_en_preprocess/3")
albert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/albert_en_base/3")

# Load the RoBERTa-family model. NOTE(review): the handle names say
# `xlm_roberta_multi_cased`, i.e. XLM-RoBERTa rather than the original
# English RoBERTa - confirm this is the intended checkpoint.
roberta_preprocess = hub.KerasLayer("https://tfhub.dev/jeongukjae/xlm_roberta_multi_cased_preprocess/1")
roberta_encoder = hub.KerasLayer("https://tfhub.dev/jeongukjae/xlm_roberta_multi_cased_L-12_H-768_A-12/1")


In [None]:
# Reload both splits from the CSV exports, keeping the two columns used below
train_df, test_df = (
    pd.read_csv(csv_name, usecols=['abstract', 'categories'])
    for csv_name in ("train.csv", "test.csv")
)

# Remove, in place, every row that has a missing value in any column
for frame in (train_df, test_df):
    frame.dropna(axis=0, how='any', inplace=True)

In [None]:
# Category name -> integer class id, shared by the train and the test split
CATEGORY_TO_ID = {
    'Digestive System Diseases':0,
    'Bacterial Infections and Mycoses':1,
    'Wounds and Injuries':2,
    'Animal Diseases':3,
    'Male Urogenital Diseases':4,
    'Endocrine System Diseases':5,
    'Congenital Hereditary and Neonatal Diseases and Abnormalities':6,
    'Hemic and Lymphatic Diseases':7,
    'Neoplasms':8,
    'Chemically-Induced Disorders':9,
    "Stomatognathic Diseases":10,
    "Respiratory Tract Diseases": 11,
    "Parasitic Diseases": 12,
    "Eye Diseases": 13,
    "Pathological Conditions and Signs and Symptoms": 14,
    "Otorhinolaryngologic Diseases": 15,
    "Nutritional and Metabolic Diseases": 16,
    "Cardiovascular Diseases": 17,
    "Female Urogenital Diseases and Pregnancy Complications": 18,
    "Nervous System Diseases": 19,
    "Virus Diseases": 20,
    "Occupational Diseases": 21,
    "Musculoskeletal Diseases": 22,
    "Immune System Diseases": 23,
    "Skin and Connective Tissue Diseases": 24,
    "Disorders of Environmental Origin": 25,
}

# Create the label column. replace() leaves already-encoded ids untouched, so
# the cell can be re-run; astype() pins an integer dtype and fails loudly if a
# category name is missing from the mapping.
train_df['categories'] = train_df['categories'].replace(CATEGORY_TO_ID).astype('int64')
test_df['categories'] = test_df['categories'].replace(CATEGORY_TO_ID).astype('int64')

In [None]:
# Split it into training and test data set
# X_train, X_test, y_train, y_test = train_test_split(df_balanced[text_column], df_balanced[label_column], stratify=df_balanced[label_column]) # pylint: disable=invalid-name, disable=line-too-long

# Inputs are the abstracts, targets the `categories` column of each split
X_train, y_train = train_df['abstract'], train_df['categories']
X_test, y_test = test_df['abstract'], test_df['categories']

def create_model_func(X_train, y_train, X_test, y_test, bert_preprocess, bert_encoder, epochs, num_classes=26):
  """Train a classification head on top of a TF-Hub text encoder and evaluate it.

  The raw text goes through `bert_preprocess` and `bert_encoder`; the encoder's
  `pooled_output` feeds two dense/dropout stages and a softmax output layer.
  The model summary and the classification report for the test set are printed.

  Args:
    X_train: training texts
    y_train: training labels as integer class ids in [0, num_classes)
    X_test: test texts
    y_test: test labels as integer class ids
    bert_preprocess: preprocessing layer matching `bert_encoder`
    bert_encoder: encoder layer whose output dict exposes `pooled_output`
    epochs: number of training epochs
    num_classes: number of target classes (size of the softmax output layer)

  Returns:
    The trained tf.keras.Model, so it can be reused for inference.
  """
  # Bert layers
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessed_text = bert_preprocess(text_input)
  outputs = bert_encoder(preprocessed_text)

  # Neural network layers
  lay = tf.keras.layers.Dense(64, activation='relu', name="dense1")(outputs['pooled_output'])
  lay = tf.keras.layers.Dropout(0.2, name="dropout1")(lay)
  lay = tf.keras.layers.Dense(32, activation='relu', name="dense2")(lay)
  lay = tf.keras.layers.Dropout(0.2, name="dropout")(lay)
  # One unit per class: a single softmax unit always outputs 1.0, so the model
  # could never separate the classes.
  lay = tf.keras.layers.Dense(num_classes, activation='softmax', name="output2")(lay)

  # Use inputs and outputs to construct a final model
  model = tf.keras.Model(inputs=[text_input], outputs=[lay])

  # print summary
  model.summary()
  # The labels are integer class ids, hence the sparse variant of the loss.
  model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
  # train the model
  model.fit(X_train, y_train, epochs=epochs)

  # predict() returns one probability per class; the report needs class ids.
  y_predicted = np.argmax(model.predict(X_test), axis=1)

  print(classification_report(y_test, y_predicted))

  return model
  


In [None]:
## BERT Base model
create_model_func(X_train, y_train, X_test, y_test, bert_preprocess, bert_encoder, 3)


**Observations**
The accuracy of the BERT model is 0.0952. More data is needed to train the BERT model.


**c. You are free to experiment various Transformer architectures for 4.a and 4.b, however, only report the model which you consider the best. What is your rationale for this model selection?**

**ALBERT - A Lite BERT**
ALBERT is developed based on the BERT model. Its major breakthrough is that it brings a significant parameter reduction but maintains the same level of performance compared to BERT.
In ALBERT, parameters are shared across the 12 layers of transformer encoders, while in the original BERT each encoder layer has a unique set of parameters.


**RoBERTa - Robustly Optimized BERT Pretraining Approach**
RoBERTa makes changes to the architecture and training procedures of BERT. Specifically, RoBERTa removes the Next Sentence Prediction (NSP) objective, uses a much larger dataset than BERT, and replaces static masking with dynamic masking.



In [None]:
# Train and evaluate the ALBERT (A Lite BERT) model for 3 epochs
create_model_func(X_train, y_train, X_test, y_test, albert_preprocess, albert_encoder, 3)


In [None]:
# Train and evaluate the RoBERTa (Robustly Optimized BERT Pretraining Approach) model for 3 epochs
create_model_func(X_train, y_train, X_test, y_test, roberta_preprocess, roberta_encoder, 3)


d. Consider the test table/dataframe (df_test). Run an inference through the best model determined in 4.c.