Author: *Ren Yang*


### Objective 
- In this project, we practice using Python to implement several well-known NLP models
that classify tweets as being about real natural disasters or not. The training
and testing datasets are provided by Kaggle.com. 

### Data and Methodology
- We use the data provided by the Kaggle.com competition "Natural Language Processing with Disaster Tweets": https://www.kaggle.com/c/nlp-getting-started/data. The dataset contains text-based tweets of unequal length about natural disasters. The training set consists of tweets and labels (two classes, 0 and 1, indicating whether or not the tweet is about a real disaster).

- We choose 3 deep learning models for this natural language classification problem:
 1. Single dense layer model; 
 2. Transfer learning: Single dense layer model with pretrained text embedding layer; 
 3. Transfer learning: 1-dimensional CNN model with pretrained text embedding layer. 

For the pretrained encoder, we choose the Universal Sentence Encoder (USE) as our pretrained text embedder. 

We compare the performance between these models in terms of training time, prediction accuracy, etc. 





### Main Program
In our main program, we created 3 classes:
1. Model_servicing class: this class aims to provide a range of "services" for a deep learning model created with the TensorFlow framework. Specifically, it wraps a user-constructed model as a class attribute, and allows easy model compilation, training, prediction, and reporting of evaluation metrics.

2. NLP_Model_servicing class: a subclass of Model_servicing. It inherits from the Model_servicing class and is customized specifically for natural language classification models. (For example, model evaluation in this class produces a set of commonly used confusion-matrix-based metrics for classification problems, and model compilation uses the binary cross-entropy loss function.)

3. Dset class: a class that helps with cleaning and managing the training and testing datasets.  

#### Main() function
- In the Main() function, we construct 3 deep-learning models for disaster tweet classification: a simple, single dense layer model; a single dense layer model with a pretrained text embedding layer; and a 1-dimensional CNN model with a pretrained text embedding layer. Then, we initialize 3 instances of the NLP_Model_servicing class to take care of the compilation, training, and evaluation of the 3 models. 



In [None]:
from pyexpat import model
from sklearn import datasets

import tensorflow_hub as hub
import tensorflow as tf
import random
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras import layers

# Ask TensorFlow to grow GPU memory on demand rather than reserving it all
# up front. Memory growth has to be configured identically on every visible
# GPU, so apply it to each device instead of only the first one (this also
# makes the no-GPU case a harmless no-op instead of an IndexError).
physical_devices = tf.config.list_physical_devices('GPU')
try:
    for gpu_device in physical_devices:
        tf.config.experimental.set_memory_growth(gpu_device, True)
except (ValueError, RuntimeError) as err:
    # ValueError: invalid device; RuntimeError: the runtime was already
    # initialized. Best-effort only, so report and carry on.
    print(f"Could not enable GPU memory growth: {err}")





class Model_servicing:
    """Convenience wrapper around a model built with ``tf.keras``.

    Holds the model together with the number of training epochs and forwards
    training, evaluation, prediction and saving to the wrapped model.
    """

    def __init__(self, model, epoch) -> None:
        """Store the wrapped model and the number of epochs to train for."""
        self.model = model
        self.epoch = epoch

    def train_Model(self, Dset):
        """Fit the wrapped model on a dataset object.

        Args:
            Dset: object exposing ``train_features``, ``train_labels``,
                ``val_features`` and ``val_labels`` attributes.

        Returns:
            Whatever ``self.model.fit`` returns (previously this result was
            discarded, so the training history could not be inspected).
        """
        return self.model.fit(
            Dset.train_features,
            Dset.train_labels,
            epochs=self.epoch,
            validation_data=(Dset.val_features, Dset.val_labels),
        )

    def evaluation(self, test_set, test_lable):
        """Return the result of ``self.model.evaluate`` on the given data."""
        return self.model.evaluate(test_set, test_lable)

    def prediction(self, input_set):
        """Return the result of ``self.model.predict`` for ``input_set``."""
        return self.model.predict(input_set)

    def SaveModel(self, File_Name):
        """Save the wrapped model under ``File_Name``."""
        self.model.save(File_Name)




class NLP_Model_servicing(Model_servicing):
    """Child class of Model_servicing specialised for binary text classification.

    Adds compilation with a binary cross-entropy loss and an evaluation
    routine that reports accuracy, precision, recall and F1.
    """

    def __init__(self, model, epoch=1) -> None:
        """Wrap ``model``.

        Args:
            model: the model to service.
            epoch: number of training epochs. Optional so existing callers
                that pass only the model (and assign ``.epoch`` afterwards)
                keep working; defaults to 1.
        """
        super().__init__(model, epoch)

    @staticmethod
    def NLP_prediction_helper(pred, true):
        """Build the evaluation-metrics dict for a classification task.

        Args:
            pred: predicted class labels.
            true: ground-truth class labels.

        Returns:
            dict with keys ``"accuracy"`` (scaled by 100), ``"precision"``,
            ``"recall"`` and ``"f1"`` (as returned by scikit-learn with
            weighted averaging).
        """
        # accuracy  = (TP + TN) / (TP + FP + FN + TN)
        model_pred_accuracy = accuracy_score(true, pred) * 100

        # precision = TP / (TP + FP)
        # recall    = TP / (TP + FN)
        # f1        = 2 * (recall * precision) / (recall + precision)
        # scikit-learn expects (y_true, y_pred) and returns four values;
        # the fourth (support) is not needed here.
        model_pred_precision, model_recall, model_f1, _ = precision_recall_fscore_support(
            true, pred, average="weighted"
        )

        return {
            "accuracy": model_pred_accuracy,
            "precision": model_pred_precision,
            "recall": model_recall,
            "f1": model_f1,
        }

    def NLP_prediction(self, input_feature, true_set):
        """Predict on ``input_feature`` and score against ``true_set``.

        The model output is rounded and squeezed before being compared with
        ``true_set``.

        Returns:
            The metrics dict produced by ``NLP_prediction_helper``.
        """
        pred = tf.squeeze(tf.round(self.prediction(input_set=input_feature)))
        return self.NLP_prediction_helper(pred, true_set)

    def NLP_Compile(self):
        """Compile the wrapped model with binary cross-entropy, Adam and accuracy."""
        self.model.compile(
            loss="binary_crossentropy",
            optimizer=tf.keras.optimizers.Adam(),
            metrics=["accuracy"],
        )



class Dset:
    """Loads a CSV dataset, shuffles it, and produces train/validation splits."""

    def __init__(self, path, seed) -> None:
        """Read the CSV at ``path`` and keep a copy shuffled with ``seed``."""
        self.seed = seed
        frame = pd.read_csv(path)
        self.dataset = frame
        # frac=1 keeps every row; only the order changes (seeded, so repeatable).
        self.dataset_shuffled = frame.sample(frac=1, random_state=self.seed)

    def Train_Test_split(self, test_size):
        """Split the shuffled "text"/"target" columns into train and validation parts.

        ``test_size`` is forwarded to ``train_test_split`` and controls how
        much of the data goes to the validation side. The results are stored
        on the instance as ``train_features``, ``val_features``,
        ``train_labels`` and ``val_labels``.
        """
        shuffled = self.dataset_shuffled
        texts = shuffled["text"].to_numpy()
        targets = shuffled["target"].to_numpy()

        # The same seed as the shuffle is reused so the split is reproducible.
        parts = train_test_split(
            texts,
            targets,
            test_size=test_size,
            random_state=self.seed,
        )
        self.train_features, self.val_features, self.train_labels, self.val_labels = parts

    

def _train_and_score(model, dataset, epochs, label):
    """Compile, train and score one model; return its validation metrics dict."""
    service = NLP_Model_servicing(model)
    service.epoch = epochs
    service.NLP_Compile()

    service.train_Model(dataset)
    print(f"{label} trained!")

    scores = service.NLP_prediction(dataset.val_features, dataset.val_labels)
    print(f"{label} Completed")
    return scores


def Main():
    """Train and compare three tweet classifiers on 'train.csv'.

    Builds (1) a single dense layer model on a trainable embedding, (2) a
    dense model on the pretrained Universal Sentence Encoder (USE), and
    (3) a 1-D CNN on the USE output, trains each for 3 epochs, and prints
    the validation metrics of all three.
    """
    # ------------------------------ Load Data ------------------------------
    D1 = Dset('train.csv', 22)
    D1.Train_Test_split(test_size=0.1)

    print('Dataset Created! ')

    epochs = 3

    # ----------------- Model 1: Single Dense Layer Model -------------------
    tf.random.set_seed(42)

    max_vacab_len = 50000
    max_out_len = 50

    text_vectorizer = TextVectorization(max_tokens=max_vacab_len,
                                        standardize="lower_and_strip_punctuation",
                                        split="whitespace",
                                        ngrams=None,
                                        output_mode="int",
                                        output_sequence_length=max_out_len)
    text_vectorizer.adapt(D1.train_features)

    # Vocabulary size and sequence length are taken from the vectorizer
    # settings so the embedding always agrees with what it is fed.
    embedding = tf.keras.layers.Embedding(input_dim=max_vacab_len,
                                          output_dim=128,
                                          embeddings_initializer="uniform",
                                          input_length=max_out_len,
                                          name="embedding_1")

    inputs = layers.Input(shape=(1,), dtype="string")
    x = text_vectorizer(inputs)
    x = embedding(x)
    x = layers.GlobalAveragePooling1D()(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model_1 = tf.keras.Model(inputs, outputs, name="model_1")

    M1_result = _train_and_score(model_1, D1, epochs, "Model 1")

    # ------ Model 2: Simple Dense + Pretrained Universal Sentence Encoder ------
    sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                            input_shape=[],      # each input is a single string
                                            dtype=tf.string,
                                            trainable=False,     # use USE as a frozen feature extractor
                                            name="USE")

    model_2 = tf.keras.Sequential([
        sentence_encoder_layer,
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ], name="model_2")

    M2_result = _train_and_score(model_2, D1, epochs, "Model 2")

    # ------ Model 3: 1-D CNN + Pretrained Universal Sentence Encoder ------
    # USE emits one float vector per sentence, not integer token ids, so it
    # must not be routed through an Embedding layer. Conv1D needs a
    # (steps, channels) input, so the vector is reshaped into a
    # single-channel sequence instead.
    use_embedding_dim = 512  # output size of universal-sentence-encoder/4

    model_3 = tf.keras.Sequential([
        sentence_encoder_layer,
        tf.keras.layers.Reshape((use_embedding_dim, 1)),
        tf.keras.layers.Conv1D(filters=32, kernel_size=5, activation="relu"),
        tf.keras.layers.GlobalMaxPool1D(),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ], name="model_3")

    M3_result = _train_and_score(model_3, D1, epochs, "Model 3")

    # ------------------------------ Report ---------------------------------
    print('Model 1 Results:')
    print(M1_result)

    print('Model 2 Results:')
    print(M2_result)

    print('Model 3 Results:')
    print(M3_result)




# Run the experiment only when this file is executed directly (script or
# notebook cell), so that importing it does not start a training run.
if __name__ == "__main__":
    Main()
    print('Ni Hao')


Dataset Created! 
Epoch 1/3
Epoch 2/3
Epoch 3/3
Model 1 trained!
Model 1 Completed
Epoch 1/3
Epoch 2/3
Epoch 3/3
Model 2 trained!
Model 2 Completed
Epoch 1/3
Epoch 2/3
Epoch 3/3
Model 3 trained!
Model 3 Completed
Model 1 Results:
{'accuracy': 83.46456692913385, 'precision': 0.8361314027820705, 'recall': 0.8346456692913385, 'f1': 0.8327252921312324}
Model 2 Results:
{'accuracy': 82.02099737532808, 'precision': 0.8196477317588005, 'recall': 0.8202099737532809, 'f1': 0.8195109002952334}
Model 3 Results:
{'accuracy': 57.3490813648294, 'precision': 0.32889171333898226, 'recall': 0.573490813648294, 'f1': 0.4180408433099324}
Ni Hao


  _warn_prf(average, modifier, msg_start, len(result))
