# ML Pipeline Preparation

### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables `X` and `y`

In [12]:
# download necessary NLTK data
import nltk
# 'averaged_perceptron_tagger' is needed by nltk.pos_tag, which
# StartingVerbExtractor calls; without it a fresh environment raises LookupError
nltk.download(['punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'])
import pickle
import string
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression



[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read every row of the DisasterResponse table from the local SQLite file
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql(sql='SELECT * FROM DisasterResponse', con=engine)

# Feature: the raw message text; targets: all columns from position 4 onward
X = df.loc[:, 'message']
y = df.iloc[:, 4:]

In [3]:
# Display the loaded table as the cell output
df

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26210,30261,The training demonstrated how to enhance micro...,,news,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26211,30262,A suitable candidate has been selected and OCH...,,news,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26212,30263,"Proshika, operating in Cox's Bazar municipalit...",,news,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26213,30264,"Some 2,000 women protesting against the conduc...",,news,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### 2. Write a tokenization function to process text data

In [4]:
# Show the first five raw messages as a plain Python list
list(df['message'][:5])

['Weather update - a cold front from Cuba that could pass over Haiti',
 'Is the Hurricane over or is it not over',
 'Looking for someone but no name',
 'UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.',
 'says: west side of Haiti, rest of the country today and tonight']

In [5]:
def tokenize(text):
    """Normalize, tokenize and lemmatize a message.

    The text is lower-cased and split with NLTK's ``word_tokenize``. Tokens
    that are English stopwords or that occur in ``string.punctuation`` are
    dropped. Each remaining token is lemmatized as a verb and stripped of
    leading/trailing whitespace.

    Args:
        text (str): raw message text.

    Returns:
        list of str: cleaned tokens, in their original order.
    """
    # normalize case and tokenize text
    tokens = word_tokenize(text.lower())

    # Build the stopword set once per call; looking the list up inside the
    # filter would re-read it for every single token
    stop_words = set(stopwords.words("english"))

    # initiate lemmatizer
    lemmatizer = WordNetLemmatizer()

    # drop stopwords/punctuation, then lemmatize (as verbs) and strip whitespace
    return [
        lemmatizer.lemmatize(tok, pos='v').strip()
        for tok in tokens
        if tok not in stop_words and tok not in string.punctuation
    ]

In [6]:
# Sanity-check the tokenizer: print the tokens of the first five messages
for message in X[:5]:
    tokens = tokenize(message)
    print(tokens,'\n')

['weather', 'update', 'cold', 'front', 'cuba', 'could', 'pass', 'haiti'] 

['hurricane'] 

['look', 'someone', 'name'] 

['un', 'report', 'leogane', '80-90', 'destroy', 'hospital', 'st.', 'croix', 'function', 'need', 'supply', 'desperately'] 

['say', 'west', 'side', 'haiti', 'rest', 'country', 'today', 'tonight'] 



### 3. Build a machine learning pipeline
This machine learning pipeline takes the `message` column as input and outputs classification results for the other 35 category columns in the dataset.

In [7]:
def ML_pipeline_1(clf=None):
    """Build a TF-IDF + multi-output classification pipeline.

    Args:
        clf: estimator to wrap in ``MultiOutputClassifier``. When omitted, a
            new ``RandomForestClassifier()`` is created for this pipeline.

    Returns:
        Pipeline: unfitted pipeline with the steps ``'tfidf'`` and ``'clf'``.
    """
    # Create the default here rather than in the signature: a default written
    # in the signature is evaluated once, so every default call would share
    # (and could mutate via set_params) the same estimator instance.
    if clf is None:
        clf = RandomForestClassifier()
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(tokenizer=tokenize)),
        ('clf', MultiOutputClassifier(clf))
        ])
    return pipeline


### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [8]:
# perform train test split
# 20% of the rows are held out for testing; random_state is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=123)

In [9]:
# (rows, label columns) of the training targets
y_train.shape

(20972, 35)

In [10]:
# (rows, label columns) of the held-out targets
y_test.shape

(5243, 35)

In [11]:
# train classifier
# ML_pipeline_1 is called without arguments, so its default `clf` is used
model_RF1 = ML_pipeline_1()
model_RF1.fit(X_train, y_train)




### 5. Test the model
Report the f1 score, precision and recall for each output category of the dataset.

In [52]:
# predict on test data
# one predicted label per target column for every held-out message
y_pred_RF1=model_RF1.predict(X_test)


In [53]:
# Per-category precision / recall / f1 for the random-forest pipeline.
# NOTE(review): the saved output of this cell lists support counts (e.g. related: 5026)
# that differ from the later reports computed on the same y_test (related: 4017), and its
# execution count is out of sequence — the output looks stale; re-run after
# Restart & Run All to confirm.
print(classification_report(y_test.values, y_pred_RF1, target_names=y.columns.values))

                        precision    recall  f1-score   support

               related       0.84      0.95      0.89      5026
               request       0.85      0.50      0.63      1088
                 offer       0.00      0.00      0.00        26
           aid_related       0.78      0.69      0.73      2720
          medical_help       0.60      0.07      0.12       523
      medical_products       0.79      0.07      0.13       328
     search_and_rescue       0.78      0.09      0.16       159
              security       0.33      0.01      0.02       117
              military       0.84      0.07      0.13       220
                 water       0.91      0.31      0.46       417
                  food       0.90      0.52      0.66       731
               shelter       0.83      0.36      0.50       574
              clothing       0.75      0.09      0.16        99
                 money       0.80      0.06      0.10       144
        missing_people       1.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 6. Improve the model: Try other machine learning algorithms
We want to minimize false negatives, i.e., maximize recall = TP / (TP + FN).

In [13]:
# train XGBoost()
# same TF-IDF pipeline as before, with an XGBClassifier as the per-label estimator
model_XGB1 = ML_pipeline_1(clf =  XGBClassifier())
model_XGB1.fit(X_train, y_train)



In [14]:
# predict on test data
y_pred_XGB1 = model_XGB1.predict(X_test)
# per-category precision / recall / f1, labelled with the target column names
print(classification_report(y_test.values, y_pred_XGB1, target_names=y.columns.values))

                        precision    recall  f1-score   support

               related       0.83      0.96      0.89      4017
               request       0.82      0.56      0.66       896
                 offer       0.50      0.05      0.09        20
           aid_related       0.77      0.65      0.70      2160
          medical_help       0.56      0.25      0.35       391
      medical_products       0.65      0.32      0.43       258
     search_and_rescue       0.76      0.29      0.42       145
              security       0.70      0.07      0.13        97
              military       0.58      0.36      0.45       187
                 water       0.78      0.67      0.72       345
                  food       0.81      0.74      0.77       587
               shelter       0.75      0.59      0.66       458
              clothing       0.84      0.50      0.63        82
                 money       0.65      0.25      0.36        96
        missing_people       0.43      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# train Logistic regression Classifier
# same TF-IDF pipeline, with a default LogisticRegression as the per-label estimator
model_LR1 = ML_pipeline_1(clf = LogisticRegression())
model_LR1.fit(X_train, y_train)



In [16]:
# predict on test data
y_pred_LR1 = model_LR1.predict(X_test)
# per-category precision / recall / f1, labelled with the target column names
print(classification_report(y_test.values, y_pred_LR1, target_names=y.columns.values))

                        precision    recall  f1-score   support

               related       0.83      0.95      0.89      4017
               request       0.82      0.55      0.66       896
                 offer       0.00      0.00      0.00        20
           aid_related       0.75      0.68      0.71      2160
          medical_help       0.62      0.14      0.23       391
      medical_products       0.71      0.16      0.26       258
     search_and_rescue       0.94      0.10      0.19       145
              security       0.00      0.00      0.00        97
              military       0.68      0.14      0.23       187
                 water       0.78      0.51      0.62       345
                  food       0.88      0.59      0.70       587
               shelter       0.80      0.45      0.57       458
              clothing       0.86      0.29      0.44        82
                 money       0.64      0.09      0.16        96
        missing_people       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 7. Improve XGBoost model: Add other features besides the TF-IDF

In [19]:
# Custom transformer: counts the whitespace-separated words in each message
class WordCountTransformer(BaseEstimator, TransformerMixin):
    """Feature extractor that produces one word count per input message."""

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a DataFrame built from ``len(message.split())`` for each message in ``X``."""
        counts = [len(message.split()) for message in X]
        return pd.DataFrame(counts)
    

In [20]:
# This custom transformer returns True if any sentence of a message starts with a verb
class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    """Boolean feature: does any sentence of the message start with a verb or 'RT'?"""

    def starting_verb(self, text):
        """Check whether any sentence in ``text`` begins with a verb.

        Args:
            text (str): a single message.

        Returns:
            bool: True if the first token of any sentence is tagged 'VB' or
            'VBP', or is the literal word 'RT'; False otherwise.
        """
        for sentence in nltk.sent_tokenize(text):
            pos_tags = nltk.pos_tag(word_tokenize(sentence))
            # A sentence may yield no tokens at all (e.g. whitespace only);
            # skip it instead of raising IndexError on pos_tags[0]
            if not pos_tags:
                continue
            first_word, first_tag = pos_tags[0]
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                return True
        return False

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Apply ``starting_verb`` to every message and return the flags as a DataFrame."""
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

In [21]:
# Add custom transformers to the pipeline
def ML_pipeline_2(clf=None):
    """Build a pipeline that combines TF-IDF features with a word-count feature.

    Args:
        clf: estimator to wrap in ``MultiOutputClassifier``. When omitted, a
            new ``XGBClassifier()`` is created for this pipeline.

    Returns:
        Pipeline: unfitted pipeline with the steps ``'features'`` (a
        ``FeatureUnion`` of ``'tfidf'`` and ``'wordCount'``) and ``'clf'``.
    """
    # Create the default here rather than in the signature: a default written
    # in the signature is evaluated once, so every default call would share
    # the same estimator instance.
    if clf is None:
        clf = XGBClassifier()
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('tfidf', TfidfVectorizer(tokenizer=tokenize)),
            ('wordCount', WordCountTransformer())
        ])),
        ('clf', MultiOutputClassifier(clf))
        ])
    return pipeline


def ML_pipeline_3(clf=None):
    """Build a pipeline that combines TF-IDF features with a starting-verb flag.

    Args:
        clf: estimator to wrap in ``MultiOutputClassifier``. When omitted, a
            new ``XGBClassifier()`` is created for this pipeline.

    Returns:
        Pipeline: unfitted pipeline with the steps ``'features'`` (a
        ``FeatureUnion`` of ``'tfidf'`` and ``'verb'``) and ``'clf'``.
    """
    # Create the default here rather than in the signature: a default written
    # in the signature is evaluated once, so every default call would share
    # the same estimator instance.
    if clf is None:
        clf = XGBClassifier()
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('tfidf', TfidfVectorizer(tokenizer=tokenize)),
            ('verb', StartingVerbExtractor())
        ])),
        ('clf', MultiOutputClassifier(clf))
        ])
    return pipeline

In [22]:
# train XGBoost() using ML_pipeline_2 
# (TF-IDF features plus the per-message word count)
model_XGB2 = ML_pipeline_2()
model_XGB2.fit(X_train, y_train)



In [23]:
# predict on test data
y_pred_XGB2 = model_XGB2.predict(X_test)
# per-category precision / recall / f1, labelled with the target column names
print(classification_report(y_test.values, y_pred_XGB2, target_names=y.columns.values))

                        precision    recall  f1-score   support

               related       0.83      0.95      0.89      4017
               request       0.81      0.56      0.66       896
                 offer       0.00      0.00      0.00        20
           aid_related       0.76      0.65      0.70      2160
          medical_help       0.56      0.25      0.34       391
      medical_products       0.64      0.31      0.42       258
     search_and_rescue       0.71      0.27      0.39       145
              security       0.50      0.04      0.08        97
              military       0.60      0.34      0.43       187
                 water       0.75      0.67      0.71       345
                  food       0.82      0.74      0.78       587
               shelter       0.78      0.60      0.68       458
              clothing       0.80      0.50      0.62        82
                 money       0.61      0.24      0.34        96
        missing_people       0.50      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
# train XGBoost() using ML_pipeline_3
# (TF-IDF features plus the starting-verb flag)
model_XGB3 = ML_pipeline_3()
model_XGB3.fit(X_train, y_train)



In [25]:
# predict on test data
y_pred_XGB3 = model_XGB3.predict(X_test)
# per-category precision / recall / f1, labelled with the target column names
print(classification_report(y_test.values, y_pred_XGB3, target_names=y.columns.values))

                        precision    recall  f1-score   support

               related       0.83      0.95      0.89      4017
               request       0.82      0.56      0.66       896
                 offer       0.50      0.05      0.09        20
           aid_related       0.77      0.65      0.70      2160
          medical_help       0.56      0.25      0.35       391
      medical_products       0.65      0.32      0.43       258
     search_and_rescue       0.75      0.28      0.41       145
              security       0.70      0.07      0.13        97
              military       0.60      0.38      0.47       187
                 water       0.78      0.67      0.72       345
                  food       0.81      0.73      0.77       587
               shelter       0.78      0.59      0.67       458
              clothing       0.81      0.48      0.60        82
                 money       0.65      0.25      0.36        96
        missing_people       0.43      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 8. Improve the model: Use grid search to find better parameters

In [26]:
# List the pipeline's parameters to find the names usable in the grid-search keys
model_XGB1.get_params()

{'memory': None,
 'steps': [('tfidf',
   TfidfVectorizer(tokenizer=<function tokenize at 0x000002284CB6F700>)),
  ('clf',
   MultiOutputClassifier(estimator=XGBClassifier(base_score=None, booster=None,
                                                 callbacks=None,
                                                 colsample_bylevel=None,
                                                 colsample_bynode=None,
                                                 colsample_bytree=None,
                                                 early_stopping_rounds=None,
                                                 enable_categorical=False,
                                                 eval_metric=None,
                                                 feature_types=None, gamma=None,
                                                 gpu_id=None, grow_policy=None,
                                                 importance_type=None,
                                                 interaction_cons

In [27]:
# Set parameters for grid search
# Keys use scikit-learn's `step__param` naming: they address the estimator
# wrapped by the MultiOutputClassifier in the pipeline's 'clf' step.
parameters = {
    'clf__estimator__n_estimators': [50, 100],
    'clf__estimator__learning_rate': [0.1, 0.5],
}

# 3-fold cross-validated search over the 2 x 2 grid.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the pipeline's
# default `score`; section 6 states the goal is recall — confirm this is the
# intended selection metric.
model_XGB_CV = GridSearchCV(model_XGB1, param_grid=parameters, cv=3)
model_XGB_CV.fit(X_train, y_train)




In [28]:
# Parameter combination selected by the grid search
model_XGB_CV.best_params_

{'clf__estimator__learning_rate': 0.5, 'clf__estimator__n_estimators': 100}

In [29]:
# predict on test data with the fitted grid-search object
y_pred_XGB_CV = model_XGB_CV.predict(X_test)
# per-category precision / recall / f1, labelled with the target column names
print(classification_report(y_test.values, y_pred_XGB_CV, target_names=y.columns.values))

                        precision    recall  f1-score   support

               related       0.84      0.93      0.89      4017
               request       0.78      0.57      0.66       896
                 offer       0.25      0.05      0.08        20
           aid_related       0.75      0.67      0.71      2160
          medical_help       0.53      0.27      0.36       391
      medical_products       0.64      0.30      0.41       258
     search_and_rescue       0.69      0.28      0.39       145
              security       0.42      0.05      0.09        97
              military       0.56      0.34      0.42       187
                 water       0.77      0.66      0.71       345
                  food       0.81      0.72      0.76       587
               shelter       0.75      0.61      0.67       458
              clothing       0.81      0.48      0.60        82
                 money       0.57      0.27      0.37        96
        missing_people       0.44      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 9. Export the selected model as a pickle file

In [30]:
# Serialize the fitted grid-search object (the whole GridSearchCV, not only its
# best estimator) to classifier.pkl.
# NOTE(review): the pipeline's TfidfVectorizer holds the notebook-defined `tokenize`
# function, and pickle stores functions by reference — presumably `tokenize` must be
# defined/importable wherever this file is loaded; verify in the consuming code.
with open('classifier.pkl', 'wb') as f:
    pickle.dump(model_XGB_CV, f)
