# ML Pipeline Preparation

### Import libraries

In [1]:
# general libraries
import os
from pathlib import Path
import re
from time import time
import pickle
import joblib

# linear algebra and numerical libraries
import numpy as np
import pandas as pd

# database manipulation package
from sqlalchemy import create_engine 

# nlp packages
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

# data processing packages
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer
)

from sklearn.base import (
    BaseEstimator, 
    TransformerMixin,
)

# machine learning packages
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier
)

from sklearn.multioutput import MultiOutputClassifier
from skmultilearn.problem_transform import ClassifierChain


from sklearn.model_selection import (
    train_test_split,
    RandomizedSearchCV,
    GridSearchCV
)

# metrics evaluation packages
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    classification_report,
    precision_score,
    recall_score,
    f1_score,
    make_scorer
)

# to estimate class weights for unbalanced classes
from sklearn.utils.class_weight import compute_sample_weight

### Load the dataset and separate features and labels

In [2]:
# create a variable for the project's directory
path = Path('../').resolve()

In [3]:
# path for the database
disaster_db = path / 'data/DisasterResponse.db'
# create engine to retrieve the dataset
engine=create_engine(f'sqlite:///{disaster_db}')

In [4]:
# load data from database
df = pd.read_sql('SELECT * FROM MessagesTable', engine)

In [5]:
df.shape

(25922, 37)

In [6]:
# list the categories
categories = df.columns[2:]

In [7]:
# separate the features from the target variables
X= df['message']
Y = df[df.columns[2:]]

#### The alternate dataset (with category related removed)

In [13]:
# path for the alternate database
disaster_opt = path / 'data/DisasterResponseOpt.db'
# create engine to retrieve the dataset
engine_opt=create_engine(f'sqlite:///{disaster_opt}')
# load data from database
df_opt = pd.read_sql('SELECT * FROM DisasterResponseOpt', engine_opt)

In [14]:
# categories in the alternate database
categories_opt=df_opt.columns[2:]

In [15]:
# separate the features from the target variables
X_opt= df_opt['message']
Y_opt = df_opt[df_opt.columns[2:]]

### Write a tokenization function to process the text data

In [8]:
def tokenize(text):
    
    """
    Contains the pre-processing steps for a message:
        - replaces url links, emails, ip addresses with placeholders
        - removes punctuation, unusual characters
        - tokenize, 
        - lemmatize
        - lowercasing
        - removes stopwords in English language
        
    INPUT: string, raw message
    OUTPUT: list of clean tokens
    """
    
    # use regular expressions to detect a url, an email, an ip address
    url_string = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    email_string = '[a-zA-Z0-9+_\-\.]+@[0-9a-zA-Z][.-0-9a-zA-Z]*.[a-zA-Z]+'
    ip_string = '(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})'
    
    # get the list of all urls, emails, ips using regex
    detected_urls = re.findall(url_string, text)
    detected_emails = re.findall(email_string, text)
    detected_ips = re.findall(ip_string, text)
    
    # replace each url in text string with placeholder
    for url in detected_urls:
        text = text.replace(url, 'urlplaceholder')
        
    # replace each email in text string with placeholder
    for email in detected_emails:
        text = text.replace(email, 'emailplaceholder')
    
    # replace each ip address in text string with placeholder
    for ip in detected_ips:
        text = text.replace(ip, 'ipplaceholder')
    
    # create a list of placeholders
    placeholders = ['urlplaceholder', 'emailplaceholder', 'ipplaceholder']
    
    # remove punctuation and unusual characters 
    text = re.sub(r"[^a-zA-Z0-9]", " ", text).strip()
    
    # split into words
    words = word_tokenize(text)
    
    # lemmatize - reduce words to their root form
    words = [WordNetLemmatizer().lemmatize(w) for w in words]
    
    # case normalize and remove leading & trailing empty spaces
    words = [w.lower().strip() for w in words]
    
    # remove stopwords 
    clean_words = [w for w in words if (( w not in stopwords.words('english')) and
                                         (w not in placeholders) and (len(w)>2)) or
                   (w in ['not', 'can'])]
    
    
    return clean_words

In [9]:
# test tokenize function
test = X[2233]
tokenize(test)

print(test)
print(" ")
print(tokenize(test))

.. . event discussed national. People are not getting aid. Many people have already died because of hunger, thirst, psychological health problems, stress, espcially little. .. 
 
['event', 'discussed', 'national', 'people', 'not', 'getting', 'aid', 'many', 'people', 'already', 'died', 'hunger', 'thirst', 'psychological', 'health', 'problem', 'stress', 'espcially', 'little']


<div class="alert alert-block alert-warning"> <b>NOTES:</b>
    <br>
The tokenizer function included in the Vectorizer, in the pipeline is nor properly recovered after unpickling. The python app runs locally but it does not run on Heroku. We get the following:
    
    
    AttributeError: module '__main__' has no attribute 'tokenize'  
    

There are several methods to address this issue. One attempt to solve this issue is to create a custom Tokenizer class, as below.
<br>
For more details see:
    <li>https://stackoverflow.com/questions/53936736/my-flask-app-does-not-run-on-heroku-but-it-runs-locally</li>
    <li>https://stackoverflow.com/questions/49483732/why-does-my-flask-app-work-when-executing-using-python-app-py-but-not-when-usi</li>
</div>

In [10]:
# custom tokenizer class to use in the pipelines

class Tokenizer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        def tokenize(text):
            """
            Contains the pre-processing steps for a message:
            - replaces url links with placeholders
            - removes punctuation, unusual characters
            - tokenize, 
            - lemmatize
            - lowercasing
            - removes stopwords in English language
            INPUT: string, raw message
            OUTPUT: list of clean tokens
            """
            # use regular expressions to detect an url
            url_string = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    
            # get the list of all urls, emails using regex
            detected_urls = re.findall(url_string, text)

            # replace each url in text string with placeholder
            for url in detected_urls:
                text = text.replace(url, 'urlplaceholder')
        
            # remove punctuation and unusual characters 
            text = re.sub(r"[^a-zA-Z0-9]", " ", text).strip()
    
            # split into words
            words = word_tokenize(text)
    
            # lemmatize - reduce words to their root form
            words = [WordNetLemmatizer().lemmatize(w) for w in words]
    
            # case normalize and remove leading & trailing empty spaces
            words = [w.lower().strip() for w in words]
    
            # remove stopwords 
            clean_words = [w for w in words if w not in stopwords.words('english')]
                           

            return ' '.join(clean_words)

        return pd.Series(X).apply(tokenize).values


### Sample the data

In [11]:
# split the data into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, 
                                                    test_size=0.30, 
                                                    random_state=42)

In [16]:
# split the alternate data
X_opt_train, X_opt_test, Y_opt_train, Y_opt_test = train_test_split(X_opt, Y_opt, 
                                                    test_size=0.30, 
                                                    random_state=42)

### Build a machine learning pipeline for a simple model

The simple machine learning pipeline takes in the `message` column as input and outputs classification results on the 36 categories in the dataset. We are treating the problem as supervised multi-label classification.

In [43]:
# simple model pipeline
pipe_simple  = Pipeline([
    ('tokenize', Tokenizer()),
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier()),
])

### Train the simple model 

In [44]:
# train the pipeline on the train set
simple_model = pipe_simple.fit(X_train, Y_train)

### Save the simple model in a file

In [20]:
# export the simple model as a pickle file
#with open(path / 'models/simple_model.pkl', 'wb') as f:
#    pickle.dump(simple_model, f)

### Evaluate the simple  model

We evaluate the f1 score, precision and recall for each output category of the dataset. We will print these values in two different formats. 

In [18]:
def report_to_dataframe(y_true, y_pred, categories):
    """
    Function to save the sklearn classification report as a
    pandas dataframe.
    INPUT: 
        y_true (pd.Dataframe) - the true labels 
        y_pred (np.array) - the predicted labels
        categories (list) - the list of predicted labels 
    OUTPUT: 
        reports (list) - a list of two dataframes

    where:
        reports[0] (pd.DataFrame) - contains precision, recall, f1-score, accuracy 
                                    for each label
        reports[1] (pd.DataFrame) - contains overall averages of the scores of precision, 
                                    recall, f1-score, accuracy.
    """
    # save classification report in dictionary form
    report_dict = classification_report(y_true, y_pred, 
                              target_names=categories,
                                    zero_division=0,
                              output_dict=True)
    # save the report as a datafarme
    report_df = pd.DataFrame.from_dict(report_dict).T.round(2)
    # drop the last 4 rows as they represent averages
    report = report_df[:-4]
    # save the averages in a separate dataframe
    report_avg = report_df[-4:]
    
    # convert the index into a column, rename it to Category
    report=report.reset_index().rename({'index':'category'},
                                       axis = 'columns')
     # add the individual labels accuracies
    accuracies=[]
    for i in range(len(categories)):
        accuracies.append(accuracy_score(y_true.iloc[:, i].values, 
                                         y_pred[:, i]))
    accuracies = pd.Series(accuracies).round(2)
    
    report.insert(4, 'accuracy', accuracies)
    
    # drop the support column
    #report.drop(columns=['support'], inplace=True)
    
    # create a tuple of reports
    reports = [report, report_avg]
    return reports

In [45]:
# evaluate all steps on the test set
Y_pred_simple = simple_model.predict(X_test)

In [46]:
# detailed report for the simple model, order by F1-score
simple_model_report = report_to_dataframe(Y_test, Y_pred_simple, categories)[0]
simple_model_report.sort_values(by='f1-score', ascending=False)

Unnamed: 0,category,precision,recall,f1-score,accuracy,support
0,related,0.86,0.92,0.89,0.82,5899.0
31,earthquake,0.91,0.66,0.76,0.96,741.0
27,weather_related,0.89,0.6,0.71,0.87,2179.0
3,aid_related,0.8,0.55,0.65,0.76,3205.0
1,request,0.81,0.49,0.61,0.89,1301.0
10,food,0.84,0.43,0.57,0.93,854.0
34,direct_report,0.76,0.35,0.48,0.85,1497.0
28,floods,0.91,0.3,0.45,0.94,644.0
29,storm,0.77,0.32,0.45,0.93,719.0
9,water,0.87,0.28,0.42,0.95,484.0


In [24]:
# create a list of categories ordered by their support
sorted_categories = list(simple_model_report.sort_values(by='support', ascending=False).category)
print(sorted_categories)

['related', 'aid_related', 'weather_related', 'direct_report', 'request', 'other_aid', 'food', 'earthquake', 'storm', 'shelter', 'floods', 'medical_help', 'water', 'infrastructure_related', 'other_weather', 'buildings', 'medical_products', 'transport', 'death', 'other_infrastructure', 'military', 'refugees', 'search_and_rescue', 'money', 'electricity', 'cold', 'security', 'clothing', 'aid_centers', 'fire', 'missing_people', 'hospitals', 'tools', 'offer', 'shops']


In [47]:
# overall report for the simple model
simple_model_avg_report = report_to_dataframe(Y_test, Y_pred_simple, categories)[1]
simple_model_avg_report

Unnamed: 0,precision,recall,f1-score,support
micro avg,0.84,0.47,0.6,24315.0
macro avg,0.57,0.16,0.21,24315.0
weighted avg,0.76,0.47,0.53,24315.0
samples avg,0.65,0.42,0.47,24315.0


<div class="alert alert-block alert-info"> <b>NOTES:</b>
    <br>
A macro-average computes the metric independently for each class and then takes the average hence treating all classes equally, whereas a micro-average aggregates the contributions of all classes to compute the average metric. With the large classes performing better than the small ones, we expect to see the micro average being higher than the macro average.
</div>

In [26]:
# we can also print the class individual classification report
for i in range(len(categories)):
    print(i, Y_test.columns[i])
    print(classification_report(Y_test.iloc[:, i].values, Y_pred_simple[:, i],
                               zero_division=0))

0 related
              precision    recall  f1-score   support

           0       0.69      0.54      0.61      1849
           1       0.87      0.92      0.89      5974

    accuracy                           0.83      7823
   macro avg       0.78      0.73      0.75      7823
weighted avg       0.82      0.83      0.83      7823

1 request
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      6493
           1       0.84      0.47      0.60      1330

    accuracy                           0.90      7823
   macro avg       0.87      0.73      0.77      7823
weighted avg       0.89      0.90      0.88      7823

2 offer
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      7776
           1       1.00      0.04      0.08        47

    accuracy                           0.99      7823
   macro avg       1.00      0.52      0.54      7823
weighted avg       0.99      0.99      0.99   

### Improve the simple model

Use randomized grid search to find better parameters and tune the model.

In [27]:
# first access the parameter keys of the individual estimators
print(pipe_simple.get_params().keys())

dict_keys(['memory', 'steps', 'verbose', 'tokenize', 'vect', 'tfidf', 'clf', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'clf__bootstrap', 'clf__ccp_alpha', 'clf__class_weight', 'clf__criterion', 'clf__max_depth', 'clf__max_features', 'clf__max_leaf_nodes', 'clf__max_samples', 'clf__min_impurity_decrease', 'clf__min_impurity_split', 'clf__min_samples_leaf', 'clf__min_samples_split', 'clf__min_weight_fraction_leaf', 'clf__n_estimators', 'clf__n_jobs', 'clf__oob_score', 'clf__random_state', 'clf__verbose', 'clf__warm_start'])


In [28]:
# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters_tuned = {
    #'vect__max_df': (0.5, 0.75, 1.0),
    'vect__min_df': [5, 10],
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    #'tfidf__smooth_idf': [True, False],
    'tfidf__norm': ('l1', 'l2'),
    'clf__n_estimators': [50, 100, 150], 
    'clf__min_samples_split': [2, 4, 6],
}

scorer = make_scorer(f1_score, average='micro')

# create the grid search pipeline for the specified parameters
grid= RandomizedSearchCV(pipe_simple,
                            param_distributions=parameters_tuned,
                            n_jobs=1, # warning: does not work with -1
                            verbose=4, 
                            scoring=scorer,
                            cv=3)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipe_simple.steps])
print("parameters:")
print(parameters_tuned)

t0 = time()
# fit the model
grid.fit(X_train, Y_train)

print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid.best_score_)
print("Best parameters set:")
best_parameters_tuned = grid.best_estimator_.get_params()

for param_name in sorted(parameters_tuned.keys()):
    print("\t%s: %r" % (param_name, best_parameters_tuned[param_name]))

Performing grid search...
pipeline: ['tokenize', 'vect', 'tfidf', 'clf']
parameters:
{'vect__min_df': [5, 10], 'vect__max_features': (None, 5000, 10000, 50000), 'vect__ngram_range': ((1, 1), (1, 2)), 'tfidf__norm': ('l1', 'l2'), 'clf__n_estimators': [50, 100, 150], 'clf__min_samples_split': [2, 4, 6]}
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 1/3] END clf__min_samples_split=2, clf__n_estimators=50, tfidf__norm=l2, vect__max_features=50000, vect__min_df=10, vect__ngram_range=(1, 2);, score=0.613 total time=  43.8s
[CV 2/3] END clf__min_samples_split=2, clf__n_estimators=50, tfidf__norm=l2, vect__max_features=50000, vect__min_df=10, vect__ngram_range=(1, 2);, score=0.607 total time=  44.2s
[CV 3/3] END clf__min_samples_split=2, clf__n_estimators=50, tfidf__norm=l2, vect__max_features=50000, vect__min_df=10, vect__ngram_range=(1, 2);, score=0.612 total time=  43.9s
[CV 1/3] END clf__min_samples_split=2, clf__n_estimators=150, tfidf__norm=l1, vect__max_features=50000

### Test your model

Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [30]:
# use saved best parameters from grid search
best_parameters = {
    'clf__min_samples_split': 6,
    'clf__n_estimators': 150,
    'tfidf__norm': 'l1',
    'vect__max_features': 5000,
    'vect__min_df': 10,
    'vect__ngram_range': (1, 1),
}

In [32]:
# set the parameters
# pipe_tuned = pipe_simple.set_params(**best_parameters)
# alternate construction of pipeline
pipe_tuned = grid.best_estimator_

# fit the tuned simple model
tuned_model = pipe_tuned.fit(X_train, Y_train)

In [33]:
# export the simple tuned model as a pickle file
#with open(path / 'models/tuned_model.pkl', 'wb') as f:
#    pickle.dump(tuned_model, f)

In [34]:
# evaluate all steps on the test set
Y_pred_tuned = tuned_model.predict(X_test)

In [35]:
# get the classification report as a dataframe
tuned_model_report = report_to_dataframe(Y_test, Y_pred_tuned, categories)[0] 
tuned_model_report.sort_values(by='f1-score', ascending=False)

Unnamed: 0,category,precision,recall,f1-score,accuracy,support
0,related,0.86,0.94,0.89,0.83,5974.0
31,earthquake,0.92,0.73,0.81,0.97,739.0
27,weather_related,0.87,0.64,0.74,0.87,2180.0
10,food,0.85,0.56,0.68,0.94,899.0
3,aid_related,0.81,0.56,0.66,0.76,3276.0
1,request,0.82,0.49,0.61,0.9,1330.0
29,storm,0.76,0.47,0.58,0.94,705.0
28,floods,0.9,0.42,0.57,0.95,648.0
9,water,0.86,0.39,0.53,0.96,516.0
34,direct_report,0.78,0.36,0.5,0.86,1539.0


In [36]:
# averages the simple improved model
tuned_model_avg_report = report_to_dataframe(Y_test, Y_pred_tuned, categories)[1] 
tuned_model_avg_report

Unnamed: 0,precision,recall,f1-score,support
micro avg,0.84,0.5,0.63,24887.0
macro avg,0.63,0.19,0.25,24887.0
weighted avg,0.76,0.5,0.55,24887.0
samples avg,0.67,0.45,0.49,24887.0


In [37]:
# we can also print the individual classes classification reports
for i in range(len(categories)):
    print(i, Y_test.columns[i])
    print(classification_report(Y_test.iloc[:, i].values, Y_pred_tuned[:, i],
                               zero_division=0))

0 related
              precision    recall  f1-score   support

           0       0.71      0.49      0.58      1849
           1       0.86      0.94      0.89      5974

    accuracy                           0.83      7823
   macro avg       0.78      0.71      0.74      7823
weighted avg       0.82      0.83      0.82      7823

1 request
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      6493
           1       0.82      0.49      0.61      1330

    accuracy                           0.90      7823
   macro avg       0.86      0.73      0.78      7823
weighted avg       0.89      0.90      0.88      7823

2 offer
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      7776
           1       1.00      0.04      0.08        47

    accuracy                           0.99      7823
   macro avg       1.00      0.52      0.54      7823
weighted avg       0.99      0.99      0.99   

#### Compare simple model and tuned model

Take a look at the f1-scores for example.

In [38]:
print('Simple model scores:') 
print(simple_model_avg_report['f1-score'])
print('')
print('Tuned simple model scores:') 
print(tuned_model_avg_report['f1-score'])

Simple model scores:
micro avg       0.60
macro avg       0.21
weighted avg    0.52
samples avg     0.47
Name: f1-score, dtype: float64

Tuned simple model scores:
micro avg       0.63
macro avg       0.25
weighted avg    0.55
samples avg     0.49
Name: f1-score, dtype: float64


<div class="alert alert-block alert-info"> <b>NOTES:</b>
    <br>

The tuned model is just slightly better than the simple model. This indicates that the tunning did not significantly improve the model. Even for the most represented classes the tuned model does not perform any much better. The RandomForestClassifier works pretty well with the default settings. 
    
The main issue here is the highly imbalanced dataset, with about a dozen of classes having zero scores for all three metrics. 
</div>

#### The tuned simple model on the alternate dataset

In [40]:
# check the model on the alternate database

# fit the model on the alternate dataset
tuned_model_opt = pipe_tuned.fit(X_opt_train, Y_opt_train)

# estimate on test set
Y_opt_pred_tuned = tuned_model_opt.predict(X_opt_test)

# get the classification report as a dataframe
tuned_model_opt_report = report_to_dataframe(Y_opt_test, Y_opt_pred_tuned, categories_opt)[0] 
tuned_model_opt_report.sort_values(by='f1-score', ascending=False)

Unnamed: 0,category,precision,recall,f1-score,accuracy,support
2,aid_related,0.88,0.89,0.88,0.83,3299.0
30,earthquake,0.97,0.79,0.87,0.96,746.0
26,weather_related,0.94,0.78,0.85,0.86,2200.0
9,food,0.94,0.78,0.85,0.95,863.0
0,request,0.85,0.71,0.77,0.87,1359.0
8,water,0.85,0.69,0.76,0.95,482.0
28,storm,0.84,0.62,0.71,0.92,713.0
33,direct_report,0.76,0.64,0.7,0.81,1524.0
27,floods,0.94,0.51,0.66,0.92,652.0
10,shelter,0.9,0.45,0.6,0.91,669.0


In [41]:
# averages the simple improved model and alternate dataset
tuned_model_opt_avg_report = report_to_dataframe(Y_opt_test, Y_opt_pred_tuned, categories_opt)[1] 
tuned_model_opt_avg_report

Unnamed: 0,precision,recall,f1-score,support
micro avg,0.87,0.54,0.66,19011.0
macro avg,0.72,0.27,0.34,19011.0
weighted avg,0.82,0.54,0.6,19011.0
samples avg,0.86,0.6,0.67,19011.0


### Try improving your model further

#### Use RandomForestClassifier with class weights

In [42]:
# add class weights to RFC in the simple pipeline

# simple model pipeline with class weights
pipe_weighted  = Pipeline([
    ('tokenize', Tokenizer()),
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier(class_weight='balanced')))
])

In [43]:
# fit the weighted simple model
weighted_model = pipe_weighted.fit(X_train, Y_train)

In [44]:
# evaluate all steps on the test set
Y_pred_weighted = weighted_model.predict(X_test)

In [45]:
# get the classification report as a dataframe
weighted_model_report = report_to_dataframe(Y_test, Y_pred_weighted, categories)[0] 
weighted_model_report.sort_values(by='f1-score', ascending=False) 

Unnamed: 0,category,precision,recall,f1-score,accuracy,support
0,related,0.86,0.93,0.89,0.83,5974.0
31,earthquake,0.92,0.73,0.81,0.97,739.0
27,weather_related,0.85,0.7,0.77,0.88,2180.0
3,aid_related,0.75,0.71,0.73,0.78,3276.0
10,food,0.85,0.54,0.66,0.94,899.0
1,request,0.8,0.52,0.63,0.9,1330.0
29,storm,0.78,0.45,0.57,0.94,705.0
34,direct_report,0.77,0.41,0.53,0.86,1539.0
28,floods,0.9,0.35,0.5,0.94,648.0
9,water,0.83,0.33,0.48,0.95,516.0


In [46]:
# averages the simple improved model
weighted_model_avg_report = report_to_dataframe(Y_test, Y_pred_weighted, categories)[1] 
weighted_model_avg_report

Unnamed: 0,precision,recall,f1-score,support
micro avg,0.82,0.52,0.64,24887.0
macro avg,0.6,0.19,0.24,24887.0
weighted avg,0.76,0.52,0.56,24887.0
samples avg,0.64,0.46,0.49,24887.0


In [47]:
# we can also print the individual classification reports
for i in range(len(categories)):
    print(i, Y_test.columns[i])
    print(classification_report(Y_test.iloc[:, i].values, Y_pred_weighted[:, i],
                               zero_division=0))

0 related
              precision    recall  f1-score   support

           0       0.69      0.52      0.59      1849
           1       0.86      0.93      0.89      5974

    accuracy                           0.83      7823
   macro avg       0.77      0.72      0.74      7823
weighted avg       0.82      0.83      0.82      7823

1 request
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      6493
           1       0.80      0.52      0.63      1330

    accuracy                           0.90      7823
   macro avg       0.86      0.75      0.79      7823
weighted avg       0.89      0.90      0.89      7823

2 offer
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      7776
           1       1.00      0.04      0.08        47

    accuracy                           0.99      7823
   macro avg       1.00      0.52      0.54      7823
weighted avg       0.99      0.99      0.99   

In [48]:
print('Simple model scores:') 
print(simple_model_avg_report['f1-score'])
print('')
print('Balanced classes model scores:') 
print(weighted_model_avg_report['f1-score'])

Simple model scores:
micro avg       0.60
macro avg       0.21
weighted avg    0.52
samples avg     0.47
Name: f1-score, dtype: float64

Balanced classes model scores:
micro avg       0.64
macro avg       0.24
weighted avg    0.56
samples avg     0.49
Name: f1-score, dtype: float64


<div class="alert alert-block alert-info"> <b>NOTES:</b>
    <br>

There is no significant improvement in the model by using balanced class weights. 
</div>

#### Use XGBoost with MultiOutputClassifier

In [49]:
# create a pipeline with XGBClassifier wraped in a MultiOutputClassifier
pipe_xgb  = Pipeline([
    ('tokenize', Tokenizer()),
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(XGBClassifier(eval_metric='logloss',
                                                use_label_encoder=False))),
])

In [50]:
# fit XGBClassifier model
xgb_model = pipe_xgb.fit(X_train, Y_train)

In [51]:
# export the xgb model as a pickle file
#with open(path / 'models/xgb_model.pkl', 'wb') as f:
    #pickle.dump(xgb_model, f)

In [52]:
# evaluate all steps on the test set
Y_pred_xgb = xgb_model.predict(X_test)

In [53]:
# get the classification report as a dataframe
xgb_model_report = report_to_dataframe(Y_test, Y_pred_xgb, categories)[0] 
xgb_model_report.sort_values(by='f1-score', ascending=False)

Unnamed: 0,category,precision,recall,f1-score,accuracy,support
0,related,0.83,0.95,0.89,0.82,5974.0
31,earthquake,0.9,0.85,0.87,0.98,739.0
10,food,0.8,0.79,0.8,0.95,899.0
27,weather_related,0.86,0.72,0.78,0.89,2180.0
9,water,0.8,0.7,0.75,0.97,516.0
29,storm,0.75,0.69,0.72,0.95,705.0
3,aid_related,0.78,0.65,0.71,0.77,3276.0
28,floods,0.87,0.59,0.71,0.96,648.0
11,shelter,0.74,0.59,0.66,0.95,701.0
1,request,0.78,0.53,0.63,0.89,1330.0


In [54]:
# averages the xgb model
xgb_model_avg_report = report_to_dataframe(Y_test, Y_pred_xgb, categories)[1] 
xgb_model_avg_report

Unnamed: 0,precision,recall,f1-score,support
micro avg,0.79,0.61,0.69,24887.0
macro avg,0.65,0.36,0.43,24887.0
weighted avg,0.75,0.61,0.65,24887.0
samples avg,0.65,0.52,0.53,24887.0


In [55]:
# we can also print the individual classification reports
for i in range(len(categories)):
    print(i, Y_test.columns[i])
    print(classification_report(Y_test.iloc[:, i].values, Y_pred_xgb[:, i],
                               zero_division=0))

0 related
              precision    recall  f1-score   support

           0       0.71      0.37      0.49      1849
           1       0.83      0.95      0.89      5974

    accuracy                           0.82      7823
   macro avg       0.77      0.66      0.69      7823
weighted avg       0.80      0.82      0.79      7823

1 request
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      6493
           1       0.78      0.53      0.63      1330

    accuracy                           0.89      7823
   macro avg       0.85      0.75      0.78      7823
weighted avg       0.89      0.89      0.89      7823

2 offer
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      7776
           1       1.00      0.04      0.08        47

    accuracy                           0.99      7823
   macro avg       1.00      0.52      0.54      7823
weighted avg       0.99      0.99      0.99   

In [56]:
print('Simple model scores:') 
print(simple_model_avg_report['f1-score'])
print('')
print('XGB model scores:') 
print(xgb_model_avg_report['f1-score'])

Simple model scores:
micro avg       0.60
macro avg       0.21
weighted avg    0.52
samples avg     0.47
Name: f1-score, dtype: float64

XGB model scores:
micro avg       0.69
macro avg       0.43
weighted avg    0.65
samples avg     0.53
Name: f1-score, dtype: float64


<div class="alert alert-block alert-info"> <b>NOTES:</b>
    <br>

Focusing on the f1-values, there is a noticeable improvement in the XGBClasifier model over the RandomForestClassifier models. The biggest change is in the value of the macro average for F1 scores, which indicates that XGBClassifier performs better for the individual classes. We would expect a more significant improvement from more powerful methods such as pre-trained embeddings (Word2Vec) or using a language model (Bert) in conjunction with deep learning models.

**The micro average**  

The micro average has its name from the fact that it pools the performance over the smallest possible unit (i.e. over all samples). The micro-averaged precision, $p_{micro}$, and recall, $r_{micro}$, harmonic mean give the micro $F_1$-score:

$$p_{micro} = \frac{\sum _{i=1}^{N}TP_i}{\sum_{i=1}^N \left( TP_i + FP_i \right)} \hspace{1cm} r_{micro} = \frac{\sum _{i=1}^{N}TP_i}{\sum_{i=1}^N \left( TP_i + FN_i \right)} \hspace{1cm}
F_{1micro} = 2 \cdot \frac{p_{micro} \cdot r_{micro}}{p_{micro} + r_{micro}}$$

If a classifier has a large $F_{1micro}$, this indicates that it performs well overall. The micro-average is not sensitive to the predictive performance for individual classes. As a consequence, the micro-average can be particularly misleading when the class distribution is imbalanced.

**The macro average**  

The macro average takes its name from the fact that it averages over larger groups, namely over the performance for individual classes rather than observations. The macro-averaged precision $p_{macro}$ and recall $r_{macro}$ give rise to the macro $F_1$-score:

$$p_{macro} = \sum_i^N \frac{TP_i}{ TP_i + FP_i} \hspace{1cm} r_{macro} = \sum_i^N \frac{TP_i}{ TP_i + FN_i} \hspace{1cm}F_{1macro} = 2 \cdot \frac{p_{macro} \cdot r_{macro}}{p_{macro} + r_{macro}}$$

A large value of the $F_{1macro}$ indicates that a classifier performs well for each individual class. The macro-average is therefore more suitable for data with an imbalanced class distribution.
</div>

#### Tune the  XGB model

In [57]:
# first access the parameter keys of the individual estimators
print(pipe_xgb.get_params().keys())

dict_keys(['memory', 'steps', 'verbose', 'tokenize', 'vect', 'tfidf', 'clf', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'clf__estimator__objective', 'clf__estimator__use_label_encoder', 'clf__estimator__base_score', 'clf__estimator__booster', 'clf__estimator__colsample_bylevel', 'clf__estimator__colsample_bynode', 'clf__estimator__colsample_bytree', 'clf__estimator__gamma', 'clf__estimator__gpu_id', 'clf__estimator__importance_type', 'clf__estimator__interaction_constraints', 'clf__estimator__learning_rate', 'clf__estimator__max_delta_step', 'clf__estimator__max_depth', 'clf__estimator__min_child_weight', 'clf__estimator__missing', '

In [45]:
# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way 
parameters_xgb = {
    'vect__min_df': [5, 10],
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__smooth_idf': [True, False],
    'tfidf__norm': ('l1', 'l2'),
    'clf__estimator__n_estimators': [100, 500, 1000], 
    'clf__estimator__learning_rate': [0.1, 0.01],
    'clf__estimator__max_depth': [4, 6],
}


xgb_search = RandomizedSearchCV(pipe_xgb,
                            param_distributions=parameters_xgb,
                            n_iter=4,
                            scoring='f1_micro',
                            n_jobs=1, # warning: does not work with -1
                            cv=3,
                            verbose=4,
                            random_state=42)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipe_xgb.steps])
print("parameters:")
print(parameters_xgb)

t0 = time()
xgb_search.fit(X_train[0:1000], Y_train[0:1000])

print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % xgb_search.best_score_)
print("Best parameters set:")
best_parameters_xgb = xgb_search.best_estimator_.get_params()

for param_name in sorted(parameters_xgb.keys()):
    print("\t%s: %r" % (param_name, best_parameters_xgb[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'vect__min_df': [5, 10], 'vect__ngram_range': ((1, 1), (1, 2)), 'tfidf__smooth_idf': [True, False], 'tfidf__norm': ('l1', 'l2'), 'clf__estimator__n_estimators': [100, 500, 1000], 'clf__estimator__learning_rate': [0.1, 0.01], 'clf__estimator__max_depth': [4, 6]}
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3] END clf__estimator__learning_rate=0.1, clf__estimator__max_depth=4, clf__estimator__n_estimators=1000, tfidf__norm=l2, tfidf__smooth_idf=False, vect__min_df=5, vect__ngram_range=(1, 2);, score=0.696 total time=  10.7s
[CV 2/3] END clf__estimator__learning_rate=0.1, clf__estimator__max_depth=4, clf__estimator__n_estimators=1000, tfidf__norm=l2, tfidf__smooth_idf=False, vect__min_df=5, vect__ngram_range=(1, 2);, score=0.738 total time=  11.4s
[CV 3/3] END clf__estimator__learning_rate=0.1, clf__estimator__max_depth=4, clf__estimator__n_estimators=1000, tfidf__norm=l2, tfidf__smooth_idf=Fals

In [58]:
# best score: 0.689 and the corresponding parameters
best_parameters_xgb = {
    'clf__estimator__learning_rate': 0.1,
    'clf__estimator__max_depth': 6,
    'clf__estimator__n_estimators': 500,
    'tfidf__norm': 'l2',
    'tfidf__smooth_idf': False,
    'vect__min_df': 5,
    'vect__ngram_range': (1, 1),
}

In [60]:
# set the parameters and fit the tuned xgb_model
pipe_xgb.set_params(**best_parameters_xgb)
# fit the improved xgb model
xgb_tuned_model = pipe_xgb.fit(X_train, Y_train)

# alternate construction of pipeline
#xgb_tuned_model = xgb_search.best_estimator_

In [61]:
# export the simple model as a pickle file
#with open(path / 'models/xgb_tuned_model.pkl', 'wb') as f:
#    pickle.dump(xgb_tuned_model, f)

In [62]:
# evaluate all steps on the test set and make predictions
Y_pred_xgb_tuned = xgb_tuned_model.predict(X_test)

In [63]:
# get the classification report as a dataframe
xgb_tuned_model_report = report_to_dataframe(Y_test, Y_pred_xgb_tuned, categories)[0] 
xgb_tuned_model_report.sort_values(by='f1-score', ascending=False)

Unnamed: 0,category,precision,recall,f1-score,accuracy,support
0,related,0.84,0.95,0.89,0.82,5974.0
31,earthquake,0.89,0.84,0.87,0.98,739.0
27,weather_related,0.86,0.73,0.79,0.89,2180.0
10,food,0.8,0.77,0.79,0.95,899.0
29,storm,0.76,0.7,0.73,0.95,705.0
9,water,0.79,0.67,0.73,0.97,516.0
3,aid_related,0.78,0.66,0.71,0.78,3276.0
28,floods,0.86,0.6,0.71,0.96,648.0
11,shelter,0.75,0.58,0.65,0.94,701.0
1,request,0.79,0.53,0.64,0.9,1330.0


In [64]:
# averages the xgb model
xgb_tuned_model_avg_report = report_to_dataframe(Y_test, Y_pred_xgb_tuned, categories)[1] 
xgb_tuned_model_avg_report

Unnamed: 0,precision,recall,f1-score,support
micro avg,0.79,0.61,0.69,24887.0
macro avg,0.63,0.36,0.43,24887.0
weighted avg,0.75,0.61,0.66,24887.0
samples avg,0.65,0.52,0.53,24887.0


In [65]:
# we can also print the individual classification reports
for i in range(len(categories)):
    print(i, Y_test.columns[i])
    print(classification_report(Y_test.iloc[:, i].values, Y_pred_xgb_tuned[:, i],
                               zero_division=0))

0 related
              precision    recall  f1-score   support

           0       0.72      0.41      0.53      1849
           1       0.84      0.95      0.89      5974

    accuracy                           0.82      7823
   macro avg       0.78      0.68      0.71      7823
weighted avg       0.81      0.82      0.81      7823

1 request
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      6493
           1       0.79      0.53      0.64      1330

    accuracy                           0.90      7823
   macro avg       0.85      0.75      0.79      7823
weighted avg       0.89      0.90      0.89      7823

2 offer
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      7776
           1       0.67      0.04      0.08        47

    accuracy                           0.99      7823
   macro avg       0.83      0.52      0.54      7823
weighted avg       0.99      0.99      0.99   

In [66]:
print('XGB model scores:') 
print(xgb_model_avg_report['f1-score'])
print('')
print('XGB tuned model scores:') 
print(xgb_tuned_model_avg_report['f1-score'])

XGB model scores:
micro avg       0.69
macro avg       0.43
weighted avg    0.65
samples avg     0.53
Name: f1-score, dtype: float64

XGB tuned model scores:
micro avg       0.69
macro avg       0.43
weighted avg    0.66
samples avg     0.53
Name: f1-score, dtype: float64


<div class="alert alert-block alert-info"> <b>NOTES:</b>
    <br>

If we just compare the f1 average scores of the xgb model before and after tuning, we don't see any differences. We can just use the default parameters. 
</div>

#### Using Classifiers Chain

In [69]:
# create a pipeline with XGBClassifier wraped in a Classifier Chain
pipe_chain  = Pipeline([
    ('tokenize', Tokenizer()),
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', ClassifierChain(XGBClassifier(eval_metric='logloss',
                                                use_label_encoder=False))),
])

In [70]:
# fit XGBClassifier model
chain_model = pipe_chain.fit(X_train, Y_train)

In [71]:
# evaluate all steps on the test set
Y_pred_chain = chain_model.predict(X_test)

In [72]:
# export the model as a pickle file
#with open(path / 'models/chain_model.pkl', 'wb') as f:
#    pickle.dump(chain_model, f)

In [73]:
# adjust for classifier chains by removing accuracy per class

def report_to_dataframe_chain(y_true, y_pred, categories):
    """
    Function to save the sklearn classification report as a
    pandas dataframe.
    INPUT: 
        y_true (pd.Dataframe) - the true labels 
        y_pred (np.array) - the predicted labels
        categories (list) - the list of predicted labels 
    OUTPUT: 
        reports (list) - a list of two dataframes

    where:
        reports[0] (pd.DataFrame) - contains precision, recall, f1-score, for each label
        reports[1] (pd.DataFrame) - contains overall averages of the scores of precision, 
                                    recall, f1-score.
    """
    # save classification report in dictionary form
    report_dict = classification_report(y_true, y_pred, 
                              target_names=categories,
                                    zero_division=0,
                              output_dict=True)
    # save the report as a datafarme
    report_df = pd.DataFrame.from_dict(report_dict).T.round(2)
    # drop the last 4 rows as they represent averages
    report = report_df[:-4]
    # save the averages in a separate dataframe
    report_avg = report_df[-4:]
    
    # convert the index into a column, rename it to Category
    report=report.reset_index().rename({'index':'category'},
                                       axis = 'columns')
    # drop the support column
    #report.drop(columns=['support'], inplace=True)
    
    # create a tuple of reports
    reports = [report, report_avg]
    return reports

In [74]:
# get the classification report as a dataframe 
chain_model_report = report_to_dataframe_chain(Y_test, Y_pred_chain, categories)[0] 
chain_model_report.sort_values(by='f1-score', ascending=False)

Unnamed: 0,category,precision,recall,f1-score,support
0,related,0.83,0.95,0.89,5974.0
31,earthquake,0.88,0.83,0.85,739.0
10,food,0.78,0.81,0.8,899.0
27,weather_related,0.86,0.71,0.78,2180.0
9,water,0.77,0.77,0.77,516.0
3,aid_related,0.75,0.68,0.71,3276.0
29,storm,0.73,0.68,0.7,705.0
28,floods,0.87,0.59,0.7,648.0
16,death,0.7,0.65,0.68,330.0
1,request,0.77,0.57,0.66,1330.0


In [75]:
# averages the xgb model - use report_to_dataframe_chain
chain_model_avg_report = report_to_dataframe_chain(Y_test, Y_pred_chain, categories)[1] 
chain_model_avg_report

Unnamed: 0,precision,recall,f1-score,support
micro avg,0.76,0.63,0.69,24887.0
macro avg,0.6,0.38,0.44,24887.0
weighted avg,0.73,0.63,0.66,24887.0
samples avg,0.64,0.54,0.53,24887.0


In [76]:
print('XGB model scores:')
print(xgb_model_avg_report['f1-score'])
print('')
print('Chain model scores:')
print(chain_model_avg_report['f1-score'])

XGB model scores:
micro avg       0.69
macro avg       0.43
weighted avg    0.65
samples avg     0.53
Name: f1-score, dtype: float64

Chain model scores:
micro avg       0.69
macro avg       0.44
weighted avg    0.66
samples avg     0.53
Name: f1-score, dtype: float64


#### Chain Classifiers with Ordered Labels

In [77]:
# order in which the labels are processed by the classifiers in the chain
label_list = list(xgb_tuned_model_report.sort_values(by='support', ascending=False).index)

In [78]:
# create a pipeline with XGBClassifier wraped in a Classifier Chain
pipe_ordered_chain  = Pipeline([
    ('tokenizer', Tokenizer()),
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', ClassifierChain(XGBClassifier(eval_metric='logloss',
                                                use_label_encoder=False), order=label_list)),
])

In [79]:
# fit XGBClassifier model
ordered_chain_model = pipe_ordered_chain.fit(X_train, Y_train)

In [80]:
# evaluate all steps on the test set
Y_pred_ordered_chain = ordered_chain_model.predict(X_test)

In [81]:
# export the model as a pickle file
#with open(path / 'models/ordered_chain_model.pkl', 'wb') as f:
#    pickle.dump(ordered_chain_model, f)

In [82]:
# get the classification report as a dataframe
ordered_chain_model_report = report_to_dataframe_chain(Y_test, 
                                                       Y_pred_ordered_chain, 
                                                       categories)[0] 
ordered_chain_model_report.sort_values(by='f1-score', ascending=False)

Unnamed: 0,category,precision,recall,f1-score,support
0,related,0.83,0.95,0.89,5974.0
1,request,0.32,0.79,0.45,1330.0
3,aid_related,0.75,0.27,0.39,3276.0
9,water,0.21,0.25,0.23,516.0
4,medical_help,0.11,0.2,0.14,599.0
15,refugees,0.14,0.12,0.13,260.0
18,infrastructure_related,0.13,0.08,0.1,502.0
11,shelter,0.14,0.08,0.1,701.0
19,transport,0.21,0.06,0.09,349.0
10,food,0.14,0.07,0.09,899.0


In [83]:
# averages the chain of xgb classifiers with ordered labels
ordered_chain_model_avg_report = report_to_dataframe_chain(Y_test, 
                                                           Y_pred_ordered_chain, 
                                                           categories)[1] 
ordered_chain_model_avg_report

Unnamed: 0,precision,recall,f1-score,support
micro avg,0.38,0.33,0.36,24887.0
macro avg,0.13,0.11,0.09,24887.0
weighted avg,0.39,0.33,0.32,24887.0
samples avg,0.39,0.35,0.33,24887.0


In [84]:
print('Ordered XGB Chain model scores:') 
print(ordered_chain_model_avg_report['f1-score'])
print('')
print('XGB Chain model scores:') 
print(chain_model_avg_report['f1-score'])

Ordered XGB Chain model scores:
micro avg       0.36
macro avg       0.09
weighted avg    0.32
samples avg     0.33
Name: f1-score, dtype: float64

XGB Chain model scores:
micro avg       0.69
macro avg       0.44
weighted avg    0.66
samples avg     0.53
Name: f1-score, dtype: float64


### Using AdaBoostClassifier

In [17]:
# create a pipeline with AdaBoostClassifier wraped in a MultiOutputClassifier
pipe_ada  = Pipeline([
    ('tokenize', Tokenizer()),
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(AdaBoostClassifier())),
])

In [19]:
# train the pipeline on the train set
ada_model = pipe_ada.fit(X_train, Y_train)

In [20]:
# evaluate all steps on the test set
Y_pred_ada = ada_model.predict(X_test)

In [21]:
# get the classification report as a dataframe
ada_model_report = report_to_dataframe(Y_test,Y_pred_ada, categories)[0] 
ada_model_report.sort_values(by='f1-score', ascending=False)

Unnamed: 0,category,precision,recall,f1-score,accuracy,support
0,related,0.79,0.97,0.87,0.79,5899.0
31,earthquake,0.9,0.79,0.84,0.97,741.0
27,weather_related,0.87,0.66,0.75,0.88,2179.0
10,food,0.8,0.63,0.7,0.94,854.0
28,floods,0.85,0.59,0.69,0.96,644.0
3,aid_related,0.75,0.62,0.68,0.76,3205.0
9,water,0.71,0.63,0.67,0.96,484.0
11,shelter,0.77,0.51,0.62,0.94,692.0
29,storm,0.74,0.53,0.62,0.94,719.0
1,request,0.74,0.49,0.59,0.89,1301.0


In [22]:
# averages the chain of xgb classifiers with ordered labels
ada_model_avg_report = report_to_dataframe(Y_test, Y_pred_ada, categories)[1] 
ada_model_avg_report

Unnamed: 0,precision,recall,f1-score,support
micro avg,0.76,0.58,0.66,24315.0
macro avg,0.56,0.32,0.4,24315.0
weighted avg,0.72,0.58,0.62,24315.0
samples avg,0.65,0.51,0.53,24315.0


In [23]:
# first access the parameter keys of the individual estimators
print(pipe_ada.get_params().keys())

dict_keys(['memory', 'steps', 'verbose', 'tokenize', 'vect', 'tfidf', 'clf', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'clf__estimator__algorithm', 'clf__estimator__base_estimator', 'clf__estimator__learning_rate', 'clf__estimator__n_estimators', 'clf__estimator__random_state', 'clf__estimator', 'clf__n_jobs'])


In [35]:
print(pipe_ada.get_params())

{'memory': None, 'steps': [('tokenize', Tokenizer()), ('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultiOutputClassifier(estimator=AdaBoostClassifier()))], 'verbose': False, 'tokenize': Tokenizer(), 'vect': CountVectorizer(), 'tfidf': TfidfTransformer(), 'clf': MultiOutputClassifier(estimator=AdaBoostClassifier()), 'vect__analyzer': 'word', 'vect__binary': False, 'vect__decode_error': 'strict', 'vect__dtype': <class 'numpy.int64'>, 'vect__encoding': 'utf-8', 'vect__input': 'content', 'vect__lowercase': True, 'vect__max_df': 1.0, 'vect__max_features': None, 'vect__min_df': 1, 'vect__ngram_range': (1, 1), 'vect__preprocessor': None, 'vect__stop_words': None, 'vect__strip_accents': None, 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'vect__tokenizer': None, 'vect__vocabulary': None, 'tfidf__norm': 'l2', 'tfidf__smooth_idf': True, 'tfidf__sublinear_tf': False, 'tfidf__use_idf': True, 'clf__estimator__algorithm': 'SAMME.R', 'clf__estimator__base_estimator': None, 'clf__

In [28]:
# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way 
parameters_ada = {
    'vect__min_df': [5, 10],
    #'vect__ngram_range': ((1, 1)),  # unigrams or bigrams
    #'tfidf__smooth_idf': [True, False],
    'tfidf__norm': ('l1', 'l2'),
    'clf__estimator__n_estimators': [100, 150], 
    'clf__estimator__learning_rate': [0.1, 0.01],
}


ada_search = GridSearchCV(pipe_ada,
                            param_grid=parameters_ada,
                            scoring='f1_micro',
                            n_jobs=1, # warning: does not work with -1
                            cv=3,
                            verbose=4)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipe_ada.steps])
print("parameters:")
print(parameters_ada)

t0 = time()
ada_search.fit(X_train, Y_train)

print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % ada_search.best_score_)
print("Best parameters set:")
best_parameters_ada = ada_search.best_estimator_.get_params()

for param_name in sorted(parameters_ada.keys()):
    print("\t%s: %r" % (param_name, best_parameters_ada[param_name]))

Performing grid search...
pipeline: ['tokenize', 'vect', 'tfidf', 'clf']
parameters:
{'vect__min_df': [5, 10], 'tfidf__norm': ('l1', 'l2'), 'clf__estimator__n_estimators': [100, 150], 'clf__estimator__learning_rate': [0.1, 0.01]}
Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV 1/3] END clf__estimator__learning_rate=0.1, clf__estimator__n_estimators=100, tfidf__norm=l1, vect__min_df=5;, score=0.604 total time= 1.3min
[CV 2/3] END clf__estimator__learning_rate=0.1, clf__estimator__n_estimators=100, tfidf__norm=l1, vect__min_df=5;, score=0.598 total time= 1.3min
[CV 3/3] END clf__estimator__learning_rate=0.1, clf__estimator__n_estimators=100, tfidf__norm=l1, vect__min_df=5;, score=0.604 total time= 1.3min
[CV 1/3] END clf__estimator__learning_rate=0.1, clf__estimator__n_estimators=100, tfidf__norm=l1, vect__min_df=10;, score=0.606 total time= 1.3min
[CV 2/3] END clf__estimator__learning_rate=0.1, clf__estimator__n_estimators=100, tfidf__norm=l1, vect__min_df=10;, score=0.

NameError: name 'best_parameters_xgb' is not defined

In [29]:
print("Best parameters set:")
best_parameters_ada = ada_search.best_estimator_.get_params()

for param_name in sorted(parameters_ada.keys()):
    print("\t%s: %r" % (param_name, best_parameters_ada[param_name]))

Best parameters set:
	clf__estimator__learning_rate: 0.1
	clf__estimator__n_estimators: 150
	tfidf__norm: 'l1'
	vect__min_df: 5


In [30]:
# the construction of the pipeline
ada_tuned_model = ada_search.best_estimator_

In [31]:
# evaluate all steps on the test set and make predictions
Y_pred_ada_tuned = ada_tuned_model.predict(X_test)

In [32]:
# get the classification report as a dataframe
ada_tuned_model_report = report_to_dataframe(Y_test, Y_pred_ada_tuned, categories)[0] 
ada_tuned_model_report.sort_values(by='f1-score', ascending=False)

Unnamed: 0,category,precision,recall,f1-score,accuracy,support
0,related,0.77,0.99,0.87,0.77,5899.0
31,earthquake,0.91,0.8,0.85,0.97,741.0
10,food,0.78,0.72,0.75,0.95,854.0
27,weather_related,0.9,0.55,0.69,0.86,2179.0
28,floods,0.91,0.5,0.65,0.95,644.0
9,water,0.74,0.57,0.65,0.96,484.0
3,aid_related,0.76,0.49,0.6,0.73,3205.0
11,shelter,0.82,0.43,0.56,0.94,692.0
1,request,0.8,0.39,0.52,0.88,1301.0
29,storm,0.79,0.32,0.45,0.93,719.0


In [33]:
# averages the xgb model
ada_tuned_model_avg_report = report_to_dataframe(Y_test, Y_pred_ada_tuned, categories)[1] 
ada_tuned_model_avg_report

Unnamed: 0,precision,recall,f1-score,support
micro avg,0.79,0.51,0.62,24315.0
macro avg,0.64,0.23,0.29,24315.0
weighted avg,0.76,0.51,0.55,24315.0
samples avg,0.71,0.48,0.52,24315.0


In [34]:
print('AdaBoost model scores:') 
print(ada_model_avg_report['f1-score'])
print('')
print('AdaBoost tuned model scores:') 
print(ada_tuned_model_avg_report['f1-score'])

AdaBoost model scores:
micro avg       0.66
macro avg       0.40
weighted avg    0.62
samples avg     0.53
Name: f1-score, dtype: float64

AdaBoost tuned model scores:
micro avg       0.62
macro avg       0.29
weighted avg    0.55
samples avg     0.52
Name: f1-score, dtype: float64


In [48]:
# compare the f1 scores of the AdaBoost models with the simple model
combined_report = pd.DataFrame(data={'f1-simple': simple_model_avg_report['f1-score'],
                                     'f1-ada': ada_model_avg_report['f1-score'],
                                     'f1-ada-tuned': ada_tuned_model_avg_report['f1-score']},
                               index=simple_model_avg_report.index)
combined_report

Unnamed: 0,f1-simple,f1-ada,f1-ada-tuned
micro avg,0.6,0.66,0.62
macro avg,0.21,0.4,0.29
weighted avg,0.53,0.62,0.55
samples avg,0.47,0.53,0.52


### Conclusions

In [85]:
# compare the f1 scores of the three best models
combined_report = pd.DataFrame(data={'f1-simple': simple_model_avg_report['f1-score'],
                                     'f1-xgb': xgb_model_avg_report['f1-score'],
                                     'f1-chain': chain_model_avg_report['f1-score']},
                               index=simple_model_avg_report.index)
combined_report

Unnamed: 0,f1-simple,f1-xgb,f1-chain
micro avg,0.6,0.69,0.69
macro avg,0.21,0.43,0.44
weighted avg,0.52,0.65,0.66
samples avg,0.47,0.53,0.53


In [101]:
# average values for the metrics of the best two models

compared_avg = pd.concat([xgb_model_avg_report.drop(columns=['support']), chain_model_avg_report.drop(columns=['support'])], axis=1)
compared_avg.columns=['p_xgb', 'r_xgb', 'f1_xgb', 'p_chain', 'r_chain', 'f1_chain']
#compared_avg.style.background_gradient(cmap='coolwarm')
compared_avg

Unnamed: 0,p_xgb,r_xgb,f1_xgb,p_chain,r_chain,f1_chain
micro avg,0.79,0.61,0.69,0.76,0.63,0.69
macro avg,0.65,0.36,0.43,0.6,0.38,0.44
weighted avg,0.75,0.61,0.65,0.73,0.63,0.66
samples avg,0.65,0.52,0.53,0.64,0.54,0.53


<div class="alert alert-block alert-info"> <b>NOTES:</b>
    <br>
Precision can be seen as a measure of quality, and recall as a measure of quantity. Higher precision means that an algorithm returns more relevant results than irrelevant ones, and high recall means that an algorithm returns most of the relevant results (whether or not irrelevant ones are also returned).

Therefore, if we compare the two models that got the best $F_1$ scores, the XGBClassifier with MultiOutput wrapper versus the XGBClassifier with the ClassifierChain wrapper, we notice that the former has better precision scores and we choose it as our final model.
</div>

### Export your model as a pickle file


In [36]:
# the best model from all the above is xgb_model
ada_model = ada_tuned_model

# export the xgb model as a pickle file
with open(path / 'models/ada_model.pkl', 'wb') as f:
    pickle.dump(ada_model, f)

In [37]:
# load the model from the pickle file
model = pickle.load(open(path/ 'models/ada_model.pkl', 'rb'))

In [38]:
# choose several messages to test the model on
X[1369]

"What can I do to go to Senegal right away because I won't be able to live here anymore"

In [39]:
category_predicted = model.predict([X[1369]])[0]
category_predicted

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [40]:
# link the output of the model with the category
def get_predictions(category_predicted):
    return [categories[i] for i in range(len(category_predicted)) if category_predicted[i] == 1]

In [41]:
# print the prediction(s)
get_predictions(category_predicted)

['related']

In [42]:
# choose a second message to test the model and get the predictions
print(X[14588])
print()
print(get_predictions(model.predict([X[14588]])[0]))

The subduction zone surrounding the immediate region of this event has not witnessed a megathrust earthquake in the recent past, rupturing last in an earthquake of M 8.5 or larger in 1797.Hayat said he had seen cases of disease and illness but nothing major.

['related', 'weather_related', 'earthquake']


#### The XGB model with the alternate dataset

In [95]:
# check the xgb model on the alternate database

# fit the model on the alternate dataset
xgb_model_opt = pipe_xgb.fit(X_opt_train, Y_opt_train)

# estimate on test set
Y_opt_pred_xgb = xgb_model_opt.predict(X_opt_test)

# get the classification report as a dataframe
xgb_model_opt_report = report_to_dataframe(Y_opt_test, Y_opt_pred_xgb, categories_opt)[0] 
xgb_model_opt_report.sort_values(by='f1-score', ascending=False)

Unnamed: 0,category,precision,recall,f1-score,accuracy,support
9,food,0.92,0.88,0.9,0.96,863.0
2,aid_related,0.87,0.91,0.89,0.83,3299.0
30,earthquake,0.95,0.85,0.89,0.97,746.0
26,weather_related,0.93,0.82,0.87,0.88,2200.0
8,water,0.85,0.78,0.82,0.96,482.0
28,storm,0.82,0.76,0.79,0.93,713.0
0,request,0.84,0.72,0.78,0.87,1359.0
27,floods,0.89,0.61,0.73,0.93,652.0
10,shelter,0.84,0.64,0.73,0.93,669.0
33,direct_report,0.78,0.66,0.71,0.82,1524.0


In [96]:
# averages the xgb and alternate dataset
xgb_model_opt_avg_report = report_to_dataframe(Y_opt_test, Y_opt_pred_xgb, categories_opt)[1] 
xgb_model_opt_avg_report

Unnamed: 0,precision,recall,f1-score,support
micro avg,0.84,0.63,0.72,19011.0
macro avg,0.73,0.43,0.51,19011.0
weighted avg,0.8,0.63,0.69,19011.0
samples avg,0.83,0.69,0.71,19011.0


In [97]:
print('XGB model scores:')
print(xgb_model_avg_report['f1-score'])
print('')
print('XGB model scores on alternate data:')
print(xgb_model_opt_avg_report['f1-score'])

XGB model scores:
micro avg       0.69
macro avg       0.43
weighted avg    0.65
samples avg     0.53
Name: f1-score, dtype: float64

XGB model scores on alternate data:
micro avg       0.72
macro avg       0.51
weighted avg    0.69
samples avg     0.71
Name: f1-score, dtype: float64


### Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.