This notebook is a preliminary build (prototype) of the ML pipeline that runs on the main messages database.

Importing the libraries needed and the dataframes

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
import sqlite3
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
import statistics




def load_data(db_path='Messages.db'):
    '''Load the messages table and split it into model inputs and labels.

    Parameters
    ----------
    db_path : str, optional
        Path of the SQLite database file. Defaults to 'Messages.db', the
        location that was previously hard-coded.

    Returns
    -------
    X : numpy.ndarray
        The raw text of the 'message' column.
    y : pandas.DataFrame
        Every remaining column except 'message', 'genre_news' and
        'genre_social'. The names come from `Index.difference`, which
        returns them sorted, so the label columns are in alphabetical order.
    '''

    # opening the connection and reading the database; try/finally guarantees
    # the connection is closed even when the query raises
    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql('SELECT * FROM Messages', conn)
    finally:
        conn.close()

    # the stored table carries an 'index' column that is not a feature
    df = df.drop(columns=['index'])

    # storing the database into X, y
    X = df['message'].values  # first scenario will ignore the genre feature
    y = df[df.columns.difference(['message', 'genre_news', 'genre_social'])]

    return X, y



# Load the message texts (X) and the label columns (y) used by the cells below.
X, y = load_data()

At first glance it seems better to treat each message as a document and build a document-term matrix. We may end up with a matrix that has too many columns, but we'll evaluate this later on. First we'll clean the text: normalize, tokenize, remove stop words and finally lemmatize.

In [2]:
stop_words = stopwords.words("english")
# Set copy of the stop words: tokenize() tests membership once per token, and
# a set lookup avoids scanning the whole list every time.
stop_word_set = frozenset(stop_words)
lemmatizer = WordNetLemmatizer()

def tokenize(text):
    '''Turn a raw message into a list of cleaned, lemmatized tokens.

    The text is lower-cased, every character that is not an ASCII letter is
    replaced by a space, the result is split with `word_tokenize`, English
    stop words are dropped, and each remaining token is lemmatized four times
    in sequence: with the lemmatizer's default part of speech, then as a verb,
    an adjective and an adverb.

    Parameters
    ----------
    text : str
        The message to process.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    '''
    # normalize case, remove punctuation and numbers
    text = re.sub(r"[^a-zA-Z]", " ", text.lower())

    tokens = []
    for word in word_tokenize(text):
        # remove stop words
        if word in stop_word_set:
            continue

        # default lemmatization first, then verbs, adjectives and adverbs;
        # each pass works on the output of the previous one
        lemma = lemmatizer.lemmatize(word)
        for pos in ('v', 'a', 'r'):
            lemma = lemmatizer.lemmatize(lemma, pos=pos)
        tokens.append(lemma)

    return tokens

In [3]:
def model_pipeline():
    '''Build the baseline text-classification pipeline.

    The steps are, in order: token counts produced with the custom `tokenize`
    function ('vect'), TF-IDF weighting ('tfidf'), and a multi-output wrapper
    around a random forest seeded with 42 ('clf'). All other settings are the
    library defaults.
    '''
    forest = RandomForestClassifier(random_state=42)
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(estimator=forest)),
    ]
    return Pipeline(steps)

First we look at how the default RandomForestClassifier behaves; then we will conduct a grid search to try to improve it.

In [4]:
# Seed passed to the train/test split below so the split is repeatable.
random_state=42
# Hold out 20% of the messages for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=random_state)

# Fit the default pipeline on the training split and predict the labels of the
# held-out messages; y_pred is what the scoring cells below evaluate.
model = model_pipeline()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

The evaluation on the model will be based on:
f1_score, accuracy, precision, recall and area under the roc curve.
Since this is a multi-output classifier, we need functions that report each of these scores per label. These functions are defined below.

In [5]:
def AUC_ROC(y_test,y_pred):
    '''Compute the area under the ROC curve separately for every label column.

    Parameters
    ----------
    y_test : pandas.DataFrame
        True labels, one column per label (columns are read with `.iloc`).
    y_pred : 2-D array
        Scores or predictions; column i is compared with column i of y_test.

    Returns
    -------
    list of float
        One AUC value per label column, in column order. The mean, maximum
        and minimum are also printed.
    '''
    # NOTE(review): the notebook passes the hard 0/1 output of predict() here,
    # not probabilities - confirm that is the intended input for an AUC.
    label_count = y_test.shape[1]
    auc = [
        roc_auc_score(y_test.iloc[:, col], y_pred[:, col])
        for col in range(label_count)
    ]

    mean_txt = "%.2f" % statistics.mean(auc)
    max_txt = "%.2f" % max(auc)
    min_txt = "%.2f" % min(auc)
    print('Mean AUC: ', mean_txt, 'Max AUC:', max_txt, 'Min AUC:', min_txt)
    return auc



def f1_score_labels(y_test,y_pred):
    '''Calculate and print the f1 score of every label column.

    Parameters
    ----------
    y_test : pandas.DataFrame
        True labels, one column per label.
    y_pred : 2-D array
        Predicted labels; column i is compared with column i of y_test.

    Returns
    -------
    list of float
        One f1 score per label column, in column order. The mean, maximum
        and minimum are also printed.
    '''

    f1_score_model = []
    # Label names are taken from y_test itself rather than from the global `y`,
    # so the function no longer depends on notebook state.
    for i, label in enumerate(y_test.columns):
        f1_score_column = f1_score(y_test.iloc[:, i], y_pred[:, i])
        f1_score_model.append(f1_score_column)
        print('The f1 score for',label,' was: ',"%.2f" % f1_score_column,'.')

    print('Mean f1 score: ',"%.2f" % statistics.mean(f1_score_model),'Max f1 score:',"%.2f" % max(f1_score_model),'Min f1 score:',"%.2f" % min (f1_score_model))
    return f1_score_model



def precision_score_labels(y_test,y_pred):
    '''Calculate and print the precision score of every label column.

    Parameters
    ----------
    y_test : pandas.DataFrame
        True labels, one column per label.
    y_pred : 2-D array
        Predicted labels; column i is compared with column i of y_test.

    Returns
    -------
    list of float
        One precision score per label column, in column order. The mean,
        maximum and minimum are also printed.
    '''

    precision_score_model = []
    # Label names are taken from y_test itself rather than from the global `y`,
    # so the function no longer depends on notebook state.
    for i, label in enumerate(y_test.columns):
        precision_score_column = precision_score(y_test.iloc[:, i], y_pred[:, i])
        precision_score_model.append(precision_score_column)
        print('The precision score for',label,' was: ',"%.2f" % precision_score_column,'.')

    print('Mean precision score: ',"%.2f" % statistics.mean(precision_score_model),'Max precision score:',"%.2f" % max(precision_score_model),'Min precision score:',"%.2f" % min (precision_score_model))
    return precision_score_model
    


def accuracy_score_labels (y_test,y_pred):
    '''Calculate and print the accuracy score of every label column.

    Parameters
    ----------
    y_test : pandas.DataFrame
        True labels, one column per label.
    y_pred : 2-D array
        Predicted labels; column i is compared with column i of y_test.

    Returns
    -------
    list of float
        One accuracy score per label column, in column order. The mean,
        maximum and minimum are also printed.
    '''

    accuracy_score_model = []
    # Label names are taken from y_test itself rather than from the global `y`,
    # so the function no longer depends on notebook state.
    for i, label in enumerate(y_test.columns):
        accuracy_score_column = accuracy_score(y_test.iloc[:, i], y_pred[:, i])
        accuracy_score_model.append(accuracy_score_column)
        print('The accuracy score for',label,' was: ',"%.2f" % accuracy_score_column,'.')

    print('Mean accuracy score: ',"%.2f" % statistics.mean(accuracy_score_model),'Max accuracy score:',"%.2f" % max(accuracy_score_model),'Min accuracy score:',"%.2f" % min (accuracy_score_model))
    return accuracy_score_model



def recall_score_labels (y_test,y_pred):
    '''Calculate and print the recall score of every label column.

    Parameters
    ----------
    y_test : pandas.DataFrame
        True labels, one column per label.
    y_pred : 2-D array
        Predicted labels; column i is compared with column i of y_test.

    Returns
    -------
    list of float
        One recall score per label column, in column order. The mean,
        maximum and minimum are also printed.
    '''

    recall_score_model = []
    # Label names are taken from y_test itself rather than from the global `y`,
    # so the function no longer depends on notebook state.
    for i, label in enumerate(y_test.columns):
        recall_score_column = recall_score(y_test.iloc[:, i], y_pred[:, i])
        recall_score_model.append(recall_score_column)
        print('The recall score for',label,' was: ',"%.2f" % recall_score_column,'.')

    print('Mean recall score: ',"%.2f" % statistics.mean(recall_score_model),'Max recall score:',"%.2f" % max(recall_score_model),'Min recall score:',"%.2f" % min (recall_score_model))
    return recall_score_model

In [6]:
# Per-label ROC AUC of the baseline predictions on the held-out split.
AUC_ROC(y_test,y_pred)

Mean AUC:  0.61 Max AUC: 0.90 Min AUC: 0.50


[0.4998723838693211,
 0.7250041984595332,
 0.5716914558019949,
 0.5945959169694078,
 0.5791808544036321,
 0.6395174020897542,
 0.7000298596595999,
 0.8970986840583733,
 0.5208788025318993,
 0.5086206896551724,
 0.7576626494337885,
 0.8488608594863163,
 0.5096153846153846,
 0.500500355595897,
 0.5702206915940399,
 0.5509168182324339,
 0.5781633507925902,
 0.5079365079365079,
 0.5296131029146247,
 0.5,
 0.5210582591970516,
 0.49933704587642536,
 0.5390043831683129,
 0.5503346688288976,
 0.7837048495294779,
 0.5174825174825175,
 0.4998710343048749,
 0.7142396274208881,
 0.5,
 0.7925445024306548,
 0.5,
 0.581768791972895,
 0.7595723304817448,
 0.8443566716013274]

In [7]:
# Per-label f1 score of the baseline predictions on the held-out split.
f1_score_labels(y_test,y_pred)

The f1 score for aid_centers  was:  0.00 .
The f1 score for aid_related  was:  0.76 .
The f1 score for buildings  was:  0.25 .
The f1 score for clothing  was:  0.30 .
The f1 score for cold  was:  0.27 .
The f1 score for death  was:  0.42 .
The f1 score for direct_report  was:  0.56 .
The f1 score for earthquake  was:  0.85 .
The f1 score for electricity  was:  0.08 .
The f1 score for fire  was:  0.03 .
The f1 score for floods  was:  0.66 .
The f1 score for food  was:  0.77 .
The f1 score for hospitals  was:  0.04 .
The f1 score for infrastructure_related  was:  0.01 .
The f1 score for medical_help  was:  0.24 .
The f1 score for medical_products  was:  0.18 .
The f1 score for military  was:  0.26 .
The f1 score for missing_people  was:  0.03 .
The f1 score for money  was:  0.11 .
The f1 score for offer  was:  0.00 .
The f1 score for other_aid  was:  0.09 .
The f1 score for other_infrastructure  was:  0.00 .
The f1 score for other_weather  was:  0.14 .
The f1 score for refugees  was:  0.

[0.0,
 0.7572684246112239,
 0.24605678233438485,
 0.30476190476190473,
 0.2698412698412698,
 0.4181184668989547,
 0.5604463732176069,
 0.8513800424628449,
 0.08000000000000002,
 0.03389830508474576,
 0.6601941747572815,
 0.774757281553398,
 0.03773584905660378,
 0.006191950464396285,
 0.2437137330754352,
 0.18367346938775508,
 0.2594594594594595,
 0.03125,
 0.11009174311926605,
 0.0,
 0.09226594301221167,
 0.0,
 0.14426229508196722,
 0.17801047120418848,
 0.6862091938707529,
 0.06756756756756757,
 0.0,
 0.5654135338345865,
 0.0,
 0.688757396449704,
 0.0,
 0.27536231884057966,
 0.6587771203155818,
 0.8056665455866328]

An f1_score of 0 for some labels suggests that those labels are imbalanced; conducting a grid search may give us further insight into this behaviour.

In [8]:
# Per-label precision of the baseline predictions on the held-out split.
precision_score_labels(y_test,y_pred)

The precision score for aid_centers  was:  0.00 .
The precision score for aid_related  was:  0.72 .
The precision score for buildings  was:  0.76 .
The precision score for clothing  was:  0.76 .
The precision score for cold  was:  0.89 .
The precision score for death  was:  0.80 .
The precision score for direct_report  was:  0.72 .
The precision score for earthquake  was:  0.90 .
The precision score for electricity  was:  0.83 .
The precision score for fire  was:  1.00 .
The precision score for floods  was:  0.90 .
The precision score for food  was:  0.84 .
The precision score for hospitals  was:  1.00 .
The precision score for infrastructure_related  was:  0.11 .
The precision score for medical_help  was:  0.65 .
The precision score for medical_products  was:  0.82 .
The precision score for military  was:  0.71 .
The precision score for missing_people  was:  1.00 .
The precision score for money  was:  0.67 .
The precision score for offer  was:  0.00 .
The precision score for other_aid

  _warn_prf(average, modifier, msg_start, len(result))


[0.0,
 0.718562874251497,
 0.7647058823529411,
 0.7619047619047619,
 0.8947368421052632,
 0.8,
 0.7197452229299363,
 0.9011235955056179,
 0.8333333333333334,
 1.0,
 0.8986784140969163,
 0.8382352941176471,
 1.0,
 0.1111111111111111,
 0.6494845360824743,
 0.8181818181818182,
 0.7058823529411765,
 1.0,
 0.6666666666666666,
 0.0,
 0.5573770491803278,
 0.0,
 0.6111111111111112,
 0.6538461538461539,
 0.7767722473604827,
 1.0,
 0.0,
 0.7800829875518672,
 0.0,
 0.7972602739726027,
 0.0,
 0.8085106382978723,
 0.8835978835978836,
 0.8485080336648814]

In [9]:
# Per-label accuracy of the baseline predictions on the held-out split.
accuracy_score_labels (y_test,y_pred)

The accuracy score for aid_centers  was:  0.98 .
The accuracy score for aid_related  was:  0.73 .
The accuracy score for buildings  was:  0.94 .
The accuracy score for clothing  was:  0.98 .
The accuracy score for cold  was:  0.98 .
The accuracy score for death  was:  0.96 .
The accuracy score for direct_report  was:  0.82 .
The accuracy score for earthquake  was:  0.96 .
The accuracy score for electricity  was:  0.97 .
The accuracy score for fire  was:  0.99 .
The accuracy score for floods  was:  0.95 .
The accuracy score for food  was:  0.94 .
The accuracy score for hospitals  was:  0.99 .
The accuracy score for infrastructure_related  was:  0.92 .
The accuracy score for medical_help  was:  0.90 .
The accuracy score for medical_products  was:  0.94 .
The accuracy score for military  was:  0.97 .
The accuracy score for missing_people  was:  0.98 .
The accuracy score for money  was:  0.98 .
The accuracy score for offer  was:  0.99 .
The accuracy score for other_aid  was:  0.83 .
The ac

[0.9849132511943676,
 0.7291928589388986,
 0.9399044505908977,
 0.981644455619814,
 0.976866985164697,
 0.9580085491576565,
 0.8217249182801106,
 0.9647975861201911,
 0.9710837314558712,
 0.9856675886346492,
 0.9471963791802867,
 0.9416645712848881,
 0.9871762635152125,
 0.9192858938898667,
 0.9016846869499623,
 0.9396530047774705,
 0.9655519235604727,
 0.9844103595675132,
 0.975609756097561,
 0.9937138546643198,
 0.8317827508171989,
 0.9469449333668595,
 0.9343726426954991,
 0.9605230072919286,
 0.8815690218757858,
 0.9653004777470455,
 0.9746039728438521,
 0.9273321599195373,
 0.993965300477747,
 0.9338697510686447,
 0.9934624088508927,
 0.9497108373145587,
 0.9564998742770933,
 0.8654764898164445]

In [10]:
# Per-label recall of the baseline predictions on the held-out split.
recall_score_labels (y_test,y_pred)

The recall score for aid_centers  was:  0.00 .
The recall score for aid_related  was:  0.80 .
The recall score for buildings  was:  0.15 .
The recall score for clothing  was:  0.19 .
The recall score for cold  was:  0.16 .
The recall score for death  was:  0.28 .
The recall score for direct_report  was:  0.46 .
The recall score for earthquake  was:  0.81 .
The recall score for electricity  was:  0.04 .
The recall score for fire  was:  0.02 .
The recall score for floods  was:  0.52 .
The recall score for food  was:  0.72 .
The recall score for hospitals  was:  0.02 .
The recall score for infrastructure_related  was:  0.00 .
The recall score for medical_help  was:  0.15 .
The recall score for medical_products  was:  0.10 .
The recall score for military  was:  0.16 .
The recall score for missing_people  was:  0.02 .
The recall score for money  was:  0.06 .
The recall score for offer  was:  0.00 .
The recall score for other_aid  was:  0.05 .
The recall score for other_infrastructure  was: 

[0.0,
 0.800381133873273,
 0.14661654135338345,
 0.19047619047619047,
 0.1588785046728972,
 0.2830188679245283,
 0.4588832487309645,
 0.806841046277666,
 0.04201680672268908,
 0.017241379310344827,
 0.5217391304347826,
 0.720216606498195,
 0.019230769230769232,
 0.0031847133757961785,
 0.15,
 0.10344827586206896,
 0.15894039735099338,
 0.015873015873015872,
 0.06,
 0.0,
 0.05029585798816568,
 0.0,
 0.08178438661710037,
 0.10303030303030303,
 0.6145584725536993,
 0.03496503496503497,
 0.0,
 0.44339622641509435,
 0.0,
 0.60625,
 0.0,
 0.16593886462882096,
 0.5251572327044025,
 0.7669432918395575]

In [11]:
# Confusion matrices for the label columns at positions 0 and 1
# (note the variable names are offset by one from the column positions).
cm_y1, cm_y2 = (
    confusion_matrix(y_test.iloc[:, col], y_pred[:, col])
    for col in (0, 1)
)

# Text classification reports for a hand-picked set of label columns; the
# number in each variable name is the column position.
(cr_y0, cr_y9, cr_y13, cr_y19, cr_y21,
 cr_y26, cr_y28, cr_y30, cr_y31) = (
    classification_report(y_test.iloc[:, col], y_pred[:, col])
    for col in (0, 9, 13, 19, 21, 26, 28, 30, 31)
)

  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# Show the report for the label column at position 31.
print (cr_y31)

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      3748
           1       0.81      0.17      0.28       229

    accuracy                           0.95      3977
   macro avg       0.88      0.58      0.62      3977
weighted avg       0.94      0.95      0.93      3977



Overall the results look promising. We will now conduct a grid search to check what can be improved.

In [13]:
# List the tunable CountVectorizer parameters. get_params() is an instance
# method, so call it on a default instance instead of passing the class as `self`.
CountVectorizer().get_params().keys()



dict_keys(['analyzer', 'binary', 'decode_error', 'dtype', 'encoding', 'input', 'lowercase', 'max_df', 'max_features', 'min_df', 'ngram_range', 'preprocessor', 'stop_words', 'strip_accents', 'token_pattern', 'tokenizer', 'vocabulary'])

In [14]:
# List the tunable TfidfTransformer parameters. get_params() is an instance
# method, so call it on a default instance instead of passing the class as `self`.
TfidfTransformer().get_params().keys()

dict_keys(['norm', 'smooth_idf', 'sublinear_tf', 'use_idf'])

In [15]:
# List the parameters of the random forest that MultiOutputClassifier wraps.
# Inside the pipeline they are addressed as 'clf__estimator__<name>'.
RandomForestClassifier().get_params().keys()

dict_keys(['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [16]:
# List GridSearchCV's own constructor parameters. get_params() needs an
# instance, so build a throwaway search; deep=False leaves out the nested
# estimator's parameters.
GridSearchCV(RandomForestClassifier(), param_grid={}).get_params(deep=False).keys()

dict_keys(['cv', 'error_score', 'estimator', 'iid', 'n_jobs', 'param_grid', 'pre_dispatch', 'refit', 'return_train_score', 'scoring', 'verbose'])

In [19]:
def build_model():
    '''Create the grid-search object around the text-classification pipeline.

    The pipeline has the same three steps as the baseline ('vect', 'tfidf',
    'clf'); the random forest's seed is supplied through the parameter grid
    rather than the constructor. The search is returned unfitted.

    Returns
    -------
    GridSearchCV
        Configured with verbose=1 and n_jobs=3.
    '''
    text_clf = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(estimator=RandomForestClassifier())),
    ])

    # parameters for the grid search; the commented-out entries belong to a
    # wider grid that is currently disabled
    search_space = {
        #'vect__ngram_range': ((1, 1), (1, 2)),
        #'vect__max_df': (0.5, 0.75, 1.0),
        #'vect__max_features': (None, 5000, 10000),
        #'tfidf__use_idf': (True, False),
        'clf__estimator__n_estimators': [50, 100],
        #'clf__estimator__min_samples_split': [2, 3, 4],
        'clf__estimator__random_state': [42],
    }

    # create and hand back the grid search object
    return GridSearchCV(text_clf, param_grid=search_space, verbose=1, n_jobs=3)

As a side note, a grid search with more parameters was originally set up, but due to time restrictions, and because our goal isn't a perfect model, it was decided to limit the parameters in order to advance in the project.

In [20]:
# Run the grid search on the training split and predict the held-out messages.
# Note: this rebinds y_pred, replacing the baseline predictions made earlier.
cv = build_model()
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed: 18.1min finished


In [21]:
# Parameter combination selected by the grid search.
cv.best_params_

{'clf__estimator__n_estimators': 100, 'clf__estimator__random_state': 42}

This is the same model that was tested before, and since it was already evaluated, we can proceed with the implementation.