In [None]:
import re
import sys
import warnings

import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, fbeta_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sqlalchemy import create_engine

nltk.download(['punkt', 'wordnet'])
nltk.download('omw-1.4')

warnings.filterwarnings('ignore')  # "error", "ignore", "always", "default", "module" or "once"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
def load_data(database_filepath):
    """
    Load the `df` table from a SQLite database and split it into model
    inputs and targets.

    Arguments:
        database_filepath -> Path to the SQLite database file
    Output:
        X -> Series holding the `message` column
        Y -> DataFrame holding every column from position 4 onward
             (taken after `child_alone` has been dropped)
        categories -> List with the column names of Y
    """
    engine = create_engine(f"sqlite:///{database_filepath}")
    try:
        df = pd.read_sql("SELECT * FROM df", engine)
    finally:
        # Release pooled connections so the database file is not kept open
        # after the data has been read (or after a failed read).
        engine.dispose()

    # Drop child_alone: per the original author's note it contains only zeros.
    df = df.drop(['child_alone'], axis=1)
    X = df.message
    # NOTE(review): assumes the first four columns are non-label metadata and
    # everything after them is a category column — confirm against the ETL step.
    Y = df.iloc[:, 4:]
    categories = Y.columns.tolist()

    return X, Y, categories

In [None]:
def tokenize(text):
    """
    Turn a raw message into a list of normalized tokens.

    URLs are replaced by the literal placeholder `url_place_holder`, the text
    is split with `word_tokenize`, and every token is lower-cased, stripped
    and then lemmatized.

    Arguments:
        text -> Text message which needs to be tokenized
    Output:
        clean_tokens -> List of tokens extracted from the provided text
    """
    # Raw string: the pattern contains `\(` and `\)`, which are invalid escape
    # sequences in a normal string literal. The compiled regex is unchanged.
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # Substitute each match exactly once. Replacing the detected strings one by
    # one with str.replace would mangle a longer URL that starts with a shorter
    # URL found earlier in the same text.
    text = re.sub(url_regex, 'url_place_holder', text)

    lemmatizer = WordNetLemmatizer()

    # Lower-case BEFORE lemmatizing: the lemmatizer is called on the normalized
    # form so that capitalized words are reduced the same way as lower-case ones.
    clean_tokens = [
        lemmatizer.lemmatize(token.lower().strip())
        for token in word_tokenize(text)
    ]

    return clean_tokens

In [None]:
def build_model():
    """
    Assemble the unfitted NLP classification pipeline.

    Output:
        pipeline -> Pipeline with two steps:
                    'features'   - FeatureUnion holding a single 'text_pipeline'
                                   (CountVectorizer using `tokenize`, followed
                                   by TfidfTransformer)
                    'classifier' - RandomForestClassifier wrapped in
                                   MultiOutputClassifier
    """
    # Token counts produced with the custom tokenizer, re-weighted by TF-IDF.
    text_pipeline = Pipeline([
        ('count_vectorizer', CountVectorizer(tokenizer=tokenize)),
        ('tfidf_transformer', TfidfTransformer()),
    ])

    # The union currently has one member; the step names are kept as-is
    # because parameter-grid keys are derived from them.
    feature_union = FeatureUnion([
        ('text_pipeline', text_pipeline),
    ])

    multi_output_forest = MultiOutputClassifier(RandomForestClassifier())

    return Pipeline([
        ('features', feature_union),
        ('classifier', multi_output_forest),
    ])

In [None]:
def evaluate_model(model, X_test, Y_test, category_names):
    """
    Print one classification report per target column.

    Arguments:
        model -> Fitted estimator whose `predict` returns a 2-D array with one
                 column per target
        X_test -> Inputs to predict on
        Y_test -> DataFrame of true labels, one column per target
        category_names -> Names of the targets, in the same order as the
                          columns of Y_test; used as the heading of each report
    """
    Y_predict = model.predict(X_test)

    # Iterating a DataFrame yields its column labels; `i` selects the matching
    # column of the prediction array.
    for i, col in enumerate(Y_test):
        # Label every report so it can be matched to its category.
        print(f"Category: {category_names[i]}")
        print(classification_report(Y_test[col], Y_predict[:, i]))

In [None]:
# Location of the SQLite database produced by the data-preparation step.
DATABASE_FILEPATH = '/content/DisasterResponse.db'
# Fixed seed so the train/test split, and therefore the reported scores,
# can be reproduced from a fresh kernel.
RANDOM_SEED = 42

X, Y, category_names = load_data(DATABASE_FILEPATH)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=RANDOM_SEED
)

print("Building model...")
model = build_model()

print("Training model...")
model.fit(X_train, Y_train) 

print("Evaluating model...")
evaluate_model(model, X_test, Y_test, category_names)

Building model...
Training model...
Evaluating model...
              precision    recall  f1-score   support

           0       0.75      0.28      0.40      1193
           1       0.81      0.97      0.89      4012
           2       0.50      0.08      0.13        39

    accuracy                           0.81      5244
   macro avg       0.69      0.44      0.47      5244
weighted avg       0.80      0.81      0.77      5244

              precision    recall  f1-score   support

           0       0.90      0.99      0.94      4370
           1       0.88      0.47      0.61       874

    accuracy                           0.90      5244
   macro avg       0.89      0.73      0.78      5244
weighted avg       0.90      0.90      0.89      5244

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5209
           1       0.00      0.00      0.00        35

    accuracy                           0.99      5244
   macro avg      

In [None]:
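# NOTE: unused draft grid, kept for reference only. Its keys use the step
# names 'vect' and 'clf', which do not exist in the pipeline returned by
# build_model() (its steps are 'features' and 'classifier'); the grid that is
# actually used is defined in the next cell.
# parameters = {
#         #'vect__ngram_range': ((1, 1), (1, 2)),
#         'clf__estimator__n_estimators': [5, 10]
#         #'clf__estimator__min_samples_split': [2, 3, 4]
#     }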

In [None]:
def mean_f1_micro(estimator, X, y):
    """
    Scorer for GridSearchCV: micro-averaged F1 computed separately for every
    target column and then averaged over the columns.

    The built-in 'f1_micro' scorer cannot be used here: the evaluation output
    above shows three classes (0, 1, 2) in the first target column, so the
    target matrix is multiclass-multioutput, which the F1 metrics do not
    accept in a single call. Depending on the scikit-learn version that
    failure either raises or is turned into NaN scores for every candidate,
    and the warning is hidden by the global `warnings.filterwarnings('ignore')`.

    Note: for a single-label column, micro-averaged F1 equals accuracy.

    Arguments:
        estimator -> Fitted candidate pipeline
        X -> Validation inputs
        y -> True labels for the validation inputs, one column per target
    Output:
        Mean of the per-column micro-F1 scores (float)
    """
    y_true = pd.DataFrame(y)
    y_pred = estimator.predict(X)
    column_scores = [
        # fbeta_score with beta=1 is the F1 score.
        fbeta_score(y_true.iloc[:, i], y_pred[:, i], beta=1, average='micro')
        for i in range(y_true.shape[1])
    ]
    return sum(column_scores) / len(column_scores)


parameters_grid = {'classifier__estimator__max_depth': [3, None],
                   'classifier__estimator__n_estimators': [10, 20, 40]}
cv = GridSearchCV(
    model,
    param_grid=parameters_grid,
    scoring=mean_f1_micro,
    n_jobs=-1,
    # Surface fit/scoring failures instead of recording NaN and carrying on.
    error_score='raise',
)
cv.fit(X_train, Y_train)

GridSearchCV(estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('text_pipeline',
                                                                        Pipeline(steps=[('count_vectorizer',
                                                                                         CountVectorizer(tokenizer=<function tokenize at 0x7f0ec9d6ccb0>)),
                                                                                        ('tfidf_transformer',
                                                                                         TfidfTransformer())]))])),
                                       ('classifier',
                                        MultiOutputClassifier(estimator=RandomForestClassifier()))]),
             n_jobs=-1,
             param_grid={'classifier__estimator__max_depth': [3, None],
                         'classifier__estimator__n_estimators': [10, 20, 40]},
             scoring='f1_micro')

In [None]:
# Report per-category test metrics for the estimator refit by the grid search,
# through the same helper used for the baseline model so that both sets of
# reports are produced identically.
evaluate_model(cv, X_test, Y_test, category_names)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1193
           1       0.77      1.00      0.87      4012
           2       0.00      0.00      0.00        39

    accuracy                           0.77      5244
   macro avg       0.26      0.33      0.29      5244
weighted avg       0.59      0.77      0.66      5244

              precision    recall  f1-score   support

           0       0.83      1.00      0.91      4370
           1       0.00      0.00      0.00       874

    accuracy                           0.83      5244
   macro avg       0.42      0.50      0.45      5244
weighted avg       0.69      0.83      0.76      5244

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5209
           1       0.00      0.00      0.00        35

    accuracy                           0.99      5244
   macro avg       0.50      0.50      0.50      5244
weighted avg       0

In [None]:
# List every tunable parameter name of the pipeline (nested steps included);
# these are the names accepted as keys in a GridSearchCV parameter grid.
model.get_params(deep=True).keys()

dict_keys(['memory', 'steps', 'verbose', 'features', 'classifier', 'features__n_jobs', 'features__transformer_list', 'features__transformer_weights', 'features__verbose', 'features__text_pipeline', 'features__text_pipeline__memory', 'features__text_pipeline__steps', 'features__text_pipeline__verbose', 'features__text_pipeline__count_vectorizer', 'features__text_pipeline__tfidf_transformer', 'features__text_pipeline__count_vectorizer__analyzer', 'features__text_pipeline__count_vectorizer__binary', 'features__text_pipeline__count_vectorizer__decode_error', 'features__text_pipeline__count_vectorizer__dtype', 'features__text_pipeline__count_vectorizer__encoding', 'features__text_pipeline__count_vectorizer__input', 'features__text_pipeline__count_vectorizer__lowercase', 'features__text_pipeline__count_vectorizer__max_df', 'features__text_pipeline__count_vectorizer__max_features', 'features__text_pipeline__count_vectorizer__min_df', 'features__text_pipeline__count_vectorizer__ngram_range', '

In [None]:
# import numpy as np
# pred = model.predict(X_test)
# pred = np.argmax(pred, axis=1)
# pred.shape
# label
# Y_test = np.argmax(Y_test, axis=1)
# Y_test
# print(Y_test.shape, pred.shape)
# print(classification_report(Y_test, pred, target_names=category_names))
# print(Y_test[:5], pred[:5])
# pred[:5]
# np.array(Y_test[:5])
# np.array(pred[:5])

In [None]:
# Y_predict = model.predict(X_test)
# for i, col in enumerate(Y_test):
#     print(classification_report(Y_test[col], Y_predict[:, i]))

In [None]:
#Model accuracy score on test set
# Y_test_accuracy = (y_prediction_test == Y_test).mean()
# print(Y_test_accuracy)    