# ML Pipeline Preparation
## 1. Import libraries and load data from database.

In [None]:
# Import python libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

import numpy as np
import pandas as pd
import pickle

from pprint import pprint

import re
import sys


#Import sklearn
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sqlalchemy import create_engine

# List the Kaggle data sources: walk '/kaggle/input' recursively and print
# the full path of every file found below it.
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Combine directory and file name into the complete path
        print(os.path.join(dirname, filename))

In [None]:
# Load data from database
# Open the SQLite database file and read the whole 'DisasterResponse' table into a DataFrame.
engine = create_engine('sqlite:////kaggle/input/disasterresponse/DisasterResponse.db')
df = pd.read_sql_table('DisasterResponse', con=engine)

# Every column from position 4 onwards is used as a target label.
# NOTE(review): presumably the first four columns are non-label fields
# (such as an id and the message text) — confirm against the table schema.
categories = df.columns[4:]

# X: 1-D array holding the 'message' column; y: 2-D array with one column per category.
X = df[['message']].values[:, 0]
y = df[categories].values

# Preview the first rows of the loaded table
df.head()

In [None]:
# Inspect the first element of 'X' (the first entry of the 'message' column)
X[0]


In [None]:
# Inspect the first row of 'y' (the category values belonging to the first message)
y[0]

## 2. Normalize, lemmatize and tokenize text data

In [None]:
# Get an overview of the English stopwords
# (the same word list that the tokenize function below filters out of each message)
print(stopwords.words('english'))

In [None]:
# Define tokenize function to reduce message complexity

# Raw string: the pattern contains `\(` and `\)`, which are invalid escape
# sequences in a normal string literal and trigger a warning on newer Pythons.
# The pattern handed to the regex engine is unchanged.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

# Load the stop-word list once and keep it as a set. Calling
# stopwords.words('english') for every token re-reads the corpus and scans a
# list each time, which dominates the run time when vectorizing many messages.
_STOP_WORDS = frozenset(stopwords.words('english'))


def tokenize(text):
    """
    Tokenizes text data

    Processing steps: replace every detected URL with the token
    'urlplaceholder', lowercase the text, replace every character that is not
    a letter or digit with a space, split into word tokens, drop English
    stopwords and lemmatize the remaining tokens.

    Arguments:
    text str: Messages as text data

    Returns:
    clean_tokens: Processed text after normalizing, tokenizing and lemmatizing
                  (list of str, in the order the tokens appear in the message)
    """

    # Detect URLs and replace each of them with a fixed placeholder
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, 'urlplaceholder')

    # Normalize and tokenize
    tokens = nltk.word_tokenize(re.sub(r"[^a-zA-Z0-9]", " ", text.lower()))

    # Remove stopwords (set lookup instead of rebuilding the list per token)
    tokens = [t for t in tokens if t not in _STOP_WORDS]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    clean_tokens = [lemmatizer.lemmatize(tok).lower().strip() for tok in tokens]

    return clean_tokens

In [None]:
# Check the results after tokenization:
# print each of the first six messages followed by its list of tokens
for message in X[:6]:
    tokens = tokenize(message)
    print(message)
    # The extra '\n' argument leaves a blank line between messages
    print(tokens, '\n')

## 3. Build a machine learning pipeline

In [None]:
# Assemble the machine learning pipeline:
# token counts (custom tokenizer) -> TF-IDF weighting -> multi-output random forest
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(
        estimator=RandomForestClassifier(class_weight='balanced')
    )),
])

## 4. Train pipeline

In [None]:
# Split dataset into test and training parts
# train_test_split is called with its defaults, so no random_state is fixed here.
# NOTE(review): presumably the split (and therefore the reported metrics)
# varies between runs — confirm whether a fixed seed is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Train classifier
pipeline.fit(X_train, y_train)

## 5. Test model

In [None]:
# Report accuracy, precision, recall and f1 score for each output category of the dataset.

def classification_report_output(y_true, y_pred):
    """
    Prints accuracy, precision, recall and f1 score for each output category.

    The category names come from the module-level `categories`; column i of
    both arrays is reported under the i-th category name. Precision, recall
    and f1 score are computed with average='weighted'.

    Arguments:
    y_true: True labels, indexed as y_true[:, i] for category i
    y_pred: Predicted labels, indexed the same way as y_true

    Returns:
    None (the metrics are printed)
    """
    line_template = "\tAccuracy: {:.4f}\t\t% Precision: {:.4f}\t\t% Recall: {:.4f}\t\t% F1_score: {:.4f}"

    for column, category in enumerate(categories):
        print(category)

        # Labels of the current category only
        truth = y_true[:, column]
        predicted = y_pred[:, column]

        accuracy = accuracy_score(truth, predicted)
        precision = precision_score(truth, predicted, average='weighted')
        recall = recall_score(truth, predicted, average='weighted')
        f1 = f1_score(truth, predicted, average='weighted')

        print(line_template.format(accuracy, precision, recall, f1))


In [None]:
# Report metrics of the trained pipeline on the training data
# (the same data the pipeline was fitted on)
y_pred = pipeline.predict(X_train)
classification_report_output(y_train, y_pred)

In [None]:
# Report metrics of the trained pipeline on the held-out test data
y_pred = pipeline.predict(X_test)
classification_report_output(y_test, y_pred)

## 6. Improve model

In [None]:
# Grid search over the vectorizer settings (n-gram range and max_df)
# to find the best parameter combination for the pipeline
parameters = {
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_df': (0.75, 1.0),
}

cv = GridSearchCV(pipeline, parameters)
cv.fit(X_train, y_train)

## 7. Test the model

In [None]:
# Report metrics of the tuned model on the training data.
# Predict with the fitted grid search `cv` (it predicts with the best
# estimator found) instead of `pipeline`: GridSearchCV fits clones of its
# estimator, so `pipeline` is still the untuned model trained in section 4
# and would only reproduce the metrics already reported in section 5.
y_pred = cv.predict(X_train)
classification_report_output(y_train, y_pred)

In [None]:
# Report metrics of the tuned model on the held-out test data.
# Predict with the fitted grid search `cv` (it predicts with the best
# estimator found) instead of `pipeline`: GridSearchCV fits clones of its
# estimator, so `pipeline` is still the untuned model trained in section 4
# and would only reproduce the metrics already reported in section 5.
y_pred = cv.predict(X_test)
classification_report_output(y_test, y_pred)

## 8. Export the model as a pickle file

In [None]:
# Export model
# Serialize the fitted grid-search object `cv` to a pickle file in the working directory.
# NOTE(review): the file name says "adaboost", but the pipeline built in this
# notebook uses RandomForestClassifier — confirm the name is intended.
# NOTE(review): the pipeline references the custom `tokenize` function;
# presumably that function must be defined wherever the pickle is loaded — verify.
with open('adaboost_cv.pkl', 'wb') as file:
    pickle.dump(cv, file)