# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [1]:
# download necessary NLTK data 
import nltk
nltk.download(['punkt', 'wordnet','stopwords'])
import sqlite3

# import libraries
import re 
import numpy as np
import pandas as pd
import pickle
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score                            
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [47]:
# load data from database
def load_data(database_filepath):
    
    '''
    Function that takes a table from database and provides array of messages and categories 
    
    Input: database_filepath: The path of sql database
    Output: X: Messages, y: Categories, category_names: Labels for categories
    
    '''
    
    conn = sqlite3.connect(database_filepath)
    df = pd.read_sql("SELECT * from tidy_dataset", conn)
    conn.close()
    
    # define features and label arrays
    X = df.iloc[:,1].values
    y = df.iloc[:,3:].values
    category_names = list(df.iloc[:,3:].columns)

    return X, y, category_names

In [24]:
# Function from ETL script

def clean_data():
    
    '''
    Function that reads messages and categories files, merges and cleans the data and loads it to sql database
    
    Input: -
    Output:-
    
    '''
    
    # read in file
    messages = pd.read_csv('messages.csv')             
    categories = pd.read_csv('categories.csv')        
    
     # merge datasets
    df = messages.merge(categories, how='outer',on='id')
    
    # create a dataframe of the 36 individual category columns
    categories = df['categories'].str.split(pat=';',expand=True)
    
    # select the first row of the categories dataframe
    row = categories.iloc[0,]

    # extract a list of new column names for categories.
    category_colnames = list(map(lambda x: x[:-2], row))

    # rename the columns of `categories`
    categories.columns = category_colnames

    # set each value to be the last character of the string
    for column in categories:
        categories[column] =  categories[column].apply(lambda x: x[-1])
    
    # convert column from string to numeric
    categories[column] = pd.to_numeric(categories[column]) 

    # drop the original, categories column from `df`
    df.drop(labels=['categories','original'], axis=1, inplace=True)
    
    # concatenate the original dataframe with the new `categories` dataframe
    df = pd.concat([df,categories], axis=1)

    # Converting category columns to numeric
    df.iloc[:,3:] = df.iloc[:,3:].apply(pd.to_numeric)

    # drop duplicates
    df.drop_duplicates(inplace=True)
    
    # removing rows labelled as 2
    df.drop(df[df['related']==2].index, inplace=True)

    # load to database
    engine = create_engine('sqlite:///disaster_response.db')
    df.to_sql('tidy_dataset', engine, if_exists='replace', index=False)


### 2. Write a tokenization function to process your text data

In [25]:
def tokenize(text):    # This function will be used in the pipeline

    '''
    Function that takes a text, cleans and lemmatizes it and returns clean tokens
    
    Input: text: array of messages
    Output: clean tokens : clean and lemmatized tokens
    
    '''
    
    # Remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]"," ",text)
    
    # tokenize text
    tokens = word_tokenize(text)
    
    # initiate stop words
    stop_words = stopwords.words("english")
    
    # remove stop words
    tokens = [t for t in tokens if t not in stop_words]
    
    # initiate lemmatizer
    lemmatizer = WordNetLemmatizer()

    # iterate through each token
    clean_tokens = []
    for tok in tokens:   
        # lemmatize, normalize case, and remove leading/trailing white space
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

In [26]:
# testing load_data and tokenize functions

test_X, test_y, test_labels = load_data('disaster_response.db')

for message in test_X[:5]:
    tokens = tokenize(message)
    print(message)
    print(tokens, '\n')

Weather update - a cold front from Cuba that could pass over Haiti
['weather', 'update', 'cold', 'front', 'cuba', 'could', 'pas', 'haiti'] 

Is the Hurricane over or is it not over
['is', 'hurricane'] 

Looking for someone but no name
['looking', 'someone', 'name'] 

UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.
['un', 'report', 'leogane', '80', '90', 'destroyed', 'only', 'hospital', 'st', 'croix', 'functioning', 'needs', 'supply', 'desperately'] 

says: west side of Haiti, rest of the country today and tonight
['say', 'west', 'side', 'haiti', 'rest', 'country', 'today', 'tonight'] 



### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [48]:
# Build a pipeline, noted classes are imbalanced, used n_jobs = -1 to improve processing speeds
pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),('tfidf',TfidfTransformer()), 
                     ('clf',MultiOutputClassifier(RandomForestClassifier(class_weight='balanced',n_jobs=-1)))])

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [49]:
# Getting X, y from load_data()
X,y,category_names = load_data('disaster_response.db')

# Perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

In [50]:
# Train classifier
pipeline.fit(X_train,y_train)

# Predict on test data
y_pred = pipeline.predict(X_test)

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [51]:
# Evaluating the model
class_report = classification_report(y_test, y_pred, target_names=category_names)
print(class_report)

                        precision    recall  f1-score   support

               related       0.86      0.94      0.90      4004
               request       0.80      0.51      0.63       878
                 offer       0.00      0.00      0.00        23
           aid_related       0.75      0.72      0.74      2164
          medical_help       0.77      0.10      0.17       424
      medical_products       0.78      0.09      0.15       244
     search_and_rescue       1.00      0.01      0.03       140
              security       0.00      0.00      0.00        95
              military       0.33      0.02      0.05       163
           child_alone       0.00      0.00      0.00         0
                 water       0.77      0.34      0.47       323
                  food       0.85      0.51      0.64       556
               shelter       0.87      0.28      0.42       452
              clothing       0.86      0.07      0.13        83
                 money       0.80      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 6. Improve your model
Use grid search to find better parameters. 

In [59]:
# Checking pipeline hyperparameters
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'vect', 'tfidf', 'clf', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'clf__estimator__bootstrap', 'clf__estimator__ccp_alpha', 'clf__estimator__class_weight', 'clf__estimator__criterion', 'clf__estimator__max_depth', 'clf__estimator__max_features', 'clf__estimator__max_leaf_nodes', 'clf__estimator__max_samples', 'clf__estimator__min_impurity_decrease', 'clf__estimator__min_impurity_split', 'clf__estimator__min_samples_leaf', 'clf__estimator__min_samples_split', 'clf__estimator__min_weight_fraction_leaf', 'clf__estimator__n_estimators', 'clf__estimator__n_jobs', 'clf__estimator__oob_score', 'clf_

In [63]:
# hyperparameter tuning, using f1 score as scoring method   
parameters =  {'clf__estimator__max_depth': [3,4,5],
               'clf__estimator__min_samples_split': [3,5,7]}

cv = GridSearchCV(pipeline, param_grid=parameters,scoring = 'f1_micro')

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [64]:
# Train grid search classifier
cv.fit(X_train,y_train)

# Predict on test data
y_pred = cv.predict(X_test)

# Evaluate the model
class_report = classification_report(y_test, y_pred, target_names=category_names)
print(class_report)

                        precision    recall  f1-score   support

               related       0.92      0.66      0.77      4004
               request       0.54      0.72      0.61       878
                 offer       0.00      0.00      0.00        23
           aid_related       0.72      0.63      0.67      2164
          medical_help       0.38      0.54      0.44       424
      medical_products       0.25      0.56      0.35       244
     search_and_rescue       0.20      0.44      0.27       140
              security       0.09      0.21      0.12        95
              military       0.33      0.71      0.45       163
           child_alone       0.00      0.00      0.00         0
                 water       0.39      0.78      0.52       323
                  food       0.53      0.77      0.62       556
               shelter       0.39      0.72      0.50       452
              clothing       0.24      0.48      0.32        83
                 money       0.22      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [65]:
# Build a pipeline using KNN classifier
pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),('tfidf',TfidfTransformer()), 
                     ('knn',MultiOutputClassifier(KNeighborsClassifier(n_jobs=-1)))])  

In [66]:
# Checking pipeline hyperparameters
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'vect', 'tfidf', 'knn', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'knn__estimator__algorithm', 'knn__estimator__leaf_size', 'knn__estimator__metric', 'knn__estimator__metric_params', 'knn__estimator__n_jobs', 'knn__estimator__n_neighbors', 'knn__estimator__p', 'knn__estimator__weights', 'knn__estimator', 'knn__n_jobs'])

In [67]:
# Using grid search to find better parameters
parameters =  {'knn__estimator__n_neighbors': [3,5,7],      
               'knn__estimator__p': [1,2]}

cv = GridSearchCV(pipeline, param_grid=parameters,scoring = 'f1_micro')


# Train grid search classifier
cv.fit(X_train,y_train)

# Predict on test data
y_pred = cv.predict(X_test)

# Evaluating the model
class_report = classification_report(y_test, y_pred, target_names=category_names)
print(class_report)

                        precision    recall  f1-score   support

               related       0.83      0.93      0.88      4004
               request       0.74      0.46      0.56       878
                 offer       0.00      0.00      0.00        23
           aid_related       0.73      0.46      0.56      2164
          medical_help       0.62      0.07      0.12       424
      medical_products       0.69      0.11      0.19       244
     search_and_rescue       0.62      0.04      0.07       140
              security       0.00      0.00      0.00        95
              military       0.77      0.10      0.18       163
           child_alone       0.00      0.00      0.00         0
                 water       0.67      0.19      0.29       323
                  food       0.72      0.29      0.42       556
               shelter       0.70      0.18      0.28       452
              clothing       0.71      0.14      0.24        83
                 money       0.71      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 9. Export your model as a pickle file

In [None]:
# save model  
pickled_filename = 'trained_model.pkl'
pickle.dump(cv, open(pickled_filename, 'wb'))

### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.

In [None]:
# Load data and split into train/test sets
X, y, test_labels = load_data('disaster_response.db')

def build_model():
    
    '''
    Function that uses a ML pipeline and grid search to return the best model 
    
    Input: -
    Output: model: best classification model
    
    '''
    
    # Build a pipeline, Note: classes are imbalanced
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),('tfidf',TfidfTransformer()), 
                     ('clf',MultiOutputClassifier(RandomForestClassifier(class_weight='balanced',n_jobs=-1)))])  
    
    # Using grid search to find better parameters   
    parameters =  {'clf__estimator__max_depth': [3,4,5],          
               'clf__estimator__min_samples_split': [3,5,7]}     
    
    # Create grid search object
    model = GridSearchCV(pipeline, param_grid=parameters ,scoring='f1_micro')
        
    return model

def evaluate_model(model, X_test, y_test, category_names):  
    
    '''
    Function that takes the model, X_test, y_test, and category names to evaluate the model and print classification report 
    
    Input: model: best model from build_model(), X_test: testing set, y_test: test set categories, category_names: labels for categories
    Output: classification report: classification report for y_test vs predicted values 
    
    '''
    
    # Predict on test data
    y_pred = model.predict(X_test)
   
    class_report = classification_report(y_test, y_pred, target_names=category_names)
    print(class_report)

def save_model(model, model_filepath):     # Saving pickled file
    pickle.dump(model, open(model_filepath, 'wb'))
