# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [1]:
# download necessary NLTK data 
import nltk
nltk.download(['punkt', 'wordnet','stopwords'])
import sqlite3

# import libraries
import re 
import numpy as np
import pandas as pd
import pickle
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
def load_data(database_filepath):
    conn = sqlite3.connect(database_filepath)
    df = pd.read_sql("SELECT * from tidy_data", conn)
    conn.close()
    
    # define features and label arrays
    X = df.iloc[:,1]      #.values
    y = df.iloc[:,3:]     #.values
    category_names = list(df.iloc[:,3:].columns)

    return X, y, category_names

In [None]:
# Function from ETL script

def clean_data():
    # read in file
    messages = pd.read_csv('messages.csv')             
    categories = pd.read_csv('categories.csv')        
    
    # merge datasets
    df = messages.merge(categories, how='outer',on='id')
    
    # clean data
    # create a dataframe of the 36 individual category columns
    categories = df['categories'].str.split(pat=';',expand=True)
    
    # select the first row of the categories dataframe
    row = categories.iloc[0,]

    # extract a list of new column names for categories.
    category_colnames = list(map(lambda x: x[:-2], row))

    # rename the columns of `categories`
    categories.columns = category_colnames

    # set each value to be the last character of the string
    for column in categories:
        categories[column] =  categories[column].apply(lambda x: x[-1])
    
    # convert column from string to numeric
    categories[column] = pd.to_numeric(categories[column]) 

    # drop the original, categories columns
    df.drop(labels=['categories','original'], axis=1, inplace=True)
    
    # concatenate the original dataframe with the new `categories` dataframe
    df = pd.concat([df,categories], axis=1)

    # Converting category columns to numeric
    df.iloc[:,3:] = df.iloc[:,3:].apply(pd.to_numeric)

    # drop duplicates
    df.drop_duplicates(inplace=True)
    
    # removing rows labelled as 2
    df.drop(df[df['related']==2].index, inplace=True)

    # load to database
    engine = create_engine('sqlite:///disaster_response.db')
    df.to_sql('tidy_dataset', engine, if_exists='replace', index=False)

    # define features and label arrays
    X = df.iloc[:,1].values
    y = df.iloc[:,3:].values
    category_names = list(df.iloc[:,3:].columns)

    return X, y, category_names  

### 2. Write a tokenization function to process your text data

In [4]:
def tokenize(text):

    # Remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]"," ",text)
    
    # tokenize text
    tokens = word_tokenize(text)
    
    # initiate stop words
    stop_words = stopwords.words("english")
    
    # remove stop words
    tokens = [t for t in tokens if t not in stop_words]
    
    # initiate lemmatizer
    lemmatizer = WordNetLemmatizer()

    # iterate through each token
    clean_tokens = []
    for tok in tokens:
        
        # lemmatize, normalize case, and remove leading/trailing white space
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

In [5]:
# testing the functions

test_X, test_y, test_labels = clean_data()

for message in test_X[:5]:
    tokens = tokenize(message)
    print(message)
    print(tokens, '\n')

Weather update - a cold front from Cuba that could pass over Haiti
['weather', 'update', 'cold', 'front', 'cuba', 'could', 'pas', 'haiti'] 

Is the Hurricane over or is it not over
['is', 'hurricane'] 

Looking for someone but no name
['looking', 'someone', 'name'] 

UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.
['un', 'report', 'leogane', '80', '90', 'destroyed', 'only', 'hospital', 'st', 'croix', 'functioning', 'needs', 'supply', 'desperately'] 

says: west side of Haiti, rest of the country today and tonight
['say', 'west', 'side', 'haiti', 'rest', 'country', 'today', 'tonight'] 



### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [8]:
# Build a pipeline
pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),('tfidf',TfidfTransformer()), 
                     ('clf',MultiOutputClassifier(RandomForestClassifier(class_weight='balanced',n_jobs=-1)))])  # classes are imbalanced

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [9]:
# Getting X, y from load_data()
X,y,y_labels = load_data()

# Perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

# Train classifier
pipeline.fit(X_train,y_train)

# Predict on test data
y_pred = pipeline.predict(X_test)

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [10]:
for i in range(0,36):
    print('label:',y_labels[i])
    print(classification_report(y_test[:,i],y_pred[:,i]))

label: related
             precision    recall  f1-score   support

          0       0.65      0.51      0.57      1265
          1       0.85      0.91      0.88      3941

avg / total       0.80      0.81      0.81      5206

label: request
             precision    recall  f1-score   support

          0       0.90      0.98      0.94      4323
          1       0.81      0.47      0.60       883

avg / total       0.88      0.89      0.88      5206

label: offer
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      5181
          1       0.00      0.00      0.00        25

avg / total       0.99      1.00      0.99      5206

label: aid_related
             precision    recall  f1-score   support

          0       0.77      0.84      0.80      3042
          1       0.74      0.64      0.69      2164

avg / total       0.76      0.76      0.76      5206

label: medical_help
             precision    recall  f1-score   support

    

  'precision', 'predicted', average, warn_for)


### 6. Improve your model
Use grid search to find better parameters. 

In [12]:
# Checking pipeline hyperparameters
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'vect', 'tfidf', 'clf', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'clf__estimator__bootstrap', 'clf__estimator__class_weight', 'clf__estimator__criterion', 'clf__estimator__max_depth', 'clf__estimator__max_features', 'clf__estimator__max_leaf_nodes', 'clf__estimator__min_impurity_decrease', 'clf__estimator__min_impurity_split', 'clf__estimator__min_samples_leaf', 'clf__estimator__min_samples_split', 'clf__estimator__min_weight_fraction_leaf', 'clf__estimator__n_estimators', 'clf__estimator__n_jobs', 'clf__estimator__oob_score', 'clf__estimator__random_state', 'clf__estimator__verbose', 'clf__estimator__

In [14]:
# Using grid search to find better parameters
parameters =  {'vect__max_df': (0.2,0.3,0.5),
               #'vect__ngram_range': ((1,1),(1,2),(2,2)),
               #'tfidf__use_idf': (True, False),
               'clf__estimator__max_depth': [3,4,5],      
               'clf__estimator__min_samples_split': [3,5,7]}

cv = GridSearchCV(pipeline, param_grid=parameters,scoring = 'f1')

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [16]:
# Train grid search rf classifier
cv.fit(X_train,y_train)

# Predict on test data
y_pred_cv = cv.predict(X_test)

for i in range(0,36):
    print('label:',y_labels[i])
    print(classification_report(y_test[:,i],y_pred_cv[:,i]))

label: related
             precision    recall  f1-score   support

          0       0.25      1.00      0.40      1257
          1       0.97      0.04      0.07      3949

avg / total       0.79      0.27      0.15      5206

label: request
             precision    recall  f1-score   support

          0       0.84      0.99      0.91      4342
          1       0.53      0.08      0.14       864

avg / total       0.79      0.84      0.78      5206

label: offer
             precision    recall  f1-score   support

          0       0.99      1.00      1.00      5175
          1       0.05      0.03      0.04        31

avg / total       0.99      0.99      0.99      5206

label: aid_related
             precision    recall  f1-score   support

          0       0.62      0.98      0.76      3092
          1       0.81      0.11      0.19      2114

avg / total       0.70      0.63      0.53      5206

label: medical_help
             precision    recall  f1-score   support

    

### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [11]:
# Build a pipeline using KNN classifier
pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),('tfidf',TfidfTransformer()), 
                     ('knn',MultiOutputClassifier(KNeighborsClassifier(n_jobs=-1)))])  

In [12]:
# Checking pipeline hyperparameters
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'vect', 'tfidf', 'knn', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'knn__estimator__algorithm', 'knn__estimator__leaf_size', 'knn__estimator__metric', 'knn__estimator__metric_params', 'knn__estimator__n_jobs', 'knn__estimator__n_neighbors', 'knn__estimator__p', 'knn__estimator__weights', 'knn__estimator', 'knn__n_jobs'])

In [None]:
# Using grid search to find better parameters
parameters =  {'vect__max_df': (0.2, 0.5, 1.0),
               #'vect__ngram_range': ((1,1),(1,2),(2,2)),
               #'tfidf__use_idf': (True, False),
               'knn__estimator__n_neighbors': [5,7,9],      
               'knn__estimator__p': [1,2]}

cv = GridSearchCV(pipeline, param_grid=parameters,scoring = 'f1')


# Train grid search classifier
cv.fit(X_train,y_train)

# Predict on test data
y_pred_cv = cv.predict(X_test)

# Testing the model
for i in range(0,36):
    print('label:',y_labels[i])
    print(classification_report(y_test[:,i],y_pred_cv[:,i]))


### 9. Export your model as a pickle file

In [None]:
# save model
pickled_filename = 'trained_model.pkl'
pickle.dump(cv, open(pickled_filename, 'wb'))

### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.