# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [44]:
### Downloading necessary NLTK data
import nltk
nltk.download(['punkt', 'wordnet', 'stopwords'])
from nltk.corpus import stopwords

### Importing libraries
from workspace_utils import active_session
import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy import create_engine
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [45]:
### Loading data from the database
querystring = """SELECT * from disaster_response_df"""
engine = create_engine('sqlite:///InsertDatabaseName.db')

df = pd.read_sql(querystring, engine)

### Dropping columns and converting to Numpy arrays
X = df[['message']].values
y = df.drop(['id', 'message', 'original', 'genre'], axis=1).values
#y['related'] = y['related'].map(lambda x: 1 if x ==2 else x)

In [46]:
df.to_csv('disaster_data.csv', index=False)

### 2. Write a tokenization function to process your text data

In [47]:
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
stop_words = nltk.corpus.stopwords.words("english")

In [48]:
def tokenize(text):
    
    ### Getting list of all urls using regex
    detected_urls = re.findall(url_regex, text)
    
    ### Replacing each url in text string with placeholder
    for url in detected_urls:
        bbtext = text.replace(url, "urlplaceholder")
    
    ### Tokenizing text
    tokens = word_tokenize(text)
    
    ### Initiating lemmatizer
    lemmatizer = WordNetLemmatizer()
        
    ### Lemmatizing, normalizing case, and removing leading/trailing white space and stop words
    return[lemmatizer.lemmatize(tok).lower().strip() for tok in tokens if tok not in stop_words]

In [49]:
### Tokenizing a row
s = str(X[1]).replace("[", "").replace("]", "").replace("'", "")
s

'Is the Hurricane over or is it not over'

In [50]:
### Testing the tokenization function
tokenize(s)

['is', 'hurricane']

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [51]:
### Importing packages
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier

In [52]:
### Building the Pipeline
pipeline = Pipeline([('cvectorizer', CountVectorizer(tokenizer=tokenize)),
                     ('tfidf', TfidfTransformer()),
                     ('rf_clf', RandomForestClassifier())
                    ])

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [53]:
from sklearn.model_selection import train_test_split

In [54]:
### Performing train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Flattening arrays
X_train = X_train.ravel()
X_test = X_test.ravel()

In [55]:
### Fitting the model
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('cvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
      ...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [59]:
from sklearn.metrics import classification_report, precision_score, recall_score

In [60]:
y_pred = pipeline.predict(X_test)
accuracy = (y_pred == y_test).mean() 
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted')}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted')}")

Accuracy: 0.9244517670636297
Precision: 0.41621282539859356
Recall: 0.2467297984450422


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [61]:
label_columns = df.drop(['id', 'message', 'original', 'genre'], axis=1).columns.tolist()
print(classification_report(y_test, y_pred, target_names=label_columns))

                        precision    recall  f1-score   support

               related       0.77      0.84      0.81      6546
               request       0.40      0.07      0.12      1455
                 offer       0.00      0.00      0.00        34
           aid_related       0.43      0.15      0.22      3516
          medical_help       0.06      0.00      0.00       665
      medical_products       0.11      0.00      0.00       407
     search_and_rescue       0.33      0.00      0.01       234
              security       0.00      0.00      0.00       170
              military       0.50      0.00      0.01       283
           child_alone       0.00      0.00      0.00         0
                 water       0.00      0.00      0.00       522
                  food       0.16      0.01      0.01       909
               shelter       0.06      0.00      0.00       784
              clothing       0.00      0.00      0.00       136
                 money       0.14      

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


### 6. Improve your model
Use grid search to find better parameters. 

In [62]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [36]:
### Original Pipeline
pipeline = Pipeline([('cvectorizer', CountVectorizer(tokenizer=tokenize)),
                     ('tfidf', TfidfTransformer()),
                     ('rf_clf', RandomForestClassifier())
                    ])

In [37]:
### Specifying parameters for grid search
parameters = {'rf_clf__min_samples_leaf':[1, 2],
              'rf_clf__min_samples_split': [2, 4],
              'rf_clf__n_estimators': [10, 30, 50]}
              

### Creating grid search object
cv = GridSearchCV(pipeline, param_grid=parameters)

In [38]:
### Fitting the new model
cv.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
      ...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'rf_clf__min_samples_leaf': [1, 2], 'rf_clf__min_samples_split': [2, 4], 'rf_clf__n_estimators': [10, 30, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [42]:
### Checking which parameter values optimize this model 
cv.best_params_

{'rf_clf__min_samples_leaf': 1,
 'rf_clf__min_samples_split': 2,
 'rf_clf__n_estimators': 50}

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [43]:
y_pred = cv.predict(X_test)
accuracy = (y_pred == y_test).mean() 
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted')}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted')}")

Accuracy: 0.9263916188127079
Precision: 0.41643241958155297
Recall: 0.2619477504698036


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [22]:
print(classification_report(y_test, y_pred, target_names=label_columns))

                        precision    recall  f1-score   support

               related       0.77      0.94      0.84      6546
               request       0.41      0.05      0.08      1455
                 offer       0.00      0.00      0.00        34
           aid_related       0.45      0.11      0.18      3516
          medical_help       0.00      0.00      0.00       665
      medical_products       0.00      0.00      0.00       407
     search_and_rescue       0.50      0.00      0.01       234
              security       0.00      0.00      0.00       170
              military       0.50      0.00      0.01       283
           child_alone       0.00      0.00      0.00         0
                 water       0.00      0.00      0.00       522
                  food       0.21      0.00      0.01       909
               shelter       0.00      0.00      0.00       784
              clothing       0.00      0.00      0.00       136
                 money       0.20      

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [65]:
### Original Pipeline
pipeline = Pipeline([('cvectorizer', CountVectorizer(tokenizer=tokenize)),
                     ('tfidf', TfidfTransformer()),
                     ('rf_clf', RandomForestClassifier())
                    ])


### Specifying parameters for grid search
parameters2 = {'cvectorizer__ngram_range': ((1, 1), (1, 2)),
               'cvectorizer__max_df': (0.5, 0.75, 1.0),
               'cvectorizer__max_features': (None, 5000, 10000),
               'tfidf__use_idf': (True, False),
               'rf_clf__min_samples_leaf':[1, 2],
               'rf_clf__min_samples_split': [2, 4],
               'rf_clf__n_estimators': [10, 30, 50]}
              

### Creating a Randomized search object - Grid Search is taking way too long
#cv_final = GridSearchCV(pipeline, param_grid=parameters2)
cv_final = RandomizedSearchCV(pipeline, param_distributions=parameters2)

In [66]:
### Fitting the new model
with active_session():
    cv_final.fit(X_train, y_train)

In [67]:
### Checking which parameter values optimize this model 
cv_final.best_params_

{'tfidf__use_idf': False,
 'rf_clf__n_estimators': 50,
 'rf_clf__min_samples_split': 2,
 'rf_clf__min_samples_leaf': 1,
 'cvectorizer__ngram_range': (1, 2),
 'cvectorizer__max_features': None,
 'cvectorizer__max_df': 1.0}

In [68]:
### Testing the model
y_pred = cv_final.predict(X_test)
accuracy = (y_pred == y_test).mean() 
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted')}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted')}")

Accuracy: 0.9259162909006822
Precision: 0.4119280688760612
Recall: 0.23593352739599838


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [69]:
print(classification_report(y_test, y_pred, target_names=label_columns))

                        precision    recall  f1-score   support

               related       0.77      0.88      0.82      6546
               request       0.42      0.03      0.05      1455
                 offer       0.00      0.00      0.00        34
           aid_related       0.45      0.05      0.09      3516
          medical_help       0.00      0.00      0.00       665
      medical_products       0.00      0.00      0.00       407
     search_and_rescue       0.00      0.00      0.00       234
              security       0.00      0.00      0.00       170
              military       0.50      0.00      0.01       283
           child_alone       0.00      0.00      0.00         0
                 water       0.00      0.00      0.00       522
                  food       0.21      0.00      0.01       909
               shelter       0.08      0.00      0.00       784
              clothing       0.00      0.00      0.00       136
                 money       0.50      

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


### 9. Export your model as a pickle file

In [71]:
import joblib
joblib.dump(cv_final.best_estimator_, 'disaster_response_model.pkl')

['disaster_response_model.pkl']

### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.

In [24]:
### Downloading necessary NLTK data
import nltk
nltk.download(['punkt', 'wordnet', 'stopwords'])
from nltk.corpus import stopwords

### Importing libraries
import sys
import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy import create_engine
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

### Importing packages for the Machine Learning Model
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score
from sklearn.model_selection import GridSearchCV

### Importing packages to save the model
import joblib

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
stop_words = nltk.corpus.stopwords.words("english")

### Declaring the error message
error_message = '''
Please, provide the filepaths of the messages and categories datasets as the first and second arguments,
respectively, as well as the path of the database to save the cleansed data as the third argument.
Example: python process_data.py disaster_messages.csv disaster_categories.csv DisasterResponse.db
'''

In [28]:
def load_data():
    """
    Loads the data used in the model - messages.csv and categories.csv
    Both files are located in the home directory
    
    INPUT:  None
    
    OUTPUT: X  - Numpy array originated from the dataframe saved in the SQL server. It contains the predictor.
            y  - Numpy array originated from the dataframe saved in the SQL server. It contains the labels.
            df - The dataframe saved in the SQL server from process_data.py  
    """
    
    ### Declaring the query and the engine to connect to the database and pull the data from the previous step
    print('Loading data')
    querystring = """SELECT * from disaster_response_mod"""
    engine = create_engine('sqlite:///InsertDatabaseName.db')
    
    ### Connecting to the database and querying the data
    df = pd.read_sql(querystring, engine)

    ### Dropping columns and converting to Numpy arrays
    X = df[['message']].values
    y = df.drop(['id', 'message', 'original', 'genre'], axis=1).values
    print('Data is ready to be tokenized')
    
    return X, y, df

In [None]:
def tokenize(text):
    ''''
    Tokenizer that gets a string and converts it in tokens, to be used in the Machine Learning model
    
    INPUT:  text - The original text to be tokenized/lemmatized
    OUTPUT: Lemmatized and tokenized text
    '''
    
    ### Getting list of all urls using regex
    detected_urls = re.findall(url_regex, text)
    
    ### Replacing each url in text string with placeholder
    for url in detected_urls:
        bbtext = text.replace(url, "urlplaceholder")
    
    ### Tokenizing text
    tokens = word_tokenize(text)
    
    ### Initiating lemmatizer
    lemmatizer = WordNetLemmatizer()
        
    ### Lemmatizing, normalizing case, and removing leading/trailing white space and stop words
    print('Tokenizer function created')
    return[lemmatizer.lemmatize(tok).lower().strip() for tok in tokens if tok not in stop_words]

In [31]:
def build_model(X, y):
    
    '''
    Splits the data into training and testing sets, created the pipeline and finds the best combination of
    hyperparameters to optmize the model
    
    INPUT:  X  - Numpy array originated from the dataframe saved in the SQL server. It contains the predictor.
            y  - Numpy array originated from the dataframe saved in the SQL server. It contains the labels.
    
    OUTPUT: X_test - Testing data, after running train_test_split. To be used for evalutating the model.
            y_test - Teating labels, after running train_test_split. To be used for evaluating the model.
            cv     - The optimized model, after performing Grid Search
    
    '''
    
    ### Performing train test split
    print('Splitting the data')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    ### Flattening arrays
    X_train = X_train.ravel()
    X_test = X_test.ravel()
    
    ### Building the Pipeline
    print('Building the Pipeline')
    pipeline = Pipeline([('cvectorizer', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('rf_clf', RandomForestClassifier())
                        ])

    ### Specifying parameters for grid search
    parameters = {'cvectorizer__ngram_range': ((1, 1), (1, 2)),
                  'cvectorizer__max_df': (0.5, 0.75, 1.0),
                  'cvectorizer__max_features': (None, 5000, 10000),
                  'tfidf__use_idf': (True, False),
                  'rf_clf__min_samples_leaf':[1, 2],
                  'rf_clf__min_samples_split': [2, 4],
                  'rf_clf__n_estimators': [10, 30, 50]}
    
    ### Creating grid search object
    cv = GridSearchCV(pipeline, param_grid=parameters)
    
    ### Fitting the model
    print('Fitting the model')
    cv.fit(X_train, y_train)
    
    ### Printing best hyperparamaters and returning the model
    print('Printing best hyperparamaters and returning the model')
    print(cv_final.best_params_)
    
    return X_test, y_test, cv

In [32]:
def evaluate_model(model, X_test, y_test, df):
    
    """
    Loads the data used in the model - messages.csv and categories.csv
    Both files are located in the home directory
    
    INPUT: model  - The optimized model, with its hyperparameters optimized after Grid Search
           X_test - Testing data, after running train_test_split
           y_test - Teating labels, after running train_test_split
           df     - The dataframe saved in the SQL server from process_data.py
    
    OUTPUT: None
    """
    
    y_pred = model.predict(X_test)
    accuracy = (y_pred == y_test).mean() 
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision_score(y_test, y_pred, average='weighted')}")
    print(f"Recall: {recall_score(y_test, y_pred, average='weighted')}")

    label_columns = df.drop(['id', 'message', 'original', 'genre'], axis=1).columns.tolist()
    print(classification_report(y_test, y_pred, target_names=label_columns))

In [37]:
def save_model(model, model_path):
    
    """
    Saves the model to a pickle file
    
    INPUT:  model      - The optimized model, with its hyperparameters optimized after Grid Search
            model_path - The path where the pickle file will be saved
    
    OUTPUT: None
    """
    
    joblib.dump(model, model_path)

In [None]:
def main():
    if len(sys.argv) == 3:
        df1_path, df2_path, db_path = sys.argv[1:]
        
    print('Calling functions')
    X, y, df =  load_data()
    X_test, y_test, cv_final = build_model(X, y)
    evaluate_model(X_test, y_test, cv_final, df)
    save_model(cv_final.best_estimator_, 'disaster_response_model_final.pkl')
    print('End of code')
        
    else:
        print(error_message)
        
if __name__ == "__main__":
    main()