# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [70]:
# import libraries
from sqlalchemy import create_engine
import sqlalchemy as db
import pandas as pd
import numpy as np 
import nltk 
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer 
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)
import re
from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import GridSearchCV 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split 
from sklearn.pipeline import Pipeline, FeatureUnion 
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 
from sklearn.multioutput import MultiOutputClassifier
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger','stopwords']) 
import pickle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from nltk.stem.porter import PorterStemmer 
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
import sqlite3

[nltk_data] Downloading package punkt to /Users/apple/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/apple/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/apple/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [116]:
conn = sqlite3.connect('data/DisasterResponse.db')
res = conn.execute("SELECT name FROM sqlite_master WHERE type='table';")
for name in res:
    print (name[0])

messages


In [138]:
# load data from database
engine1 = create_engine('sqlite:///'+'data/DisasterResponse.db')
df1=pd.read_sql('select * from messages;', conn)
X1 = df.message.values
Y1 = df.iloc[:,4:-1].values
category_name1=list(df.columns[4:-1])

In [139]:
X1.shape

(26028,)

In [140]:
Y1.shape

(26028, 34)

In [141]:
df1=df1.drop(['id','message','original'],axis=1)
df1.head()

Unnamed: 0,genre,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,direct,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,direct,1,0,0,1,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,direct,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,direct,1,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,direct,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [142]:
df2=df1.groupby('genre').sum().reset_index()
df2.head()

Unnamed: 0,genre,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,direct,7182,3696,46,4338,592,471,216,131,46,...,78,186,1521,304,315,41,796,63,207,3613
1,news,10671,604,65,5860,1415,793,441,292,801,...,218,866,4280,1747,1445,225,910,415,1052,852
2,social,2053,174,7,662,77,49,67,48,13,...,13,99,1496,104,683,16,749,52,117,610


In [143]:
df2=df2.melt(id_vars=['genre'], var_name='types', value_name='count')


Unnamed: 0,genre,types,count
0,direct,related,7182
1,news,related,10671
2,social,related,2053
3,direct,request,3696
4,news,request,604


In [145]:
df2

Unnamed: 0,genre,types,count
0,direct,related,7182
1,news,related,10671
2,social,related,2053
3,direct,request,3696
4,news,request,604
5,social,request,174
6,direct,offer,46
7,news,offer,65
8,social,offer,7
9,direct,aid_related,4338


In [102]:
test=df1.sum()
test=pd.DataFrame(test,index=None)
print (type(test))

<class 'pandas.core.frame.DataFrame'>


In [113]:
test=test.reset_index()

In [114]:
test.columns

Index(['index', 0], dtype='object')

In [115]:
test.iloc[:,0]

0     related               
1     request               
2     offer                 
3     aid_related           
4     medical_help          
5     medical_products      
6     search_and_rescue     
7     security              
8     military              
9     water                 
10    food                  
11    shelter               
12    clothing              
13    money                 
14    missing_people        
15    refugees              
16    death                 
17    other_aid             
18    infrastructure_related
19    transport             
20    buildings             
21    electricity           
22    tools                 
23    hospitals             
24    shops                 
25    aid_centers           
26    other_infrastructure  
27    weather_related       
28    floods                
29    storm                 
30    fire                  
31    earthquake            
32    cold                  
33    other_weather         
34    direct_r

# 

In [2]:
# load data from database
engine = create_engine('sqlite:///'+'DisasterResponse.db')
df=pd.read_sql("messages", engine)
X = df.message.values
Y = df.iloc[:,4:-1].values
category_name=list(df.columns[4:-1])

In [3]:
df.shape

(26028, 39)

In [4]:
Y

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

### 2. Write a tokenization function to process your text data

In [5]:
def tokenize(text):
    text=re.sub(r"[^a-zA-Z0-9]"," ",text)
    words=word_tokenize(text)
    words=[w.lower().strip() for w in words if w not in stopwords.words('english')]
    stemmer = PorterStemmer()
    stemmed = [PorterStemmer().stem(w) for w in words] 
    return stemmed

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [6]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer = tokenize)), 
    ('tfidf', TfidfTransformer()), 
    ('clf', MultiOutputClassifier(RandomForestClassifier()))
])

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [7]:
 # Understand the values in 0 and 1
df.iloc[:,4:-1].apply(pd.value_counts)

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,water,...,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather
0,6122,21554,25910,15168,23944,24715,25304,25557,25168,24356,...,25908,25719,24877,18731,23873,23585,25746,23573,25498,24652
1,19906,4474,118,10860,2084,1313,724,471,860,1672,...,120,309,1151,7297,2155,2443,282,2455,530,1376


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
pipeline.fit(X_train,y_train) 

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                                                                        ccp_alpha=0.0,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                   

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [11]:
y_train_predicted = pipeline.predict(X_train) 

In [79]:
metrics=[]
for i in range(len(category_name)):
    precision=precision_score(y_train[:,i],y_train_predicted[:,i])
    recall=recall_score(y_train[:,i],y_train_predicted[:,i])
    accuracy=accuracy_score(y_train[:,i],y_train_predicted[:,i])
    f1=f1_score(list(y_train[:,i]),list(y_train_predicted[:,i]))
    
    metrics.append([precision,recall,accuracy,f1])
    
# Metrics output
metrics_output=pd.DataFrame(data=metrics,columns=['Precision','Recall','Accuracy','f1'],index = category_name)
print (metrics_output)

                        Precision    Recall  Accuracy        f1
related                 0.998990   0.997541  0.997359  0.998265
request                 0.998815   0.995277  0.998799  0.997043
offer                   1.000000   0.988636  0.999952  0.994286
aid_related             0.998455   0.997743  0.998463  0.998099
medical_help            1.000000   0.992000  0.999424  0.995984
medical_products        0.998948   0.994764  0.999712  0.996852
search_and_rescue       1.000000   0.990809  0.999760  0.995383
security                1.000000   0.976048  0.999616  0.987879
military                1.000000   0.992754  0.999856  0.996364
water                   1.000000   0.999249  0.999952  0.999624
food                    1.000000   0.998779  0.999856  0.999389
shelter                 1.000000   0.998353  0.999856  0.999176
clothing                1.000000   0.997041  0.999952  0.998519
money                   1.000000   0.993840  0.999856  0.996910
missing_people          1.000000   0.995

In [80]:
metrics_output.median()

Precision    1.000000
Recall       0.995446
Accuracy     0.999832
f1           0.997622
dtype: float64

In [81]:
metrics_output.mean()

Precision    0.999375
Recall       0.994441
Accuracy     0.999604
f1           0.996895
dtype: float64

In [82]:
# Test the model on the test dataset

In [83]:
y_test_predicted = pipeline.predict(X_test) 

In [84]:
metrics_test=[]
for i in range(len(category_name)):
    precision=precision_score(y_test[:,i],y_test_predicted[:,i])
    recall=recall_score(y_test[:,i],y_test_predicted[:,i])
    accuracy=accuracy_score(y_test[:,i],y_test_predicted[:,i])
    f1=f1_score(list(y_test[:,i]),list(y_test_predicted[:,i]))
    
    metrics_test.append([precision,recall,accuracy,f1])
    
# Metrics output
metrics_output_test=pd.DataFrame(data=metrics_test,columns=['Precision','Recall','Accuracy','f1'],index = category_name)
print (metrics_output_test)

  _warn_prf(average, modifier, msg_start, len(result))


                        Precision    Recall  Accuracy        f1
related                 0.805261   0.976026  0.797925  0.882458
request                 0.370370   0.041841  0.952747  0.075188
offer                   0.000000   0.000000  0.994237  0.000000
aid_related             0.710069   0.670217  0.717057  0.689568
medical_help            0.700000   0.083904  0.893200  0.149847
medical_products        0.800000   0.033520  0.932962  0.064343
search_and_rescue       0.722222   0.072222  0.966961  0.131313
security                0.000000   0.000000  0.973300  0.000000
military                0.700000   0.031390  0.915866  0.060086
water                   0.731343   0.144118  0.940645  0.240786
food                    0.684211   0.335484  0.926815  0.450216
shelter                 0.785714   0.290650  0.925471  0.424332
clothing                1.000000   0.014925  0.987322  0.029412
money                   0.000000   0.000000  0.977526  0.000000
missing_people          0.000000   0.000

In [85]:
metrics_output_test.median()

Precision    0.700000
Recall       0.037680
Accuracy     0.948425
f1           0.069766
dtype: float64

In [86]:
metrics_output_test.mean()

Precision    0.500353
Recall       0.151043
Accuracy     0.942736
f1           0.190174
dtype: float64

### 6. Improve your model
Use grid search to find better parameters. 

In [12]:
pipeline.get_params()

{'clf': MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True,
                                                        ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features='auto',
                                                        max_leaf_nodes=None,
                                                        max_samples=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0

In [13]:
parameters = {
        'tfidf__use_idf': (True, False),
        'tfidf__norm': ('l1', 'l2'),
        'clf__estimator__n_estimators': [10,50,100],
        'clf__estimator__min_samples_split': [2, 3, 4]
    }

In [14]:
def performance_metric(y_test,y_pred):

    score_list=[]
    for i in range(np.shape(y_test)[1]):
        f1=f1_score(y_true=list(y_test[:,i]),y_pred=list(y_pred[:,i]))
        score_list.append(f1)
    score=np.median(score_list)
    
    return score

In [15]:
f1_scorer = make_scorer(performance_metric)

In [16]:
cv = GridSearchCV(pipeline, param_grid=parameters,scoring=f1_scorer,verbose=10)
#cv = GridSearchCV(pipeline, param_grid = parameters, scoring = scorer, verbose = 10)

In [17]:
result=cv.fit(X_train, y_train)
#model.fit(X_train, Y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l1, tfidf__use_idf=True 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l1, tfidf__use_idf=True, score=0.083, total= 2.3min
[CV] clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l1, tfidf__use_idf=True 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.3min remaining:    0.0s


[CV]  clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l1, tfidf__use_idf=True, score=0.103, total= 2.3min
[CV] clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l1, tfidf__use_idf=True 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  4.7min remaining:    0.0s


[CV]  clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l1, tfidf__use_idf=True, score=0.103, total= 2.3min
[CV] clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l1, tfidf__use_idf=True 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  7.0min remaining:    0.0s


[CV]  clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l1, tfidf__use_idf=True, score=0.134, total= 2.3min
[CV] clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l1, tfidf__use_idf=True 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  9.3min remaining:    0.0s


[CV]  clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l1, tfidf__use_idf=True, score=0.090, total= 2.4min
[CV] clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l1, tfidf__use_idf=False 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 11.6min remaining:    0.0s


[CV]  clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l1, tfidf__use_idf=False, score=0.080, total= 2.4min
[CV] clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l1, tfidf__use_idf=False 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 14.0min remaining:    0.0s


[CV]  clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l1, tfidf__use_idf=False, score=0.125, total= 2.3min
[CV] clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l1, tfidf__use_idf=False 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 16.3min remaining:    0.0s


[CV]  clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l1, tfidf__use_idf=False, score=0.134, total= 2.3min
[CV] clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l1, tfidf__use_idf=False 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 18.6min remaining:    0.0s


[CV]  clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l1, tfidf__use_idf=False, score=0.092, total= 2.3min
[CV] clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l1, tfidf__use_idf=False 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 20.9min remaining:    0.0s


[CV]  clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l1, tfidf__use_idf=False, score=0.111, total= 2.3min
[CV] clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l2, tfidf__use_idf=True 
[CV]  clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l2, tfidf__use_idf=True, score=0.114, total= 2.3min
[CV] clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l2, tfidf__use_idf=True 
[CV]  clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l2, tfidf__use_idf=True, score=0.132, total= 2.3min
[CV] clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l2, tfidf__use_idf=True 
[CV]  clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l2, tfidf__use_idf=True, score=0.139, total= 2.3min
[CV] clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, tfidf__norm=l2, tfidf__use_i

[CV]  clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100, tfidf__norm=l1, tfidf__use_idf=True, score=0.081, total= 6.5min
[CV] clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100, tfidf__norm=l1, tfidf__use_idf=True 
[CV]  clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100, tfidf__norm=l1, tfidf__use_idf=True, score=0.091, total= 6.5min
[CV] clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100, tfidf__norm=l1, tfidf__use_idf=True 
[CV]  clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100, tfidf__norm=l1, tfidf__use_idf=True, score=0.067, total= 6.5min
[CV] clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100, tfidf__norm=l1, tfidf__use_idf=False 
[CV]  clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100, tfidf__norm=l1, tfidf__use_idf=False, score=0.071, total= 6.5min
[CV] clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100, tfidf__norm=l1, tfi

[CV]  clf__estimator__min_samples_split=3, clf__estimator__n_estimators=10, tfidf__norm=l2, tfidf__use_idf=False, score=0.121, total= 2.3min
[CV] clf__estimator__min_samples_split=3, clf__estimator__n_estimators=10, tfidf__norm=l2, tfidf__use_idf=False 
[CV]  clf__estimator__min_samples_split=3, clf__estimator__n_estimators=10, tfidf__norm=l2, tfidf__use_idf=False, score=0.162, total= 2.3min
[CV] clf__estimator__min_samples_split=3, clf__estimator__n_estimators=10, tfidf__norm=l2, tfidf__use_idf=False 
[CV]  clf__estimator__min_samples_split=3, clf__estimator__n_estimators=10, tfidf__norm=l2, tfidf__use_idf=False, score=0.177, total= 2.3min
[CV] clf__estimator__min_samples_split=3, clf__estimator__n_estimators=10, tfidf__norm=l2, tfidf__use_idf=False 
[CV]  clf__estimator__min_samples_split=3, clf__estimator__n_estimators=10, tfidf__norm=l2, tfidf__use_idf=False, score=0.112, total= 2.3min
[CV] clf__estimator__min_samples_split=3, clf__estimator__n_estimators=10, tfidf__norm=l2, tfidf_

[CV]  clf__estimator__min_samples_split=3, clf__estimator__n_estimators=100, tfidf__norm=l1, tfidf__use_idf=False, score=0.087, total= 5.9min
[CV] clf__estimator__min_samples_split=3, clf__estimator__n_estimators=100, tfidf__norm=l1, tfidf__use_idf=False 
[CV]  clf__estimator__min_samples_split=3, clf__estimator__n_estimators=100, tfidf__norm=l1, tfidf__use_idf=False, score=0.091, total= 6.0min
[CV] clf__estimator__min_samples_split=3, clf__estimator__n_estimators=100, tfidf__norm=l2, tfidf__use_idf=True 
[CV]  clf__estimator__min_samples_split=3, clf__estimator__n_estimators=100, tfidf__norm=l2, tfidf__use_idf=True, score=0.092, total= 5.9min
[CV] clf__estimator__min_samples_split=3, clf__estimator__n_estimators=100, tfidf__norm=l2, tfidf__use_idf=True 
[CV]  clf__estimator__min_samples_split=3, clf__estimator__n_estimators=100, tfidf__norm=l2, tfidf__use_idf=True, score=0.112, total= 6.0min
[CV] clf__estimator__min_samples_split=3, clf__estimator__n_estimators=100, tfidf__norm=l2, tf

[CV]  clf__estimator__min_samples_split=4, clf__estimator__n_estimators=50, tfidf__norm=l1, tfidf__use_idf=True, score=0.101, total= 4.0min
[CV] clf__estimator__min_samples_split=4, clf__estimator__n_estimators=50, tfidf__norm=l1, tfidf__use_idf=True 
[CV]  clf__estimator__min_samples_split=4, clf__estimator__n_estimators=50, tfidf__norm=l1, tfidf__use_idf=True, score=0.110, total= 3.9min
[CV] clf__estimator__min_samples_split=4, clf__estimator__n_estimators=50, tfidf__norm=l1, tfidf__use_idf=True 
[CV]  clf__estimator__min_samples_split=4, clf__estimator__n_estimators=50, tfidf__norm=l1, tfidf__use_idf=True, score=0.105, total= 3.8min
[CV] clf__estimator__min_samples_split=4, clf__estimator__n_estimators=50, tfidf__norm=l1, tfidf__use_idf=True 
[CV]  clf__estimator__min_samples_split=4, clf__estimator__n_estimators=50, tfidf__norm=l1, tfidf__use_idf=True, score=0.086, total= 3.9min
[CV] clf__estimator__min_samples_split=4, clf__estimator__n_estimators=50, tfidf__norm=l1, tfidf__use_id

[CV]  clf__estimator__min_samples_split=4, clf__estimator__n_estimators=100, tfidf__norm=l2, tfidf__use_idf=True, score=0.064, total= 5.9min
[CV] clf__estimator__min_samples_split=4, clf__estimator__n_estimators=100, tfidf__norm=l2, tfidf__use_idf=False 
[CV]  clf__estimator__min_samples_split=4, clf__estimator__n_estimators=100, tfidf__norm=l2, tfidf__use_idf=False, score=0.092, total= 5.8min
[CV] clf__estimator__min_samples_split=4, clf__estimator__n_estimators=100, tfidf__norm=l2, tfidf__use_idf=False 
[CV]  clf__estimator__min_samples_split=4, clf__estimator__n_estimators=100, tfidf__norm=l2, tfidf__use_idf=False, score=0.106, total= 5.8min
[CV] clf__estimator__min_samples_split=4, clf__estimator__n_estimators=100, tfidf__norm=l2, tfidf__use_idf=False 
[CV]  clf__estimator__min_samples_split=4, clf__estimator__n_estimators=100, tfidf__norm=l2, tfidf__use_idf=False, score=0.098, total= 6.3min
[CV] clf__estimator__min_samples_split=4, clf__estimator__n_estimators=100, tfidf__norm=l2,

[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed: 752.7min finished


In [18]:
result.cv_results_

{'mean_fit_time': array([114.48539462, 115.25527987, 113.79352508, 114.47793746,
        216.01555858, 221.40233474, 217.51725178, 219.89101772,
        346.00540962, 350.91538811, 346.22162166, 351.35822139,
        111.79588161, 111.72791634, 111.43988595, 112.15251975,
        202.27888489, 202.08092828, 202.40353832, 202.76309614,
        316.08276873, 314.53372769, 315.08581562, 316.4178896 ,
        111.98209128, 111.25440583, 162.34689102, 123.29470859,
        199.58491564, 208.24108105, 206.81913176, 201.04706526,
        318.61726742, 304.16316924, 310.45060101, 313.79158854]),
 'mean_score_time': array([24.87143817, 24.38471117, 24.44873095, 24.13946695, 32.28621159,
        32.01074753, 32.65271969, 32.32181864, 42.40457387, 41.85908804,
        42.57824063, 41.87049084, 24.44411626, 24.38904352, 24.74674668,
        24.4685349 , 32.50605874, 32.23182397, 32.43457727, 32.21509986,
        42.21142168, 41.83849673, 42.54568176, 42.12073712, 24.85778961,
        24.76900024, 

In [19]:
# Best mean test score
np.max(result.cv_results_['mean_test_score'])

0.17008708900962954

In [20]:
# Parameters for best mean test score
result.best_params_

{'clf__estimator__min_samples_split': 3,
 'clf__estimator__n_estimators': 10,
 'tfidf__norm': 'l2',
 'tfidf__use_idf': True}

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [21]:
# Predicted output based on test dataset
y_pred = result.predict(X_test)
print(y_pred)

[[1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 1 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


In [22]:
# Calculate performance metrics for each category
metrics=[]
for i in range(len(category_name)):
    
    precision=precision_score(list(y_test[:,i]),list(y_pred[:,i]))
    recall=recall_score(y_test[:,i],y_pred[:,i])
    accuracy=accuracy_score(y_test[:,i],y_pred[:,i])
    f1=f1_score(y_test[:,i],y_pred[:,i])
    metrics.append([precision,recall,accuracy,f1])

# Metrics output
metrics_output_test_2=pd.DataFrame(data=metrics,columns=['Precision','Recall','Accuracy','f1'],index = category_name)
print (metrics_output_test_2)

  _warn_prf(average, modifier, msg_start, len(result))


                        Precision    Recall  Accuracy        f1
related                 0.837000   0.936919  0.812332  0.884146
request                 0.784200   0.466743  0.889166  0.585191
offer                   0.000000   0.000000  0.995390  0.000000
aid_related             0.723996   0.640839  0.745678  0.679884
medical_help            0.597403   0.103604  0.917595  0.176583
medical_products        0.729730   0.097473  0.950058  0.171975
search_and_rescue       0.619048   0.078313  0.969074  0.139037
security                0.000000   0.000000  0.980983  0.000000
military                0.312500   0.030303  0.967153  0.055249
water                   0.833333   0.439883  0.957549  0.575816
food                    0.802360   0.468966  0.927968  0.591948
shelter                 0.720238   0.256900  0.923742  0.378717
clothing                0.928571   0.191176  0.989243  0.317073
money                   1.000000   0.026087  0.978486  0.050847
missing_people          0.000000   0.000

In [38]:
metrics_output_test_2.describe()

Unnamed: 0,Precision,Recall,Accuracy,f1
count,34.0,34.0,34.0,34.0
mean,0.583318,0.201939,0.945408,0.257017
std,0.321072,0.255059,0.054805,0.274885
min,0.0,0.0,0.745678,0.0
25%,0.458333,0.016512,0.931041,0.032021
50%,0.654351,0.089926,0.959278,0.154531
75%,0.836084,0.394137,0.982424,0.526541
max,1.0,0.936919,0.99539,0.884146


### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In stead of using Random Forest Tree, SVM might be a good approach since it leverages the kernel trick to transform your data and then based on these transformations, it finds an optimal boundary between the possible outputs, and Linear Support Vector Machine is widely regarded as one of the best text classification algorithms. (https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568)

In [27]:
pipeline2 = Pipeline([
    ('vect', CountVectorizer(tokenizer = tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(LinearSVC()))
])

# Get the parameters name
pipeline2.get_params()

{'clf': MultiOutputClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                           fit_intercept=True,
                                           intercept_scaling=1,
                                           loss='squared_hinge', max_iter=1000,
                                           multi_class='ovr', penalty='l2',
                                           random_state=None, tol=0.0001,
                                           verbose=0),
                       n_jobs=None),
 'clf__estimator': LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
           intercept_scaling=1, loss='squared_hinge', max_iter=1000,
           multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
           verbose=0),
 'clf__estimator__C': 1.0,
 'clf__estimator__class_weight': None,
 'clf__estimator__dual': True,
 'clf__estimator__fit_intercept': True,
 'clf__estimator__intercept_scaling': 1,
 'clf__estimator__loss': 'squared

In [28]:
#
parameters_2 = {
                'clf__estimator__class_weight':['balanced'],
              'clf__estimator__max_iter': [1000,1500,2000],
              'clf__estimator__C':[1,1.5,2]}


In [29]:
# Define performance metric for use in grid search scoring object
def performance_metric(y_true, y_pred):
    """Calculate median F1 score for all of the output classifiers
    
    Args:
    y_true: array. Array containing actual labels.
    y_pred: array. Array containing predicted labels.
        
    Returns:
    score: float. Median F1 score for all of the output classifiers
    """
    f1_list = []
    for i in range(np.shape(y_pred)[1]):
        f1 = f1_score(np.array(y_true)[:, i], y_pred[:, i])
        f1_list.append(f1)
        
    score = np.median(f1_list)
    return score

In [30]:
f1_scorer = make_scorer(performance_metric)

In [31]:
cv_2 = GridSearchCV(pipeline2, param_grid=parameters_2,scoring=f1_scorer,verbose=10)
# Find best parameters
tuned_model2 = cv_2.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000, score=0.447, total= 2.9min
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.9min remaining:    0.0s


[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000, score=0.424, total= 2.8min
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  5.7min remaining:    0.0s


[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000, score=0.453, total= 2.6min
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  8.3min remaining:    0.0s


[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000, score=0.421, total= 2.2min
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 10.5min remaining:    0.0s


[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000, score=0.411, total= 2.1min
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 12.6min remaining:    0.0s


[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500, score=0.447, total= 2.1min
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 14.7min remaining:    0.0s


[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500, score=0.424, total= 2.2min
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 16.9min remaining:    0.0s


[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500, score=0.453, total= 2.2min
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 19.1min remaining:    0.0s


[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500, score=0.421, total= 2.2min
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 21.2min remaining:    0.0s


[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500, score=0.411, total= 2.1min
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=2000 
[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=2000, score=0.447, total= 2.1min
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=2000 
[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=2000, score=0.424, total= 2.1min
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=2000 
[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=2000, score=0.453, total= 2.1min
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=2000 
[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, clf__estimator__max_iter=2000, score=0.421, total= 2.1



[CV]  clf__estimator__C=1.5, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000, score=0.422, total= 2.1min
[CV] clf__estimator__C=1.5, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000 




[CV]  clf__estimator__C=1.5, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000, score=0.434, total= 2.1min
[CV] clf__estimator__C=1.5, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000 




[CV]  clf__estimator__C=1.5, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000, score=0.423, total= 2.1min
[CV] clf__estimator__C=1.5, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000 
[CV]  clf__estimator__C=1.5, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000, score=0.406, total= 2.1min
[CV] clf__estimator__C=1.5, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500 
[CV]  clf__estimator__C=1.5, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500, score=0.439, total= 2.1min
[CV] clf__estimator__C=1.5, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500 
[CV]  clf__estimator__C=1.5, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500, score=0.422, total= 2.1min
[CV] clf__estimator__C=1.5, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500 
[CV]  clf__estimator__C=1.5, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500, score



[CV]  clf__estimator__C=2, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000, score=0.418, total= 2.2min
[CV] clf__estimator__C=2, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000 




[CV]  clf__estimator__C=2, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000, score=0.424, total= 2.2min
[CV] clf__estimator__C=2, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000 




[CV]  clf__estimator__C=2, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000, score=0.412, total= 2.2min
[CV] clf__estimator__C=2, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000 




[CV]  clf__estimator__C=2, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1000, score=0.400, total= 2.1min
[CV] clf__estimator__C=2, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500 
[CV]  clf__estimator__C=2, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500, score=0.431, total= 2.2min
[CV] clf__estimator__C=2, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500 
[CV]  clf__estimator__C=2, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500, score=0.418, total= 2.1min
[CV] clf__estimator__C=2, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500 
[CV]  clf__estimator__C=2, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500, score=0.424, total= 2.1min
[CV] clf__estimator__C=2, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500 
[CV]  clf__estimator__C=2, clf__estimator__class_weight=balanced, clf__estimator__max_iter=1500, score=0.412, total= 2.2

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed: 97.8min finished


In [34]:
# Get results of grid search
tuned_model2.cv_results_

{'mean_fit_time': array([124.7038785 , 104.92378936, 103.28324022, 103.44506936,
        102.54043016, 104.06883798, 105.74800549, 105.25830789,
        104.69944363]),
 'mean_score_time': array([26.36965842, 24.15577803, 23.76356578, 23.30771747, 23.2612699 ,
        23.37091317, 23.47102389, 23.54595919, 23.33903775]),
 'mean_test_score': array([0.43121831, 0.43121831, 0.43121831, 0.42475608, 0.42475608,
        0.42475608, 0.41711915, 0.41711915, 0.41711915]),
 'param_clf__estimator__C': masked_array(data=[1, 1, 1, 1.5, 1.5, 1.5, 2, 2, 2],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_clf__estimator__class_weight': masked_array(data=['balanced', 'balanced', 'balanced', 'balanced',
                    'balanced', 'balanced', 'balanced', 'balanced',
                    'balanced'],
              mask=[False, False, False, False, False, False, False, False,
            

In [35]:
# Get the predicted result on test dataset
y_test_predicted_2=tuned_model2.predict(X_test)

In [36]:
# Calculate performance metrics for each category
metrics=[]
for i in range(len(category_name)):
    
    precision=precision_score(list(y_test[:,i]),list(y_test_predicted_2[:,i]))
    recall=recall_score(y_test[:,i],y_test_predicted_2[:,i])
    accuracy=accuracy_score(y_test[:,i],y_test_predicted_2[:,i])
    f1=f1_score(y_test[:,i],y_test_predicted_2[:,i])
    metrics.append([precision,recall,accuracy,f1])

# Metrics output
metrics_output_test_3=pd.DataFrame(data=metrics,columns=['Precision','Recall','Accuracy','f1'],index = category_name)
print (metrics_output_test_3)

                        Precision    Recall  Accuracy        f1
related                 0.895806   0.842674  0.804841  0.868428
request                 0.576887   0.709862  0.864195  0.636504
offer                   0.058824   0.083333  0.989627  0.068966
aid_related             0.716990   0.725160  0.763542  0.721051
medical_help            0.368254   0.522523  0.882828  0.432030
medical_products        0.351621   0.509025  0.923934  0.415929
search_and_rescue       0.244019   0.307229  0.947561  0.272000
security                0.064815   0.071429  0.963119  0.067961
military                0.453552   0.503030  0.965040  0.477011
water                   0.644769   0.777126  0.957357  0.704787
food                    0.696562   0.803448  0.939109  0.746197
shelter                 0.563725   0.732484  0.924510  0.637119
clothing                0.506667   0.558824  0.987130  0.531469
money                   0.337278   0.495652  0.967345  0.401408
missing_people          0.237288   0.200

In [39]:
metrics_output_test_3.describe()

Unnamed: 0,Precision,Recall,Accuracy,f1
count,34.0,34.0,34.0,34.0
mean,0.401181,0.478058,0.929555,0.431673
std,0.226362,0.240146,0.060386,0.231525
min,0.0,0.0,0.763542,0.0
25%,0.224763,0.319197,0.904725,0.263749
50%,0.362093,0.497826,0.948233,0.41109
75%,0.564965,0.70034,0.975653,0.630878
max,0.895806,0.842674,0.993853,0.868428


As we could see, the model is improved based on the median f1 value. Let's try Adaboost, since it is best used to boost the performance of decision trees on binary classification problems.

In [50]:
pipeline3 = Pipeline([
    ('vect', CountVectorizer(tokenizer = tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(AdaBoostClassifier()))
])

# Get the parameters name
#pipeline3.get_params()
parameters3 = {
        'vect__min_df':[1,10,50],
        'clf__estimator__learning_rate': [0.001, 0.01, 0.1]
    }

In [52]:
f1_scorer = make_scorer(performance_metric)
cv_3 = GridSearchCV(pipeline3, param_grid=parameters3,scoring=f1_scorer,verbose=10)
# Find best parameters
tuned_model3 = cv_3.fit(X_train, y_train)
tuned_model3.cv_results_
# Get the predicted result on test dataset
y_test_predicted_3=tuned_model3.predict(X_test)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] clf__estimator__learning_rate=0.001, vect__min_df=1 .............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__estimator__learning_rate=0.001, vect__min_df=1, score=0.120, total= 3.2min
[CV] clf__estimator__learning_rate=0.001, vect__min_df=1 .............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.2min remaining:    0.0s


[CV]  clf__estimator__learning_rate=0.001, vect__min_df=1, score=0.193, total= 3.9min
[CV] clf__estimator__learning_rate=0.001, vect__min_df=1 .............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  7.2min remaining:    0.0s


[CV]  clf__estimator__learning_rate=0.001, vect__min_df=1, score=0.118, total= 4.0min
[CV] clf__estimator__learning_rate=0.001, vect__min_df=1 .............


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 11.1min remaining:    0.0s


[CV]  clf__estimator__learning_rate=0.001, vect__min_df=1, score=0.224, total= 3.8min
[CV] clf__estimator__learning_rate=0.001, vect__min_df=1 .............


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 15.0min remaining:    0.0s


[CV]  clf__estimator__learning_rate=0.001, vect__min_df=1, score=0.219, total= 3.8min
[CV] clf__estimator__learning_rate=0.001, vect__min_df=10 ............


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 18.8min remaining:    0.0s


[CV]  clf__estimator__learning_rate=0.001, vect__min_df=10, score=0.174, total= 3.5min
[CV] clf__estimator__learning_rate=0.001, vect__min_df=10 ............


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 22.3min remaining:    0.0s


[CV]  clf__estimator__learning_rate=0.001, vect__min_df=10, score=0.205, total= 3.6min
[CV] clf__estimator__learning_rate=0.001, vect__min_df=10 ............


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 25.9min remaining:    0.0s


[CV]  clf__estimator__learning_rate=0.001, vect__min_df=10, score=0.118, total= 3.6min
[CV] clf__estimator__learning_rate=0.001, vect__min_df=10 ............


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 29.4min remaining:    0.0s


[CV]  clf__estimator__learning_rate=0.001, vect__min_df=10, score=0.223, total= 3.6min
[CV] clf__estimator__learning_rate=0.001, vect__min_df=10 ............


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 33.0min remaining:    0.0s


[CV]  clf__estimator__learning_rate=0.001, vect__min_df=10, score=0.215, total= 3.7min
[CV] clf__estimator__learning_rate=0.001, vect__min_df=50 ............
[CV]  clf__estimator__learning_rate=0.001, vect__min_df=50, score=0.174, total= 3.5min
[CV] clf__estimator__learning_rate=0.001, vect__min_df=50 ............
[CV]  clf__estimator__learning_rate=0.001, vect__min_df=50, score=0.199, total= 3.5min
[CV] clf__estimator__learning_rate=0.001, vect__min_df=50 ............
[CV]  clf__estimator__learning_rate=0.001, vect__min_df=50, score=0.118, total= 3.4min
[CV] clf__estimator__learning_rate=0.001, vect__min_df=50 ............
[CV]  clf__estimator__learning_rate=0.001, vect__min_df=50, score=0.222, total= 3.4min
[CV] clf__estimator__learning_rate=0.001, vect__min_df=50 ............
[CV]  clf__estimator__learning_rate=0.001, vect__min_df=50, score=0.217, total= 3.5min
[CV] clf__estimator__learning_rate=0.01, vect__min_df=1 ..............
[CV]  clf__estimator__learning_rate=0.01, vect__min_

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed: 144.9min finished


In [57]:
# Calculate performance metrics for each category
metrics=[]
for i in range(len(category_name)):
    
    precision=precision_score(list(y_test[:,i]),list(y_test_predicted_3[:,i]))
    recall=recall_score(y_test[:,i],y_test_predicted_3[:,i])
    accuracy=accuracy_score(y_test[:,i],y_test_predicted_3[:,i])
    f1=f1_score(y_test[:,i],y_test_predicted_3[:,i])
    metrics.append([precision,recall,accuracy,f1])

# Metrics output
metrics_output_test_4=pd.DataFrame(data=metrics,columns=['Precision','Recall','Accuracy','f1'],index = category_name)
print (metrics_output_test_4)

  _warn_prf(average, modifier, msg_start, len(result))


                        Precision    Recall  Accuracy        f1
related                 0.764310   1.000000  0.764310  0.866413
request                 0.790909   0.099771  0.844794  0.177189
offer                   0.000000   0.000000  0.995390  0.000000
aid_related             0.777154   0.189152  0.635421  0.304252
medical_help            0.701149   0.137387  0.921437  0.229755
medical_products        1.000000   0.003610  0.946984  0.007194
search_and_rescue       0.566038   0.180723  0.969458  0.273973
security                0.000000   0.000000  0.981176  0.000000
military                0.522727   0.139394  0.968690  0.220096
water                   0.564516   0.821114  0.946792  0.669056
food                    0.734082   0.675862  0.936612  0.703770
shelter                 0.794595   0.312102  0.930465  0.448171
clothing                0.722222   0.573529  0.991548  0.639344
money                   0.000000   0.000000  0.977910  0.000000
missing_people          0.000000   0.000

In [58]:
metrics_output_test_4.describe()

Unnamed: 0,Precision,Recall,Accuracy,f1
count,34.0,34.0,34.0,34.0
mean,0.418848,0.189488,0.936403,0.225905
std,0.373493,0.270047,0.077562,0.272236
min,0.0,0.0,0.635421,0.0
25%,0.0,0.0,0.93229,0.0
50%,0.540052,0.063547,0.956396,0.114773
75%,0.738368,0.230891,0.98204,0.358182
max,1.0,1.0,0.99539,0.866413


### 9. Export your model as a pickle file

Based on the performance from different algorithms tryout, LinearSVC outperformed which will used as the final classification method.

In [47]:
pickle.dump(tuned_model2, open('models/classifier.pkl', 'wb'))

### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.