### 1. Import libraries and load data from database
- Import python libraries
- Load dataset from database with read_sql_table
- Define feature and target variables X & Y

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

from sqlalchemy import create_engine

In [4]:
#load engine from database 
engine = create_engine('sqlite:///data/DisasterResponse.db')

In [6]:
df = pd.read_sql_table('DisasterResponse1.sql',engine)

In [7]:
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
X = df['message']

In [9]:
y = df.drop(['id','message','original','genre'],axis=1)

## 2. Write a tokenization function to process your data

In [11]:
import re
import nltk 

In [12]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/nirzaree/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nirzaree/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/nirzaree/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Try out the text processing steps and then write a function for it

In [14]:
text = X[0]
text

'Weather update - a cold front from Cuba that could pass over Haiti'

In [15]:
lemmatizer = WordNetLemmatizer()

In [16]:
text = text.lower()

In [17]:
text

'weather update - a cold front from cuba that could pass over haiti'

In [18]:
text = re.sub(r'[^A-Za-z0-9]',' ',text)
text

'weather update   a cold front from cuba that could pass over haiti'

In [19]:
text = word_tokenize(text)
text

['weather',
 'update',
 'a',
 'cold',
 'front',
 'from',
 'cuba',
 'that',
 'could',
 'pass',
 'over',
 'haiti']

In [20]:
stop_words = stopwords.words('english')
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [21]:
text = [x for x in text if x not in stop_words]
text

['weather', 'update', 'cold', 'front', 'cuba', 'could', 'pass', 'haiti']

In [22]:
lem_tok = []

In [23]:
for token in text: 
    lem_tok.append(lemmatizer.lemmatize(token))

In [24]:
lem_tok

['weather', 'update', 'cold', 'front', 'cuba', 'could', 'pas', 'haiti']

In [25]:
def tokenize(text):
    text = text.lower()
    text = re.sub(r'[^A-Za-z0-9]',' ',text)
    text = word_tokenize(text)
    stop_words = stopwords.words('english')
    text = [x for x in text if x not in stop_words]
    lemmatizer = WordNetLemmatizer()
    lem_tokens = []
    for token in text:
        lem_tokens.append(lemmatizer.lemmatize(token))
    return lem_tokens    

### 3. Write a ML Pipeline

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

In [27]:
pipeline = Pipeline(
[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('rf_multi', MultiOutputClassifier(RandomForestClassifier()))
]
)

## 4. Train pipeline

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)

In [30]:
pipeline.fit(X_train,y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x7fa1d88ef550>)),
                ('rf_multi',
                 MultiOutputClassifier(estimator=RandomForestClassifier()))])

## 5. Test your model

In [36]:
from sklearn.metrics import classification_report

In [37]:
y_pred = pipeline.predict(X_test)

In [38]:
y_pred[0:3]

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [39]:
y_test.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
13763,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,1,0
21716,1,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16002,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3745,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12096,1,0,0,1,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,0,1


In [40]:
y_test.columns

Index(['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')

In [41]:
y_pred[:,0]

array([1, 1, 1, ..., 0, 1, 1])

In [43]:
y_test = y_test.applymap(int)

In [44]:
for colno in range(y_test.shape[1]):
    print("Target Column = ", y_test.columns[colno])
    print(classification_report(y_test.iloc[:,colno],y_pred[:,colno]))

Target Column =  related
              precision    recall  f1-score   support

           0       0.68      0.44      0.54      1509
           1       0.85      0.93      0.89      5004
           2       0.31      0.41      0.36        41

    accuracy                           0.82      6554
   macro avg       0.61      0.60      0.59      6554
weighted avg       0.81      0.82      0.80      6554

Target Column =  request
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      5419
           1       0.83      0.50      0.63      1135

    accuracy                           0.90      6554
   macro avg       0.87      0.74      0.78      6554
weighted avg       0.89      0.90      0.89      6554

Target Column =  offer
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6531
           1       0.00      0.00      0.00        23

    accuracy                           1.00      6554
 

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.87      0.99      0.93      5687
           1       0.52      0.04      0.08       867

    accuracy                           0.87      6554
   macro avg       0.70      0.52      0.50      6554
weighted avg       0.83      0.87      0.82      6554

Target Column =  infrastructure_related
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      6097
           1       0.31      0.01      0.02       457

    accuracy                           0.93      6554
   macro avg       0.62      0.50      0.49      6554
weighted avg       0.89      0.93      0.90      6554

Target Column =  transport
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      6241
           1       0.80      0.11      0.19       313

    accuracy                           0.96      6554
   macro avg       0.88      0.55      0.58      6554
weighte

## 6. Save model

In [45]:
import pickle

In [50]:
model_filepath = 'models/classifier_without_tuning.pkl'

In [52]:
with open(model_filepath, 'wb') as f:
    pickle.dump(pipeline, f)