## 1. Import libraries and load data from database
- Import python libraries
- Load dataset from database with read_sql_table
- Define feature and target variables X & Y

In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

from sqlalchemy import create_engine

In [3]:
#load engine from database 
engine = create_engine('sqlite:///data/DisasterResponse.db')

In [4]:
df = pd.read_sql_table('DisasterResponse1.sql',engine)

In [5]:
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
X = df['message']

In [7]:
y = df.drop(['id','message','original','genre'],axis=1)

## 2. Write a tokenization function to process your data

In [9]:
import re
import nltk 

In [10]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/nirzaree/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nirzaree/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/nirzaree/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Try out the text processing steps and then write a function for it

In [12]:
text = X[0]
text

'Weather update - a cold front from Cuba that could pass over Haiti'

In [13]:
lemmatizer = WordNetLemmatizer()

In [14]:
text = text.lower()

In [15]:
text

'weather update - a cold front from cuba that could pass over haiti'

In [16]:
text = re.sub(r'[^A-Za-z0-9]',' ',text)
text

'weather update   a cold front from cuba that could pass over haiti'

In [17]:
text = word_tokenize(text)
text

['weather',
 'update',
 'a',
 'cold',
 'front',
 'from',
 'cuba',
 'that',
 'could',
 'pass',
 'over',
 'haiti']

In [18]:
stop_words = stopwords.words('english')
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [19]:
text = [x for x in text if x not in stop_words]
text

['weather', 'update', 'cold', 'front', 'cuba', 'could', 'pass', 'haiti']

In [20]:
lem_tok = []

In [21]:
for token in text: 
    lem_tok.append(lemmatizer.lemmatize(token))

In [22]:
lem_tok

['weather', 'update', 'cold', 'front', 'cuba', 'could', 'pas', 'haiti']

In [23]:
def tokenize(text):
    text = text.lower()
    text = re.sub(r'[^A-Za-z0-9]',' ',text)
    text = word_tokenize(text)
    stop_words = stopwords.words('english')
    text = [x for x in text if x not in stop_words]
    lemmatizer = WordNetLemmatizer()
    lem_tokens = []
    for token in text:
        lem_tokens.append(lemmatizer.lemmatize(token))
    return lem_tokens    

In [24]:
tokenize(X[0])

['weather', 'update', 'cold', 'front', 'cuba', 'could', 'pas', 'haiti']

### TFIDF

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
test_tfidf = ["I like India","India is my homeland","I have travelled to Asia"]

In [27]:
tokenized_test = [tokenize(x) for x in test_tfidf] 

In [28]:
tfidf = TfidfVectorizer(tokenizer=tokenize)

In [29]:
tfidf.fit_transform(test_tfidf)

<3x5 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [31]:
tfidf.fit_transform(test_tfidf).toarray()

array([[0.        , 0.        , 0.60534851, 0.79596054, 0.        ],
       [0.        , 0.79596054, 0.60534851, 0.        , 0.        ],
       [0.70710678, 0.        , 0.        , 0.        , 0.70710678]])

## 3. Write a ML Pipeline

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

In [37]:
pipeline = Pipeline(
[
    ('vect', TfidfVectorizer(tokenizer=tokenize)),
    ('rf_multi', MultiOutputClassifier(RandomForestClassifier()))
]
)

## 4. Train pipeline

In [40]:
from sklearn.model_selection import train_test_split

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)

In [42]:
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   TfidfVectorizer(tokenizer=<function tokenize at 0x7f96e61cfee0>)),
  ('rf_multi', MultiOutputClassifier(estimator=RandomForestClassifier()))],
 'verbose': False,
 'vect': TfidfVectorizer(tokenizer=<function tokenize at 0x7f96e61cfee0>),
 'rf_multi': MultiOutputClassifier(estimator=RandomForestClassifier()),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.float64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__norm': 'l2',
 'vect__preprocessor': None,
 'vect__smooth_idf': True,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__sublinear_tf': False,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': <function __main__.tokenize(text)>,
 'vect__use_idf': True,
 'vect__vocabulary': None,
 'rf_multi__estimator__bootstrap': Tru

In [43]:
from sklearn.model_selection import GridSearchCV

In [44]:
CV = GridSearchCV(pipeline,
             param_grid={'rf_multi__estimator__n_estimators': [50,100]},
             cv=2,
             n_jobs=1,
             verbose=3
            )

In [45]:
CV.fit(X_train,y_train)

Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV] rf_multi__estimator__n_estimators=50 ............................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  rf_multi__estimator__n_estimators=50, score=0.258, total= 1.4min
[CV] rf_multi__estimator__n_estimators=50 ............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.4min remaining:    0.0s


[CV]  rf_multi__estimator__n_estimators=50, score=0.257, total= 1.5min
[CV] rf_multi__estimator__n_estimators=100 ...........................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.9min remaining:    0.0s


[CV]  rf_multi__estimator__n_estimators=100, score=0.261, total= 2.7min
[CV] rf_multi__estimator__n_estimators=100 ...........................
[CV]  rf_multi__estimator__n_estimators=100, score=0.257, total= 2.6min


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  8.2min finished


GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(tokenizer=<function tokenize at 0x7f96e61cfee0>)),
                                       ('rf_multi',
                                        MultiOutputClassifier(estimator=RandomForestClassifier()))]),
             n_jobs=1,
             param_grid={'rf_multi__estimator__n_estimators': [50, 100]},
             verbose=3)

In [46]:
CV.predict(X_test)

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

## 5. Test your model

In [47]:
from sklearn.metrics import classification_report

In [48]:
y_pred = CV.predict(X_test)

In [49]:
y_pred[0:3]

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [50]:
y_test.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
13763,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,1,0
21716,1,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16002,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3745,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12096,1,0,0,1,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,0,1


In [51]:
y_test.columns

Index(['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')

In [77]:
print(classification_report(y_test.iloc[:,0],y_pred[:,0]))

              precision    recall  f1-score   support

           0       0.70      0.40      0.51      1509
           1       0.84      0.95      0.89      5004
           2       0.50      0.49      0.49        41

    accuracy                           0.82      6554
   macro avg       0.68      0.61      0.63      6554
weighted avg       0.80      0.82      0.80      6554



In [83]:
y_test.iloc[:,15].value_counts()

0    6473
1      81
Name: missing_people, dtype: int64

In [63]:
for colno in range(y_test.shape[1]):
    print("Target Column = ", y_test.columns[colno])
    print(classification_report(y_test.iloc[:,colno],y_pred[:,colno]))

Target Column =  related
              precision    recall  f1-score   support

           0       0.70      0.40      0.51      1509
           1       0.84      0.95      0.89      5004
           2       0.50      0.49      0.49        41

    accuracy                           0.82      6554
   macro avg       0.68      0.61      0.63      6554
weighted avg       0.80      0.82      0.80      6554

Target Column =  request
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      5419
           1       0.83      0.49      0.61      1135

    accuracy                           0.89      6554
   macro avg       0.87      0.73      0.78      6554
weighted avg       0.89      0.89      0.88      6554

Target Column =  offer
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6531
           1       0.00      0.00      0.00        23

    accuracy                           1.00      6554
 

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6554

    accuracy                           1.00      6554
   macro avg       1.00      1.00      1.00      6554
weighted avg       1.00      1.00      1.00      6554

Target Column =  water
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      6130
           1       0.87      0.33      0.48       424

    accuracy                           0.95      6554
   macro avg       0.91      0.66      0.73      6554
weighted avg       0.95      0.95      0.94      6554

Target Column =  food
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      5827
           1       0.86      0.65      0.74       727

    accuracy                           0.95      6554
   macro avg       0.91      0.82      0.86      6554
weighted avg       0.95      0.95      0.95      6554

Target Column =  shelter
   

In [74]:
## test for a new input
test_input = ["I have not had anything to eat in the last 2 days"]

In [75]:
test_pred = CV.predict(test_input)

In [76]:
test_pred = [int(x) for interim in test_pred for x in interim]


In [77]:
dict(zip(y_train.columns,test_pred))

{'related': 1,
 'request': 0,
 'offer': 0,
 'aid_related': 0,
 'medical_help': 0,
 'medical_products': 0,
 'search_and_rescue': 0,
 'security': 0,
 'military': 0,
 'child_alone': 0,
 'water': 0,
 'food': 0,
 'shelter': 0,
 'clothing': 0,
 'money': 0,
 'missing_people': 0,
 'refugees': 0,
 'death': 0,
 'other_aid': 0,
 'infrastructure_related': 0,
 'transport': 0,
 'buildings': 0,
 'electricity': 0,
 'tools': 0,
 'hospitals': 0,
 'shops': 0,
 'aid_centers': 0,
 'other_infrastructure': 0,
 'weather_related': 0,
 'floods': 0,
 'storm': 0,
 'fire': 0,
 'earthquake': 0,
 'cold': 0,
 'other_weather': 0,
 'direct_report': 0}

In [121]:
elec_inp = X_train[ y_train['electricity'] == '1']

In [129]:
elec_inp.iloc[4]

'The storm destroyed 275,000 acres (111,288 hectares) of farmland, breached 600 irrigation canals, swept away hundreds of bridges and roads and snapped electricity supplies across wide swathes of the state.'

## 6. Save model

In [45]:
import pickle

In [49]:
model_filepath = '../models/classifier.pkl'

In [50]:
with open(model_filepath, 'wb') as f:
    pickle.dump(pipeline, f)
