# ML Pipeline
Applied commands that constitude an Machine Learning (ML) Pipeline.
### Libraries & Datasets
- Import Python libraries
- Load dataset from database 
- Define feature and target variables X and Y

In [1]:
# Importing required libraries
import re
import pickle
import pandas as pd 
from sqlalchemy import create_engine
import nltk 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline 
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split,  GridSearchCV 
from sklearn.metrics import classification_report


nltk.download(['wordnet', 'punkt', 'stopwords'])

[nltk_data] Downloading package wordnet to /Users/NS/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/NS/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/NS/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# load data from database

engine = create_engine('sqlite:///Data/disaster_messages_database.db')
df = pd.read_sql_table('disaster_messages_table',engine)

X = df['message']  # Extracting the 'message'column
Y = df.iloc[:, 4:] # Extracting classified labels 

In [8]:
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Tokenization Function For Text Data Processing  

In [9]:
def tokenize(text):
    """
    Function: splits the texts into words and returns the root form of the words
    Args:
      text(str): the message
    Return:
      lemm(list of str): a list of the root form of the message words
    """
    # Normalizing text (a-zA-Z0-9 matches all allalphanumeric characters)
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    # Tokenizing text
    words = word_tokenize(text)
    
    # Removing stop words
    stop = stopwords.words("english")
    words = [t for t in words if t not in stop]
    
    # Lemmatization
    lemm = [WordNetLemmatizer().lemmatize(w) for w in words]
    return lemm

### Constructing Machine Learning Pipeline
This machine pipeline will take in the `message` column as input and returns classification results on the other 36 categories in the dataset.

In [10]:
# Pipleine 1: Random Forest Classifier

pipeline_rfc = Pipeline([
        ('vect', CountVectorizer(tokenizer = tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf',  MultiOutputClassifier(RandomForestClassifier()))
    ])


# Pipeline 2: Naive Bayes classifier

pipeline_nbc = Pipeline([
        ('vect', CountVectorizer(tokenizer = tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf',  MultiOutputClassifier(MultinomialNB()))
    ])


# Pipleine 3: Adaboost Classifier 

pipeline_ada = Pipeline([
        ('vect', CountVectorizer(tokenizer = tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf',  MultiOutputClassifier(AdaBoostClassifier()))
    ])

### Training Pipelines
- Split data into train and test sets
- Train pipeline

In [11]:
# Spiliting data
X_train, X_test, y_train, y_test = train_test_split(X, Y)

In [12]:
# Fit the Random Forest Classifier 
pipeline_rfc.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x7fd0b037eee0>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier()))])

In [13]:
# Fit the Naive Bayes classifier 
pipeline_nbc.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x7fd0b037eee0>)),
                ('tfidf', TfidfTransformer()),
                ('clf', MultiOutputClassifier(estimator=MultinomialNB()))])

In [14]:
# Fit the Adaboost Classifier 
pipeline_ada.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x7fd0b037eee0>)),
                ('tfidf', TfidfTransformer()),
                ('clf', MultiOutputClassifier(estimator=AdaBoostClassifier()))])

### Testing Models
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [16]:
def plot_scores(y_test, y_pred):
    #Testing the model
    # Printing the classification report for each label
    i = 0
    for col in y_test:
        print('Feature {}: {}'.format(i+1, col))
        print(classification_report(y_test[col], y_pred[:, i]))
        i = i + 1
    accuracy = (y_pred == y_test.values).mean()
    print('The model accuracy is {:.3f}'.format(accuracy))

In [17]:
# Prediction: the Random Forest Classifier  
y_pred = pipeline_rfc.predict(X_test)
plot_scores(y_test, y_pred)

Feature 1: related
              precision    recall  f1-score   support

           0       0.71      0.42      0.53      1577
           1       0.84      0.94      0.89      4927
           2       0.23      0.42      0.30        50

    accuracy                           0.81      6554
   macro avg       0.59      0.59      0.57      6554
weighted avg       0.80      0.81      0.80      6554

Feature 2: request
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      5445
           1       0.83      0.50      0.63      1109

    accuracy                           0.90      6554
   macro avg       0.87      0.74      0.78      6554
weighted avg       0.89      0.90      0.89      6554

Feature 3: offer
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6524
           1       0.00      0.00      0.00        30

    accuracy                           1.00      6554
   macro avg       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.87      1.00      0.93      5664
           1       0.59      0.03      0.06       890

    accuracy                           0.87      6554
   macro avg       0.73      0.52      0.50      6554
weighted avg       0.83      0.87      0.81      6554

Feature 20: infrastructure_related
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      6130
           1       0.00      0.00      0.00       424

    accuracy                           0.94      6554
   macro avg       0.47      0.50      0.48      6554
weighted avg       0.87      0.94      0.90      6554

Feature 21: transport
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      6258
           1       0.77      0.08      0.15       296

    accuracy                           0.96      6554
   macro avg       0.87      0.54      0.56      6554
weighted avg     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
# Prediction: the Naive Bayes classifier 
y_pred = pipeline_nbc.predict(X_test)
plot_scores(y_test, y_pred)

Feature 1: related
              precision    recall  f1-score   support

           0       0.81      0.09      0.17      1577
           1       0.77      0.99      0.87      4927
           2       0.00      0.00      0.00        50

    accuracy                           0.77      6554
   macro avg       0.53      0.36      0.34      6554
weighted avg       0.77      0.77      0.69      6554

Feature 2: request
              precision    recall  f1-score   support

           0       0.86      0.99      0.92      5445
           1       0.82      0.22      0.35      1109

    accuracy                           0.86      6554
   macro avg       0.84      0.60      0.63      6554
weighted avg       0.86      0.86      0.82      6554

Feature 3: offer
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6524
           1       0.00      0.00      0.00        30

    accuracy                           1.00      6554
   macro avg       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

              precision    recall  f1-score   support

           0       0.86      1.00      0.93      5664
           1       0.00      0.00      0.00       890

    accuracy                           0.86      6554
   macro avg       0.43      0.50      0.46      6554
weighted avg       0.75      0.86      0.80      6554

Feature 20: infrastructure_related
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      6130
           1       0.00      0.00      0.00       424

    accuracy                           0.94      6554
   macro avg       0.47      0.50      0.48      6554
weighted avg       0.87      0.94      0.90      6554

Feature 21: transport
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      6258
           1       0.00      0.00      0.00       296

    accuracy                           0.95      6554
   macro avg       0.48      0.50      0.49      6554
weighted avg     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      6428
           1       0.00      0.00      0.00       126

    accuracy                           0.98      6554
   macro avg       0.49      0.50      0.50      6554
weighted avg       0.96      0.98      0.97      6554

Feature 35: other_weather
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      6211
           1       0.00      0.00      0.00       343

    accuracy                           0.95      6554
   macro avg       0.47      0.50      0.49      6554
weighted avg       0.90      0.95      0.92      6554

Feature 36: direct_report
              precision    recall  f1-score   support

           0       0.83      0.99      0.90      5299
           1       0.77      0.16      0.27      1255

    accuracy                           0.83      6554
   macro avg       0.80      0.58      0.59      6554
weighted avg       0.8

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
# Prediction: the Adaboost Classifier 
y_pred = pipeline_ada.predict(X_test)
plot_scores(y_test, y_pred)

Feature 1: related
              precision    recall  f1-score   support

           0       0.68      0.09      0.16      1577
           1       0.77      0.99      0.86      4927
           2       0.36      0.10      0.16        50

    accuracy                           0.76      6554
   macro avg       0.60      0.39      0.39      6554
weighted avg       0.74      0.76      0.69      6554

Feature 2: request
              precision    recall  f1-score   support

           0       0.91      0.96      0.93      5445
           1       0.74      0.51      0.60      1109

    accuracy                           0.89      6554
   macro avg       0.83      0.74      0.77      6554
weighted avg       0.88      0.89      0.88      6554

Feature 3: offer
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6524
           1       0.00      0.00      0.00        30

    accuracy                           0.99      6554
   macro avg       

### Improving Models
Grid Search can be utilised to find optimal parameters. 

In [20]:
# Displaying parameters for the pipline
pipeline_rfc.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(tokenizer=<function tokenize at 0x7fd0b037eee0>)),
  ('tfidf', TfidfTransformer()),
  ('clf', MultiOutputClassifier(estimator=RandomForestClassifier()))],
 'verbose': False,
 'vect': CountVectorizer(tokenizer=<function tokenize at 0x7fd0b037eee0>),
 'tfidf': TfidfTransformer(),
 'clf': MultiOutputClassifier(estimator=RandomForestClassifier()),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': <function __main__.tokenize(text)>,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf_

In [22]:
# Displaying parameters for the pipline
pipeline_nbc.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(tokenizer=<function tokenize at 0x7fd0b037eee0>)),
  ('tfidf', TfidfTransformer()),
  ('clf', MultiOutputClassifier(estimator=MultinomialNB()))],
 'verbose': False,
 'vect': CountVectorizer(tokenizer=<function tokenize at 0x7fd0b037eee0>),
 'tfidf': TfidfTransformer(),
 'clf': MultiOutputClassifier(estimator=MultinomialNB()),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': <function __main__.tokenize(text)>,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 

In [21]:
# Displaying parameters for the pipline
pipeline_ada.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(tokenizer=<function tokenize at 0x7fd0b037eee0>)),
  ('tfidf', TfidfTransformer()),
  ('clf', MultiOutputClassifier(estimator=AdaBoostClassifier()))],
 'verbose': False,
 'vect': CountVectorizer(tokenizer=<function tokenize at 0x7fd0b037eee0>),
 'tfidf': TfidfTransformer(),
 'clf': MultiOutputClassifier(estimator=AdaBoostClassifier()),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': <function __main__.tokenize(text)>,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf

In [25]:
# Creating Grid search parameters for Random Forest Classifier   
parameters_rfc = {
        'tfidf__use_idf': (True, False),
        'clf__estimator__n_estimators': [10, 20]
}

cv_rfc = GridSearchCV(pipeline_rfc, param_grid = parameters_rfc)
cv_rfc

GridSearchCV(estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x7fd0b037eee0>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        MultiOutputClassifier(estimator=RandomForestClassifier()))]),
             param_grid={'clf__estimator__n_estimators': [10, 20],
                         'tfidf__use_idf': (True, False)})

In [26]:
# Creating Grid search parameters for Adaboost Classifier 
parameters_ada = {
        'tfidf__use_idf': (True, False),
        'clf__estimator__n_estimators': [50, 60, 70]
}

cv_ada = GridSearchCV(pipeline_ada, param_grid = parameters_ada)
cv_ada

GridSearchCV(estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x7fd0b037eee0>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        MultiOutputClassifier(estimator=AdaBoostClassifier()))]),
             param_grid={'clf__estimator__n_estimators': [50, 60, 70],
                         'tfidf__use_idf': (True, False)})

### Testing Models
Show the accuracy, precision, and recall of the tuned model.  

This project mainly concentarets on code quality, process, and  pipelines. There are no requirements of minimum performance metric needed. Althought,  ensuring optimisation to fine tune your models for accuracy, precision and recall is gold standard practice.

In [27]:
# Fit the first tuned model
cv_rfc.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x7fd0b037eee0>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        MultiOutputClassifier(estimator=RandomForestClassifier()))]),
             param_grid={'clf__estimator__n_estimators': [10, 20],
                         'tfidf__use_idf': (True, False)})

In [28]:
# Fit the second tuned model
cv_ada.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x7fd0b037eee0>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        MultiOutputClassifier(estimator=AdaBoostClassifier()))]),
             param_grid={'clf__estimator__n_estimators': [50, 60, 70],
                         'tfidf__use_idf': (True, False)})

In [29]:
# Predicting using the first tuned model for RFC
y_pred = cv_rfc.predict(X_test)
plot_scores(y_test, y_pred)

Feature 1: related
              precision    recall  f1-score   support

           0       0.68      0.45      0.54      1577
           1       0.84      0.93      0.88      4927
           2       0.23      0.40      0.29        50

    accuracy                           0.81      6554
   macro avg       0.59      0.59      0.57      6554
weighted avg       0.80      0.81      0.79      6554

Feature 2: request
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      5445
           1       0.81      0.45      0.58      1109

    accuracy                           0.89      6554
   macro avg       0.85      0.72      0.76      6554
weighted avg       0.88      0.89      0.88      6554

Feature 3: offer
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6524
           1       0.00      0.00      0.00        30

    accuracy                           1.00      6554
   macro avg       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.98      1.00      0.99      6445
           1       0.64      0.06      0.12       109

    accuracy                           0.98      6554
   macro avg       0.81      0.53      0.55      6554
weighted avg       0.98      0.98      0.98      6554

Feature 15: money
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      6399
           1       1.00      0.03      0.06       155

    accuracy                           0.98      6554
   macro avg       0.99      0.52      0.53      6554
weighted avg       0.98      0.98      0.97      6554

Feature 16: missing_people
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      6484
           1       1.00      0.03      0.06        70

    accuracy                           0.99      6554
   macro avg       0.99      0.51      0.53      6554
weighted avg       0.99      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
# Predicting using the second tuned model for ADA
y_pred = cv_ada.predict(X_test)
plot_scores(y_test, y_pred)

Feature 1: related
              precision    recall  f1-score   support

           0       0.68      0.11      0.18      1577
           1       0.77      0.98      0.86      4927
           2       0.31      0.10      0.15        50

    accuracy                           0.76      6554
   macro avg       0.59      0.40      0.40      6554
weighted avg       0.74      0.76      0.69      6554

Feature 2: request
              precision    recall  f1-score   support

           0       0.91      0.96      0.93      5445
           1       0.74      0.51      0.61      1109

    accuracy                           0.89      6554
   macro avg       0.82      0.74      0.77      6554
weighted avg       0.88      0.89      0.88      6554

Feature 3: offer
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6524
           1       0.00      0.00      0.00        30

    accuracy                           0.99      6554
   macro avg       

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      4753
           1       0.85      0.66      0.75      1801

    accuracy                           0.88      6554
   macro avg       0.87      0.81      0.83      6554
weighted avg       0.87      0.88      0.87      6554

Feature 30: floods
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      6025
           1       0.83      0.58      0.69       529

    accuracy                           0.96      6554
   macro avg       0.90      0.79      0.83      6554
weighted avg       0.95      0.96      0.95      6554

Feature 31: storm
              precision    recall  f1-score   support

           0       0.95      0.98      0.97      5938
           1       0.76      0.53      0.62       616

    accuracy                           0.94      6554
   macro avg       0.85      0.75      0.79      6554
weighted avg       0.93      0.94    

### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF