# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [13]:
# import libraries
import re
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
import nltk
import sys
import os

# Download required NLTK resources 
nltk.download(['punkt','punkt_tab','wordnet'], quiet=True)
#nltk.download('omw-1.4')
#nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
import pickle

import warnings
import logging

# Suppress warnings
warnings.filterwarnings('ignore')

# Suppress logging
logging.getLogger().setLevel(logging.ERROR)

In [14]:
def load_data(database_filepath=None):
    """
    Load the messages and their category labels from a SQLite database.

    The path is resolved against the parent of the current working
    directory. An absolute ``database_filepath`` is used unchanged, because
    ``os.path.join`` discards the components that precede an absolute one.

    Parameters:
    database_filepath (str, optional): Path to the SQLite database. When
        omitted, ``../data/DisasterResponse.db`` (relative to the current
        working directory) is used.

    Returns:
    X (pandas.Series): Feature data (the ``message`` column)
    Y (pandas.DataFrame): Target data (every column from the 5th onwards)
    category_names (pandas.Index): Column labels of Y
    """
    # One directory above the working directory is treated as the project root
    project_root = os.path.join(os.getcwd(), '..')

    if database_filepath is None:
        database_filepath = os.path.join(project_root, 'data', 'DisasterResponse.db')
    else:
        database_filepath = os.path.join(project_root, database_filepath)

    # Read the whole table, then release the engine's pooled connection so the
    # database file is not kept open after loading.
    engine = create_engine(f'sqlite:///{database_filepath}')
    try:
        df = pd.read_sql_table('DisasterResponse', engine)
    finally:
        engine.dispose()

    # Define features and target
    X = df['message']
    Y = df.iloc[:, 4:]  # All columns from the 5th column onwards
    category_names = Y.columns

    return X, Y, category_names

### 2. Write a tokenization function to process your text data

In [15]:
def tokenize(text):
    """
    Convert a raw message into a list of normalized tokens.

    URLs are swapped for the literal ``urlplaceholder``, the text is
    lower-cased and split with NLTK's ``word_tokenize``, and every token is
    lemmatized and stripped of surrounding whitespace.

    Parameters:
    text (str): Text to be processed

    Returns:
    list: Cleaned tokens, in their original order
    """
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # Mask every detected URL so links collapse into one shared token
    for found_url in re.findall(url_regex, text):
        text = text.replace(found_url, "urlplaceholder")

    words = word_tokenize(text.lower())
    lemmatizer = WordNetLemmatizer()

    return [lemmatizer.lemmatize(word).strip() for word in words]

### 3. Build a machine learning pipeline
This machine learning pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [16]:
def build_model():
    """
    Assemble the text-classification pipeline and wrap it in a grid search.

    The pipeline counts tokens produced by ``tokenize``, applies TF-IDF
    weighting and trains one random forest per output column. The grid has
    2 x 2 x 2 x 2 = 16 candidates, each evaluated with 3-fold
    cross-validation.

    Returns:
    GridSearchCV: Unfitted grid search over the pipeline
    """
    steps = [
        ('cvect', CountVectorizer(tokenizer=tokenize, token_pattern=None)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))),
    ]

    # Keys follow scikit-learn's <step>__<parameter> naming
    search_space = dict(
        cvect__max_features=[100, 200],
        cvect__ngram_range=[(1, 1), (1, 2)],
        tfidf__use_idf=[True, False],
        clf__estimator__n_estimators=[50, 100],
    )

    return GridSearchCV(Pipeline(steps), param_grid=search_space, cv=3, verbose=0, n_jobs=-1)

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [17]:
# Database location, resolved by load_data against the parent directory
database_filepath = 'data/DisasterResponse.db'

# Messages (X), label matrix (Y) and the label column names
X, Y, category_names = load_data(database_filepath)

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Grid search over the pipeline, fitted on the training portion only
model = build_model()
model.fit(X_train, Y_train)

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [18]:
# Predict labels for the held-out messages with the fitted grid-search model
Y_pred = model.predict(X_test)

In [19]:
def _show_category_report(category, y_true, y_pred_cat):
    """Print a banner for one category and display its styled classification report."""
    report = classification_report(y_true, y_pred_cat, output_dict=True, zero_division=0)
    df_report = pd.DataFrame(report).round(2)

    print(f'\n{"="*50}')
    print(f'Category: {category}')
    print(f'{"="*50}')

    # Style with colorblind-friendly colors
    styled_df = df_report.style\
        .background_gradient(cmap='YlOrBr')\
        .format(precision=2)\
        .set_table_styles([
            {'selector': 'th', 'props': [('background-color', '#eee'),
                                      ('color', '#000'),
                                      ('font-weight', 'bold')]},
            {'selector': '', 'props': [('border', '2px solid #333')]},
            {'selector': 'tbody td', 'props': [('border', '1px solid #666')]}
        ])\
        .highlight_max(axis=1, color='#8cc2dd')\
        .highlight_min(axis=1, color='#ffe5cc')

    display(styled_df)


def evaluate_model(Y_test, Y_pred, category_names):
    """
    Display a styled classification report for every category.

    Parameters:
    Y_test (pandas.DataFrame): True labels, one column per category
    Y_pred (array with a ``shape`` attribute): Predictions. A 2-D array is
        read column by column; a 1-D array is compared against the first
        column of ``Y_test`` only.
    category_names (sequence): Category names, in column order

    Returns:
    None. The reports are printed and rendered with ``display``.
    """
    if len(Y_pred.shape) == 1:
        # A single prediction vector can only be scored against one column
        if len(category_names) > 1:
            print("Warning: y_pred is 1-dimensional but multiple categories were provided.")
            print("Make sure your model is properly configured for multi-label classification.")

        _show_category_report(category_names[0], Y_test.iloc[:, 0], Y_pred)
        return

    for idx, category in enumerate(category_names):
        _show_category_report(category, Y_test.iloc[:, idx], Y_pred[:, idx])

In [20]:
def predict_message(message, categories, classifier=None):
    """
    Predict categories for a single message

    Parameters:
    message (str): The input message to classify
    categories (list): Category names, in the order of the model's outputs
    classifier (optional): Fitted object exposing ``predict``. When omitted,
        the module-level ``model`` is used, as before.

    Returns:
    dict: Categories whose prediction equals 1, mapped to that prediction
    """
    # Fall back to the notebook-level model so existing two-argument calls still work
    estimator = model if classifier is None else classifier

    # predict expects a batch, so wrap the message and take the only row back out
    prediction = estimator.predict([message])[0]

    # Pair each category with its predicted label
    results = dict(zip(categories, prediction))

    # Return only categories with prediction of 1
    return {category: result for category, result in results.items() if result == 1}

# Classify one example message; category_names supplies the output labels
sample_message = "There's a flood in Houston, Texas. We need water and medical supplies."
predictions = predict_message(sample_message, category_names)

print("\nSample Message Predictions:")
print(f"Message: {sample_message}")
print(f"Predictions: {predictions}")



Sample Message Predictions:
Message: There's a flood in Houston, Texas. We need water and medical supplies.
Predictions: {'related': np.int64(1), 'request': np.int64(1), 'aid_related': np.int64(1), 'medical_help': np.int64(1), 'medical_products': np.int64(1), 'water': np.int64(1), 'weather_related': np.int64(1), 'floods': np.int64(1), 'direct_report': np.int64(1)}


In [21]:
# Sanity check: shape of a single prediction row (one entry per category)
Y_pred[111].shape

(36,)

In [22]:
# Print a plain-text classification report for every label column.
# Iterating the columns (rather than a fixed count) keeps this correct when
# the number of categories changes.
for i, column in enumerate(Y_test.columns):
    print("=======================", column, "======================")
    print(classification_report(Y_test.iloc[:, i], Y_pred[:, i], zero_division=0))

              precision    recall  f1-score   support

           0       0.65      0.31      0.43      1245
           1       0.82      0.95      0.88      3998

    accuracy                           0.80      5243
   macro avg       0.74      0.63      0.65      5243
weighted avg       0.78      0.80      0.77      5243

              precision    recall  f1-score   support

           0       0.90      0.98      0.94      4352
           1       0.84      0.50      0.62       891

    accuracy                           0.90      5243
   macro avg       0.87      0.74      0.78      5243
weighted avg       0.89      0.90      0.89      5243

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5219
           1       0.00      0.00      0.00        24

    accuracy                           1.00      5243
   macro avg       0.50      0.50      0.50      5243
weighted avg       0.99      1.00      0.99      5243

              preci

### 6. Improve your model
Use grid search to find better parameters. 

In [23]:
# Reduced search space: 2 vocabulary sizes x 2 n-gram ranges = 4 candidates
parameters = dict(
    cvect__max_features=[100, 50],
    cvect__ngram_range=[(1, 1), (1, 2)],
)

# Everything outside the grid is pinned to a single setting
pipeline = Pipeline(steps=[
    ('cvect', CountVectorizer(tokenizer=tokenize, token_pattern=None)),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', MultiOutputClassifier(
        RandomForestClassifier(n_estimators=100, min_samples_split=2)
    )),
])

In [24]:
# 4 candidates evaluated on 5 folds each gives 20 fits in total
with warnings.catch_warnings():
    # Silence warnings for the duration of the search only
    warnings.simplefilter("ignore")
    cv_pipeline = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        cv=5,
        n_jobs=-1,
        verbose=2,
    )
    cv_pipeline.fit(X_train, Y_train)


Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [26]:
# Report the winning parameter combination
print(f"Best parameters: {cv_pipeline.best_params_}")

# Score the tuned search object on the held-out split
Y_pred = cv_pipeline.predict(X_test)
print("\nModel Performance with best parameters:")
evaluate_model(Y_test, Y_pred, category_names)

Best parameters: {'cvect__max_features': 100, 'cvect__ngram_range': (1, 1)}

Model Performance with best parameters:

Category: related


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.63,0.81,0.79,0.72,0.77
recall,0.29,0.95,0.79,0.62,0.79
f1-score,0.4,0.87,0.79,0.64,0.76
support,1245.0,3998.0,0.79,5243.0,5243.0



Category: request


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.91,0.81,0.9,0.86,0.89
recall,0.98,0.5,0.9,0.74,0.9
f1-score,0.94,0.62,0.9,0.78,0.89
support,4352.0,891.0,0.9,5243.0,5243.0



Category: offer


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,1.0,0.0,1.0,0.5,0.99
recall,1.0,0.0,1.0,0.5,1.0
f1-score,1.0,0.0,1.0,0.5,0.99
support,5219.0,24.0,1.0,5243.0,5243.0



Category: aid_related


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.73,0.71,0.73,0.72,0.72
recall,0.84,0.56,0.73,0.7,0.73
f1-score,0.78,0.63,0.73,0.7,0.72
support,3079.0,2164.0,0.73,5243.0,5243.0



Category: medical_help


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.92,0.42,0.92,0.67,0.88
recall,1.0,0.01,0.92,0.51,0.92
f1-score,0.96,0.02,0.92,0.49,0.88
support,4808.0,435.0,0.92,5243.0,5243.0



Category: medical_products


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.95,0.6,0.95,0.77,0.93
recall,1.0,0.02,0.95,0.51,0.95
f1-score,0.97,0.04,0.95,0.51,0.92
support,4964.0,279.0,0.95,5243.0,5243.0



Category: search_and_rescue


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.97,0.0,0.97,0.49,0.95
recall,1.0,0.0,0.97,0.5,0.97
f1-score,0.99,0.0,0.97,0.49,0.96
support,5107.0,136.0,0.97,5243.0,5243.0



Category: security


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.98,0.5,0.98,0.74,0.97
recall,1.0,0.01,0.98,0.51,0.98
f1-score,0.99,0.02,0.98,0.51,0.97
support,5147.0,96.0,0.98,5243.0,5243.0



Category: military


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.97,0.0,0.97,0.48,0.94
recall,1.0,0.0,0.97,0.5,0.97
f1-score,0.98,0.0,0.97,0.49,0.95
support,5085.0,158.0,0.97,5243.0,5243.0



Category: child_alone


Unnamed: 0,0,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0
support,5243.0,1.0,5243.0,5243.0



Category: water


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.96,0.81,0.96,0.89,0.96
recall,0.99,0.47,0.96,0.73,0.96
f1-score,0.98,0.6,0.96,0.79,0.95
support,4908.0,335.0,0.96,5243.0,5243.0



Category: food


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.96,0.83,0.95,0.9,0.95
recall,0.98,0.68,0.95,0.83,0.95
f1-score,0.97,0.75,0.95,0.86,0.95
support,4659.0,584.0,0.95,5243.0,5243.0



Category: shelter


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.94,0.83,0.94,0.89,0.93
recall,0.99,0.36,0.94,0.68,0.94
f1-score,0.97,0.5,0.94,0.73,0.92
support,4775.0,468.0,0.94,5243.0,5243.0



Category: clothing


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.99,0.0,0.99,0.49,0.97
recall,1.0,0.0,0.99,0.5,0.99
f1-score,0.99,0.0,0.99,0.5,0.98
support,5173.0,70.0,0.99,5243.0,5243.0



Category: money


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.98,0.67,0.98,0.82,0.97
recall,1.0,0.04,0.98,0.52,0.98
f1-score,0.99,0.07,0.98,0.53,0.97
support,5131.0,112.0,0.98,5243.0,5243.0



Category: missing_people


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.99,0.0,0.99,0.49,0.98
recall,1.0,0.0,0.99,0.5,0.99
f1-score,0.99,0.0,0.99,0.5,0.98
support,5180.0,63.0,0.99,5243.0,5243.0



Category: refugees


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.97,0.0,0.97,0.48,0.94
recall,1.0,0.0,0.97,0.5,0.97
f1-score,0.98,0.0,0.97,0.49,0.95
support,5073.0,170.0,0.97,5243.0,5243.0



Category: death


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.95,0.73,0.95,0.84,0.94
recall,1.0,0.03,0.95,0.52,0.95
f1-score,0.98,0.06,0.95,0.52,0.93
support,4996.0,247.0,0.95,5243.0,5243.0



Category: other_aid


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.88,0.58,0.87,0.73,0.84
recall,0.99,0.08,0.87,0.53,0.87
f1-score,0.93,0.13,0.87,0.53,0.83
support,4551.0,692.0,0.87,5243.0,5243.0



Category: infrastructure_related


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.94,0.33,0.94,0.63,0.9
recall,1.0,0.01,0.94,0.5,0.94
f1-score,0.97,0.01,0.94,0.49,0.91
support,4907.0,336.0,0.94,5243.0,5243.0



Category: transport


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.96,0.25,0.95,0.6,0.92
recall,1.0,0.0,0.95,0.5,0.95
f1-score,0.98,0.01,0.95,0.49,0.93
support,5008.0,235.0,0.95,5243.0,5243.0



Category: buildings


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.95,0.7,0.95,0.83,0.94
recall,1.0,0.12,0.95,0.56,0.95
f1-score,0.98,0.2,0.95,0.59,0.94
support,4974.0,269.0,0.95,5243.0,5243.0



Category: electricity


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.98,0.0,0.98,0.49,0.96
recall,1.0,0.0,0.98,0.5,0.98
f1-score,0.99,0.0,0.98,0.49,0.97
support,5128.0,115.0,0.98,5243.0,5243.0



Category: tools


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.99,0.0,0.99,0.5,0.99
recall,1.0,0.0,0.99,0.5,0.99
f1-score,1.0,0.0,0.99,0.5,0.99
support,5208.0,35.0,0.99,5243.0,5243.0



Category: hospitals


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.99,0.0,0.99,0.5,0.98
recall,1.0,0.0,0.99,0.5,0.99
f1-score,1.0,0.0,0.99,0.5,0.99
support,5191.0,52.0,0.99,5243.0,5243.0



Category: shops


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,1.0,0.0,1.0,0.5,0.99
recall,1.0,0.0,1.0,0.5,1.0
f1-score,1.0,0.0,1.0,0.5,0.99
support,5218.0,25.0,1.0,5243.0,5243.0



Category: aid_centers


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.99,0.0,0.99,0.49,0.98
recall,1.0,0.0,0.99,0.5,0.99
f1-score,0.99,0.0,0.99,0.5,0.98
support,5179.0,64.0,0.99,5243.0,5243.0



Category: other_infrastructure


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.96,0.0,0.96,0.48,0.92
recall,1.0,0.0,0.96,0.5,0.96
f1-score,0.98,0.0,0.96,0.49,0.94
support,5018.0,225.0,0.96,5243.0,5243.0



Category: weather_related


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.83,0.83,0.83,0.83,0.83
recall,0.96,0.49,0.83,0.72,0.83
f1-score,0.89,0.61,0.83,0.75,0.81
support,3771.0,1472.0,0.83,5243.0,5243.0



Category: floods


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.94,0.89,0.94,0.92,0.94
recall,1.0,0.35,0.94,0.67,0.94
f1-score,0.97,0.5,0.94,0.74,0.93
support,4812.0,431.0,0.94,5243.0,5243.0



Category: storm


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.94,0.72,0.93,0.83,0.92
recall,0.99,0.36,0.93,0.67,0.93
f1-score,0.96,0.48,0.93,0.72,0.92
support,4764.0,479.0,0.93,5243.0,5243.0



Category: fire


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.99,0.0,0.99,0.49,0.98
recall,1.0,0.0,0.99,0.5,0.99
f1-score,0.99,0.0,0.99,0.5,0.98
support,5190.0,53.0,0.99,5243.0,5243.0



Category: earthquake


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.96,0.89,0.96,0.93,0.96
recall,0.99,0.67,0.96,0.83,0.96
f1-score,0.98,0.76,0.96,0.87,0.96
support,4728.0,515.0,0.96,5243.0,5243.0



Category: cold


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.98,0.0,0.98,0.49,0.96
recall,1.0,0.0,0.98,0.5,0.98
f1-score,0.99,0.0,0.98,0.49,0.97
support,5139.0,104.0,0.98,5243.0,5243.0



Category: other_weather


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.95,0.29,0.95,0.62,0.92
recall,1.0,0.01,0.95,0.5,0.95
f1-score,0.97,0.01,0.95,0.49,0.92
support,4976.0,267.0,0.95,5243.0,5243.0



Category: direct_report


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.87,0.76,0.86,0.82,0.85
recall,0.97,0.42,0.86,0.69,0.86
f1-score,0.92,0.54,0.86,0.73,0.85
support,4233.0,1010.0,0.86,5243.0,5243.0


### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and pipelines, there is no minimum performance metric needed to pass. However, make sure to fine-tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [30]:

def display_model_metrics(Y_test, Y_pred, category_names):
    """
    Display comprehensive metrics for model evaluation

    Shows one table with the accuracy of every category, followed by a
    styled classification report per category.

    Parameters:
    Y_test - DataFrame with true labels, one column per category
    Y_pred - Array with predictions (can be 1D or 2D)
    category_names - List of category names, in column order

    Returns:
    None. Everything is printed or rendered with ``display``.
    """
    if len(Y_pred.shape) == 1:
        # A single prediction vector can only describe one category
        if len(category_names) > 1:
            print("Warning: y_pred is 1-dimensional but multiple categories provided.")
            print("Evaluating only the first category.")
            category_names = [category_names[0]]
            Y_test = Y_test.iloc[:, 0:1]
        # Always promote to a single column so the [:, idx] indexing below
        # works, including when exactly one category was passed.
        Y_pred = Y_pred.reshape(-1, 1)

    # Overall accuracy for each category
    accuracies = {
        category: accuracy_score(Y_test.iloc[:, idx], Y_pred[:, idx])
        for idx, category in enumerate(category_names)
    }

    # Create accuracy DataFrame
    acc_df = pd.DataFrame.from_dict(accuracies, orient='index', columns=['Accuracy'])
    print("\nOverall Accuracy for Each Category:")
    display(acc_df.style.background_gradient(cmap='YlOrBr')
           .format(precision=3)
           .set_caption('Model Accuracy by Category'))

    # Detailed metrics for each category
    print("\nDetailed Classification Metrics by Category:")
    for idx, category in enumerate(category_names):
        Y_true = Y_test.iloc[:, idx]
        Y_pred_cat = Y_pred[:, idx]

        report = classification_report(Y_true, Y_pred_cat, output_dict=True, zero_division=0)
        df_report = pd.DataFrame(report).round(3)

        print(f'\n{"="*20}')
        print(f'Category: {category}')
        print(f'{"="*20}')

        styled_df = df_report.style\
            .background_gradient(cmap='YlOrBr')\
            .format(precision=3)\
            .set_table_styles([
                {'selector': 'th', 'props': [('background-color', '#eee'),
                                          ('color', '#000'),
                                          ('font-weight', 'bold')]},
                {'selector': '', 'props': [('border', '2px solid #333')]},
                {'selector': 'tbody td', 'props': [('border', '1px solid #666')]}
            ])

        display(styled_df)

# Refit pipeline that used the winning parameters, and its test predictions
best_model = cv_pipeline.best_estimator_
Y_pred = best_model.predict(X_test)

print("Best Model Evaluation Metrics:")
print("=" * 40)
print("\nBest Parameters found by GridSearchCV:")
print(cv_pipeline.best_params_)

# Per-category accuracy table and classification reports
display_model_metrics(Y_test, Y_pred, category_names)

print("\nAverage Metrics Across All Categories:")
print("=" * 40)

# Each metric is computed per category column and then averaged; precision
# and recall use support-weighted averaging within a category.
avg_metrics = {
    label: np.mean([
        scorer(Y_test.iloc[:, i], Y_pred[:, i], **options)
        for i in range(len(category_names))
    ])
    for label, scorer, options in (
        ('Avg Accuracy', accuracy_score, {}),
        ('Avg Precision', precision_score, {'average': 'weighted', 'zero_division': 0}),
        ('Avg Recall', recall_score, {'average': 'weighted', 'zero_division': 0}),
    )
}

avg_df = pd.DataFrame([avg_metrics])
display(
    avg_df.style
    .format(precision=3)
    .background_gradient(cmap='YlOrBr')
    .set_caption('Average Metrics Across All Categories')
)

Best Model Evaluation Metrics:

Best Parameters found by GridSearchCV:
{'cvect__max_features': 100, 'cvect__ngram_range': (1, 1)}

Overall Accuracy for Each Category:


Unnamed: 0,Accuracy
related,0.791
request,0.896
offer,0.995
aid_related,0.726
medical_help,0.917
medical_products,0.947
search_and_rescue,0.974
security,0.982
military,0.969
child_alone,1.0



Detailed Classification Metrics by Category:

Category: related


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.627,0.812,0.791,0.719,0.768
recall,0.295,0.945,0.791,0.62,0.791
f1-score,0.401,0.873,0.791,0.637,0.761
support,1245.0,3998.0,0.791,5243.0,5243.0



Category: request


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.906,0.815,0.896,0.86,0.89
recall,0.977,0.503,0.896,0.74,0.896
f1-score,0.94,0.622,0.896,0.781,0.886
support,4352.0,891.0,0.896,5243.0,5243.0



Category: offer


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.995,0.0,0.995,0.498,0.991
recall,1.0,0.0,0.995,0.5,0.995
f1-score,0.998,0.0,0.995,0.499,0.993
support,5219.0,24.0,0.995,5243.0,5243.0



Category: aid_related


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.731,0.715,0.726,0.723,0.724
recall,0.843,0.558,0.726,0.701,0.726
f1-score,0.783,0.627,0.726,0.705,0.718
support,3079.0,2164.0,0.726,5243.0,5243.0



Category: medical_help


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.918,0.417,0.917,0.667,0.876
recall,0.999,0.011,0.917,0.505,0.917
f1-score,0.956,0.022,0.917,0.489,0.879
support,4808.0,435.0,0.917,5243.0,5243.0



Category: medical_products


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.948,0.6,0.947,0.774,0.929
recall,0.999,0.022,0.947,0.51,0.947
f1-score,0.973,0.042,0.947,0.507,0.923
support,4964.0,279.0,0.947,5243.0,5243.0



Category: search_and_rescue


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.974,0.0,0.974,0.487,0.949
recall,1.0,0.0,0.974,0.5,0.974
f1-score,0.987,0.0,0.974,0.493,0.961
support,5107.0,136.0,0.974,5243.0,5243.0



Category: security


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.982,0.5,0.982,0.741,0.973
recall,1.0,0.01,0.982,0.505,0.982
f1-score,0.991,0.02,0.982,0.506,0.973
support,5147.0,96.0,0.982,5243.0,5243.0



Category: military


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.97,0.0,0.969,0.485,0.941
recall,1.0,0.0,0.969,0.5,0.969
f1-score,0.985,0.0,0.969,0.492,0.955
support,5085.0,158.0,0.969,5243.0,5243.0



Category: child_alone


Unnamed: 0,0,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0
support,5243.0,1.0,5243.0,5243.0



Category: water


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.965,0.81,0.959,0.888,0.955
recall,0.992,0.472,0.959,0.732,0.959
f1-score,0.979,0.596,0.959,0.787,0.954
support,4908.0,335.0,0.959,5243.0,5243.0



Category: food


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.96,0.833,0.949,0.897,0.946
recall,0.983,0.676,0.949,0.83,0.949
f1-score,0.972,0.747,0.949,0.859,0.947
support,4659.0,584.0,0.949,5243.0,5243.0



Category: shelter


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.94,0.832,0.936,0.886,0.931
recall,0.993,0.359,0.936,0.676,0.936
f1-score,0.966,0.501,0.936,0.734,0.925
support,4775.0,468.0,0.936,5243.0,5243.0



Category: clothing


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.987,0.0,0.987,0.493,0.973
recall,1.0,0.0,0.987,0.5,0.987
f1-score,0.993,0.0,0.987,0.497,0.98
support,5173.0,70.0,0.987,5243.0,5243.0



Category: money


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.979,0.667,0.979,0.823,0.973
recall,1.0,0.036,0.979,0.518,0.979
f1-score,0.989,0.068,0.979,0.529,0.97
support,5131.0,112.0,0.979,5243.0,5243.0



Category: missing_people


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.988,0.0,0.988,0.494,0.976
recall,1.0,0.0,0.988,0.5,0.988
f1-score,0.994,0.0,0.988,0.497,0.982
support,5180.0,63.0,0.988,5243.0,5243.0



Category: refugees


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.968,0.0,0.967,0.484,0.936
recall,1.0,0.0,0.967,0.5,0.967
f1-score,0.983,0.0,0.967,0.492,0.952
support,5073.0,170.0,0.967,5243.0,5243.0



Category: death


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.954,0.727,0.954,0.841,0.944
recall,0.999,0.032,0.954,0.516,0.954
f1-score,0.976,0.062,0.954,0.519,0.933
support,4996.0,247.0,0.954,5243.0,5243.0



Category: other_aid


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.876,0.584,0.871,0.73,0.837
recall,0.992,0.075,0.871,0.534,0.871
f1-score,0.93,0.133,0.871,0.532,0.825
support,4551.0,692.0,0.871,5243.0,5243.0



Category: infrastructure_related


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.936,0.333,0.936,0.635,0.898
recall,0.999,0.006,0.936,0.503,0.936
f1-score,0.967,0.012,0.936,0.489,0.905
support,4907.0,336.0,0.936,5243.0,5243.0



Category: transport


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.955,0.25,0.955,0.603,0.924
recall,0.999,0.004,0.955,0.502,0.955
f1-score,0.977,0.008,0.955,0.493,0.933
support,5008.0,235.0,0.955,5243.0,5243.0



Category: buildings


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.954,0.705,0.952,0.829,0.941
recall,0.997,0.115,0.952,0.556,0.952
f1-score,0.975,0.198,0.952,0.587,0.935
support,4974.0,269.0,0.952,5243.0,5243.0



Category: electricity


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.978,0.0,0.978,0.489,0.957
recall,1.0,0.0,0.978,0.5,0.978
f1-score,0.989,0.0,0.978,0.494,0.967
support,5128.0,115.0,0.978,5243.0,5243.0



Category: tools


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.993,0.0,0.993,0.497,0.987
recall,1.0,0.0,0.993,0.5,0.993
f1-score,0.997,0.0,0.993,0.498,0.99
support,5208.0,35.0,0.993,5243.0,5243.0



Category: hospitals


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.99,0.0,0.99,0.495,0.98
recall,1.0,0.0,0.99,0.5,0.99
f1-score,0.995,0.0,0.99,0.498,0.985
support,5191.0,52.0,0.99,5243.0,5243.0



Category: shops


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.995,0.0,0.995,0.498,0.99
recall,1.0,0.0,0.995,0.5,0.995
f1-score,0.998,0.0,0.995,0.499,0.993
support,5218.0,25.0,0.995,5243.0,5243.0



Category: aid_centers


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.988,0.0,0.988,0.494,0.976
recall,1.0,0.0,0.988,0.5,0.988
f1-score,0.994,0.0,0.988,0.497,0.982
support,5179.0,64.0,0.988,5243.0,5243.0



Category: other_infrastructure


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.957,0.0,0.957,0.479,0.916
recall,1.0,0.0,0.957,0.5,0.957
f1-score,0.978,0.0,0.957,0.489,0.936
support,5018.0,225.0,0.957,5243.0,5243.0



Category: weather_related


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.828,0.826,0.828,0.827,0.827
recall,0.96,0.488,0.828,0.724,0.828
f1-score,0.889,0.614,0.828,0.752,0.812
support,3771.0,1472.0,0.828,5243.0,5243.0



Category: floods


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.945,0.888,0.943,0.917,0.94
recall,0.996,0.35,0.943,0.673,0.943
f1-score,0.97,0.502,0.943,0.736,0.931
support,4812.0,431.0,0.943,5243.0,5243.0



Category: storm


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.939,0.717,0.928,0.828,0.918
recall,0.986,0.359,0.928,0.672,0.928
f1-score,0.962,0.478,0.928,0.72,0.917
support,4764.0,479.0,0.928,5243.0,5243.0



Category: fire


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.99,0.0,0.99,0.495,0.98
recall,1.0,0.0,0.99,0.5,0.99
f1-score,0.995,0.0,0.99,0.497,0.985
support,5190.0,53.0,0.99,5243.0,5243.0



Category: earthquake


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.965,0.889,0.959,0.927,0.957
recall,0.991,0.666,0.959,0.828,0.959
f1-score,0.978,0.761,0.959,0.869,0.956
support,4728.0,515.0,0.959,5243.0,5243.0



Category: cold


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.98,0.0,0.98,0.49,0.961
recall,1.0,0.0,0.98,0.5,0.98
f1-score,0.99,0.0,0.98,0.495,0.97
support,5139.0,104.0,0.98,5243.0,5243.0



Category: other_weather


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.949,0.286,0.949,0.618,0.916
recall,0.999,0.007,0.949,0.503,0.949
f1-score,0.974,0.015,0.949,0.494,0.925
support,4976.0,267.0,0.949,5243.0,5243.0



Category: direct_report


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.874,0.759,0.862,0.816,0.852
recall,0.968,0.417,0.862,0.693,0.862
f1-score,0.919,0.538,0.862,0.728,0.846
support,4233.0,1010.0,0.862,5243.0,5243.0



Average Metrics Across All Categories:


Unnamed: 0,Avg Accuracy,Avg Precision,Avg Recall
0,0.944,0.929,0.944


### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [37]:
# Custom classifier for columns with only one class
class SingleClassPredictor(BaseEstimator, ClassifierMixin):
    """
    Classifier that always predicts one fixed label

    Used for target columns whose training data contains a single class.

    Parameters:
    constant_value (int): Label to predict; overwritten by fit() with the
        first label of the training target
    """

    def __init__(self, constant_value=0):
        self.constant_value = constant_value

    def fit(self, X, Y):
        """Store the first label of Y as the value to predict (X is unused)."""
        # Objects exposing .iloc are read positionally; anything else is
        # indexed directly with [0].
        if hasattr(Y, 'iloc'):
            first_label = Y.iloc[0]
        else:
            first_label = Y[0]
        self.constant_value = int(first_label)
        return self

    def predict(self, X):
        """Return one copy of the stored label for every row of X."""
        n_rows = X.shape[0]
        return np.full(n_rows, self.constant_value)

# Custom tokenizer
def custom_tokenize(text):
    """
    Normalize, tokenize and lemmatize a message

    Parameters:
    text (str): Raw message text

    Returns:
    clean_tokens (list): Lower-cased, lemmatized tokens; tokens of a single
        character are dropped
    """
    # Replace all URLs in one regex pass. Substituting match-by-match with
    # str.replace could rewrite the prefix of a longer URL that starts with a
    # shorter one and leave the remainder behind as stray tokens.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    text = re.sub(url_pattern, "urlplaceholder", text)

    # Lower-case, then blank out everything that is not an ASCII letter or digit
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower())

    # Only alphanumerics and spaces are left, so splitting on whitespace
    # yields exactly the word tokens
    tokens = text.split()

    # Lemmatize; the length filter is applied to the token before lemmatization
    lemmatizer = WordNetLemmatizer()
    clean_tokens = [lemmatizer.lemmatize(token) for token in tokens if len(token) > 1]

    return clean_tokens

# Function to display comprehensive metrics for model evaluation
def display_model_metrics(Y_test, Y_pred, category_names):
    """
    Display comprehensive metrics for model evaluation

    Prints and renders a per-category accuracy table followed by one
    classification report per category. Relies on display() being available,
    as it is in a Jupyter session.

    Parameters:
    Y_test (pandas.DataFrame): True labels, one column per category
    Y_pred (array-like): Predictions, either 1D (a single category) or
        2D with one column per category
    category_names (list): List of category names, in column order

    Returns:
    None
    """
    # Columns are sliced positionally below, which requires an ndarray
    Y_pred = np.asarray(Y_pred)

    if Y_pred.ndim == 1:
        # A 1D prediction can only describe one category
        if len(category_names) > 1:
            print("Warning: y_pred is 1-dimensional but multiple categories provided.")
            print("Evaluating only the first category.")
            category_names = [category_names[0]]
            Y_test = Y_test.iloc[:, 0:1]
        # Reshape every 1D input (also when exactly one category was given),
        # otherwise the [:, idx] indexing below raises IndexError
        Y_pred = Y_pred.reshape(-1, 1)

    # Overall accuracy for each category
    accuracies = {
        category: accuracy_score(Y_test.iloc[:, idx], Y_pred[:, idx])
        for idx, category in enumerate(category_names)
    }

    # Create accuracy DataFrame
    acc_df = pd.DataFrame.from_dict(accuracies, orient='index', columns=['Accuracy'])
    print("\nOverall Accuracy for Each Category:")
    display(acc_df.style.background_gradient(cmap='YlOrBr')
           .format(precision=3)
           .set_caption('Model Accuracy by Category'))

    # Detailed metrics for each category
    print("\nDetailed Classification Metrics by Category:")
    for idx, category in enumerate(category_names):
        Y_true = Y_test.iloc[:, idx]
        Y_pred_cat = Y_pred[:, idx]

        # zero_division=0 reports 0 instead of warning when a class is never predicted
        report = classification_report(Y_true, Y_pred_cat, output_dict=True, zero_division=0)
        df_report = pd.DataFrame(report).round(3)

        print(f'\n{"="*20}')
        print(f'Category: {category}')
        print(f'{"="*20}')

        styled_df = df_report.style\
            .background_gradient(cmap='YlOrBr')\
            .format(precision=3)\
            .set_table_styles([
                {'selector': 'th', 'props': [('background-color', '#eee'),
                                          ('color', '#000'),
                                          ('font-weight', 'bold')]},
                {'selector': '', 'props': [('border', '2px solid #333')]},
                {'selector': 'tbody td', 'props': [('border', '1px solid #666')]}
            ])

        display(styled_df)

# Fixed Gradient Boosting model
def build_fixed_gradient_boosting_model(X_train, X_test, Y_train, Y_test, category_names, *,
                                        max_features=200, n_estimators=100, learning_rate=0.1,
                                        max_depth=3, subsample=0.8, random_state=42):
    """
    Build model with special handling for single-class columns and enhanced metrics

    Fits a CountVectorizer + TF-IDF feature pipeline on the training messages,
    then trains one classifier per category: a GradientBoostingClassifier, or a
    SingleClassPredictor when the training column contains only one class.
    Evaluation tables are printed/rendered for the test split.

    Parameters:
    X_train (pandas.Series): Training messages
    X_test (pandas.Series): Test messages
    Y_train (pandas.DataFrame): Training labels, one column per category
    Y_test (pandas.DataFrame): Test labels, one column per category
    category_names (list): Category names, in the column order of Y_train/Y_test
    max_features (int, optional): Vocabulary size cap for the CountVectorizer
    n_estimators (int, optional): Boosting stages per GradientBoostingClassifier
    learning_rate (float, optional): Learning rate per GradientBoostingClassifier
    max_depth (int, optional): Tree depth per GradientBoostingClassifier
    subsample (float, optional): Sample fraction per boosting stage
    random_state (int, optional): Seed passed to each GradientBoostingClassifier

    Returns:
    classifiers (dict): Fitted classifier for each category name
    feature_pipeline (sklearn.pipeline.Pipeline): Fitted feature extraction pipeline
    """
    # Create basic pipeline for feature extraction
    feature_pipeline = Pipeline([
        ('cvect', CountVectorizer(
            tokenizer=custom_tokenize,
            token_pattern=None,  # unused when a custom tokenizer is supplied
            min_df=2,
            max_features=max_features
        )),
        ('tfidf', TfidfTransformer(use_idf=True))
    ])

    # Transform the features; the pipeline is fitted on the training split only
    print("Extracting text features...")
    X_train_features = feature_pipeline.fit_transform(X_train)
    X_test_features = feature_pipeline.transform(X_test)

    # Train a separate classifier for each category
    print("Training individual classifiers for each category...")
    classifiers = {}
    Y_pred_all = np.zeros((X_test_features.shape[0], len(category_names)))

    for idx, category in enumerate(category_names):
        # Get target for this category
        Y_cat = Y_train.iloc[:, idx]
        unique_classes = Y_cat.unique()

        if len(unique_classes) == 1:
            # Fall back to a predictor that always returns the one observed class
            print(f"Category '{category}' has only one class. Using constant predictor.")
            clf = SingleClassPredictor(constant_value=unique_classes[0])
        else:
            print(f"Training GradientBoostingClassifier for '{category}'...")
            clf = GradientBoostingClassifier(
                n_estimators=n_estimators,
                learning_rate=learning_rate,
                max_depth=max_depth,
                subsample=subsample,
                random_state=random_state,
                verbose=0
            )

        clf.fit(X_train_features, Y_cat)
        Y_pred_all[:, idx] = clf.predict(X_test_features)
        classifiers[category] = clf

    # Model parameters, reported from the values actually used
    print("\nModel Configuration:")
    print("=" * 40)
    print("Features: CountVectorizer with custom tokenizer + TF-IDF")
    print("Max Features:", max_features)
    print(f"Classifier: GradientBoostingClassifier (n_estimators={n_estimators}, "
          f"learning_rate={learning_rate}, max_depth={max_depth})")

    print("\nModel Evaluation Metrics:")
    print("=" * 40)

    # Display detailed per-category metrics
    display_model_metrics(Y_test, Y_pred_all, category_names)

    # Calculate average metrics across all categories
    print("\nAverage Metrics Across All Categories:")
    print("=" * 40)
    per_category = [
        (Y_test.iloc[:, idx], Y_pred_all[:, idx])
        for idx in range(len(category_names))
    ]
    weighted = {'average': 'weighted', 'zero_division': 0}
    avg_metrics = {
        'Avg Accuracy': np.mean([accuracy_score(y_true, y_pred)
                                 for y_true, y_pred in per_category]),
        'Avg Precision': np.mean([precision_score(y_true, y_pred, **weighted)
                                  for y_true, y_pred in per_category]),
        'Avg Recall': np.mean([recall_score(y_true, y_pred, **weighted)
                               for y_true, y_pred in per_category]),
        'Avg F1 Score': np.mean([f1_score(y_true, y_pred, **weighted)
                                 for y_true, y_pred in per_category])
    }

    avg_df = pd.DataFrame([avg_metrics])
    display(avg_df.style.format(precision=3)
           .background_gradient(cmap='YlOrBr')
           .set_caption('Average Metrics Across All Categories'))

    return classifiers, feature_pipeline

In [38]:
# Train one classifier per category on the training split and keep both the
# fitted classifiers and the fitted feature pipeline for later cells.
classifiers, feature_pipeline = build_fixed_gradient_boosting_model(X_train, X_test, Y_train, Y_test, category_names)

Extracting text features...
Training individual classifiers for each category...
Training GradientBoostingClassifier for 'related'...
Training GradientBoostingClassifier for 'request'...
Training GradientBoostingClassifier for 'offer'...
Training GradientBoostingClassifier for 'aid_related'...
Training GradientBoostingClassifier for 'medical_help'...
Training GradientBoostingClassifier for 'medical_products'...
Training GradientBoostingClassifier for 'search_and_rescue'...
Training GradientBoostingClassifier for 'security'...
Training GradientBoostingClassifier for 'military'...
Category 'child_alone' has only one class. Using constant predictor.
Training GradientBoostingClassifier for 'water'...
Training GradientBoostingClassifier for 'food'...
Training GradientBoostingClassifier for 'shelter'...
Training GradientBoostingClassifier for 'clothing'...
Training GradientBoostingClassifier for 'money'...
Training GradientBoostingClassifier for 'missing_people'...
Training GradientBoostingC

Unnamed: 0,Accuracy
related,0.799
request,0.895
offer,0.994
aid_related,0.737
medical_help,0.92
medical_products,0.947
search_and_rescue,0.971
security,0.973
military,0.966
child_alone,1.0



Detailed Classification Metrics by Category:

Category: related


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.732,0.805,0.799,0.768,0.787
recall,0.242,0.972,0.799,0.607,0.799
f1-score,0.364,0.881,0.799,0.622,0.758
support,1245.0,3998.0,0.799,5243.0,5243.0



Category: request


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.903,0.823,0.895,0.863,0.889
recall,0.979,0.486,0.895,0.732,0.895
f1-score,0.939,0.611,0.895,0.775,0.883
support,4352.0,891.0,0.895,5243.0,5243.0



Category: offer


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.995,0.0,0.994,0.498,0.991
recall,0.998,0.0,0.994,0.499,0.994
f1-score,0.997,0.0,0.994,0.498,0.992
support,5219.0,24.0,0.994,5243.0,5243.0



Category: aid_related


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.728,0.759,0.737,0.743,0.741
recall,0.881,0.531,0.737,0.706,0.737
f1-score,0.797,0.625,0.737,0.711,0.726
support,3079.0,2164.0,0.737,5243.0,5243.0



Category: medical_help


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.927,0.579,0.92,0.753,0.899
recall,0.991,0.143,0.92,0.567,0.92
f1-score,0.958,0.229,0.92,0.593,0.897
support,4808.0,435.0,0.92,5243.0,5243.0



Category: medical_products


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.951,0.5,0.947,0.725,0.927
recall,0.995,0.082,0.947,0.539,0.947
f1-score,0.973,0.142,0.947,0.557,0.928
support,4964.0,279.0,0.947,5243.0,5243.0



Category: search_and_rescue


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.975,0.13,0.971,0.552,0.953
recall,0.996,0.022,0.971,0.509,0.971
f1-score,0.985,0.038,0.971,0.511,0.961
support,5107.0,136.0,0.971,5243.0,5243.0



Category: security


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.982,0.056,0.973,0.519,0.965
recall,0.99,0.031,0.973,0.511,0.973
f1-score,0.986,0.04,0.973,0.513,0.969
support,5147.0,96.0,0.973,5243.0,5243.0



Category: military


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.97,0.077,0.966,0.524,0.943
recall,0.995,0.013,0.966,0.504,0.966
f1-score,0.983,0.022,0.966,0.502,0.954
support,5085.0,158.0,0.966,5243.0,5243.0



Category: child_alone


Unnamed: 0,0,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0
support,5243.0,1.0,5243.0,5243.0



Category: water


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.974,0.782,0.965,0.878,0.962
recall,0.988,0.621,0.965,0.805,0.965
f1-score,0.981,0.692,0.965,0.837,0.963
support,4908.0,335.0,0.965,5243.0,5243.0



Category: food


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.964,0.809,0.949,0.886,0.947
recall,0.979,0.709,0.949,0.844,0.949
f1-score,0.971,0.755,0.949,0.863,0.947
support,4659.0,584.0,0.949,5243.0,5243.0



Category: shelter


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.955,0.8,0.946,0.877,0.941
recall,0.987,0.521,0.946,0.754,0.946
f1-score,0.971,0.631,0.946,0.801,0.94
support,4775.0,468.0,0.946,5243.0,5243.0



Category: clothing


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.987,0.111,0.981,0.549,0.976
recall,0.994,0.057,0.981,0.525,0.981
f1-score,0.991,0.075,0.981,0.533,0.978
support,5173.0,70.0,0.981,5243.0,5243.0



Category: money


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.98,0.191,0.973,0.586,0.963
recall,0.993,0.08,0.973,0.536,0.973
f1-score,0.986,0.113,0.973,0.55,0.968
support,5131.0,112.0,0.973,5243.0,5243.0



Category: missing_people


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.988,0.0,0.98,0.494,0.976
recall,0.992,0.0,0.98,0.496,0.98
f1-score,0.99,0.0,0.98,0.495,0.978
support,5180.0,63.0,0.98,5243.0,5243.0



Category: refugees


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.969,0.278,0.965,0.624,0.947
recall,0.995,0.059,0.965,0.527,0.965
f1-score,0.982,0.097,0.965,0.539,0.953
support,5073.0,170.0,0.965,5243.0,5243.0



Category: death


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.957,0.585,0.954,0.771,0.94
recall,0.997,0.097,0.954,0.547,0.954
f1-score,0.976,0.167,0.954,0.572,0.938
support,4996.0,247.0,0.954,5243.0,5243.0



Category: other_aid


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.88,0.621,0.874,0.75,0.846
recall,0.99,0.111,0.874,0.55,0.874
f1-score,0.932,0.189,0.874,0.56,0.833
support,4551.0,692.0,0.874,5243.0,5243.0



Category: infrastructure_related


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.939,0.472,0.936,0.705,0.909
recall,0.996,0.051,0.936,0.523,0.936
f1-score,0.967,0.091,0.936,0.529,0.91
support,4907.0,336.0,0.936,5243.0,5243.0



Category: transport


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.963,0.581,0.957,0.772,0.946
recall,0.994,0.183,0.957,0.588,0.957
f1-score,0.978,0.278,0.957,0.628,0.947
support,5008.0,235.0,0.957,5243.0,5243.0



Category: buildings


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.961,0.66,0.955,0.811,0.946
recall,0.993,0.26,0.955,0.626,0.955
f1-score,0.977,0.373,0.955,0.675,0.946
support,4974.0,269.0,0.955,5243.0,5243.0



Category: electricity


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.981,0.483,0.978,0.732,0.97
recall,0.997,0.122,0.978,0.559,0.978
f1-score,0.989,0.194,0.978,0.592,0.971
support,5128.0,115.0,0.978,5243.0,5243.0



Category: tools


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.993,0.0,0.989,0.497,0.987
recall,0.995,0.0,0.989,0.498,0.989
f1-score,0.994,0.0,0.989,0.497,0.988
support,5208.0,35.0,0.989,5243.0,5243.0



Category: hospitals


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.991,0.182,0.986,0.586,0.983
recall,0.995,0.115,0.986,0.555,0.986
f1-score,0.993,0.141,0.986,0.567,0.985
support,5191.0,52.0,0.986,5243.0,5243.0



Category: shops


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.996,0.065,0.99,0.53,0.991
recall,0.994,0.08,0.99,0.537,0.99
f1-score,0.995,0.071,0.99,0.533,0.991
support,5218.0,25.0,0.99,5243.0,5243.0



Category: aid_centers


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.988,0.032,0.982,0.51,0.976
recall,0.994,0.016,0.982,0.505,0.982
f1-score,0.991,0.021,0.982,0.506,0.979
support,5179.0,64.0,0.982,5243.0,5243.0



Category: other_infrastructure


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.958,0.24,0.955,0.599,0.927
recall,0.996,0.027,0.955,0.511,0.955
f1-score,0.977,0.048,0.955,0.512,0.937
support,5018.0,225.0,0.955,5243.0,5243.0



Category: weather_related


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.838,0.869,0.843,0.854,0.847
recall,0.97,0.52,0.843,0.745,0.843
f1-score,0.899,0.651,0.843,0.775,0.829
support,3771.0,1472.0,0.843,5243.0,5243.0



Category: floods


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.948,0.856,0.944,0.902,0.94
recall,0.994,0.385,0.944,0.69,0.944
f1-score,0.97,0.531,0.944,0.751,0.934
support,4812.0,431.0,0.944,5243.0,5243.0



Category: storm


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.952,0.762,0.941,0.857,0.935
recall,0.984,0.509,0.941,0.747,0.941
f1-score,0.968,0.611,0.941,0.789,0.935
support,4764.0,479.0,0.941,5243.0,5243.0



Category: fire


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.99,0.041,0.981,0.515,0.981
recall,0.991,0.038,0.981,0.514,0.981
f1-score,0.991,0.039,0.981,0.515,0.981
support,5190.0,53.0,0.981,5243.0,5243.0



Category: earthquake


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.965,0.88,0.959,0.923,0.957
recall,0.99,0.67,0.959,0.83,0.959
f1-score,0.977,0.761,0.959,0.869,0.956
support,4728.0,515.0,0.959,5243.0,5243.0



Category: cold


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.981,0.07,0.971,0.525,0.963
recall,0.99,0.038,0.971,0.514,0.971
f1-score,0.985,0.05,0.971,0.517,0.967
support,5139.0,104.0,0.971,5243.0,5243.0



Category: other_weather


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.95,0.312,0.948,0.631,0.917
recall,0.998,0.019,0.948,0.508,0.948
f1-score,0.973,0.035,0.948,0.504,0.925
support,4976.0,267.0,0.948,5243.0,5243.0



Category: direct_report


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.874,0.788,0.866,0.831,0.858
recall,0.974,0.413,0.866,0.693,0.866
f1-score,0.921,0.542,0.866,0.732,0.848
support,4233.0,1010.0,0.866,5243.0,5243.0



Average Metrics Across All Categories:


Unnamed: 0,Avg Accuracy,Avg Precision,Avg Recall,Avg F1 Score
0,0.944,0.934,0.944,0.933


### 9. Export your model as a pickle file

In [40]:
# Target file: <working directory>/../models/classifier.pkl
current_dir = os.getcwd()
model_path = os.path.join(current_dir, '..', 'models', 'classifier.pkl')

# Create the models folder if it is missing so the open() below can succeed
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Bundle the per-category classifiers with the feature pipeline so both are
# stored in a single pickle
model_data = dict(classifiers=classifiers, feature_pipeline=feature_pipeline)

# Serialize the bundle to disk
with open(model_path, 'wb') as f:
    pickle.dump(model_data, f)

In [42]:
# Path of the pickle to read: <working directory>/../models/classifier.pkl
current_dir = os.getcwd()
model_path = os.path.join(current_dir, '..', 'models', 'classifier.pkl')

# Test loading the model: read the pickle back and unpack its two entries
print("Loading model...")
with open(model_path, 'rb') as f:
    model_data = pickle.load(f)
classifiers = model_data['classifiers']
feature_pipeline = model_data['feature_pipeline']

# The keys of the classifiers dictionary are reported as the category list
categories = list(classifiers)
print(f"Model loaded successfully with {len(categories)} categories: {categories}")


Loading model...
Model loaded successfully with 36 categories: ['related', 'request', 'offer', 'aid_related', 'medical_help', 'medical_products', 'search_and_rescue', 'security', 'military', 'child_alone', 'water', 'food', 'shelter', 'clothing', 'money', 'missing_people', 'refugees', 'death', 'other_aid', 'infrastructure_related', 'transport', 'buildings', 'electricity', 'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure', 'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold', 'other_weather', 'direct_report']


### 10. Use this notebook to complete `train_classifier.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.

In [2]:
# Export this notebook to HTML
!jupyter nbconvert --to html "ML_Pipeline_Preparation.ipynb"

[NbConvertApp] Converting notebook ML_Pipeline_Preparation.ipynb to html
[NbConvertApp] Writing 792299 bytes to ML_Pipeline_Preparation.html
