In [2]:
#scratch for train_classifier.py

In [18]:
#imports
import sys
import pickle
import nltk
import pandas as pd
import numpy as np
import re
import string
import sqlite3
from sqlalchemy import create_engine

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier

from sklearn.multioutput import MultiOutputClassifier

nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords', 'omw-1.4'])


[nltk_data] Downloading package punkt to /home/brendo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/brendo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/brendo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/brendo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/brendo/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
def load_data(database_filepath):
    # load data from database
    engine = create_engine('sqlite:///' + database_filepath)
    conn = sqlite3.connect(database_filepath)
    sqlquery = "SELECT * FROM \'" + database_filepath + "\'" 
    df = pd.read_sql(sqlquery,conn)
    
    X = df.message.values
    categories = ['related','request','offer','aid_related', 'medical_help', 'medical_products', 'search_and_rescue',
       'security', 'military', 'child_alone', 'water', 'food', 'shelter',
       'clothing', 'money', 'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report']

    y = []
    for category in categories:
        y.append(df[category].values)
    y = np.transpose(np.array(y))
    return X, y, categories

In [3]:
X, y, categories = load_data("data/DisasterResponse.db")

In [4]:
#This function needs to: use a custom tokenize function using nltk to case normalize, lemmatize, and tokenize text. 
#This function is used in the machine learning pipeline to vectorize and then apply TF-IDF to the text.
def tokenize(text):

    stop_words = stopwords.words("english")
    lemmatizer = WordNetLemmatizer()

    # normalize case and remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    # tokenize text
    tokens = word_tokenize(text)
    
    # lemmatize and remove stop words
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        
    return tokens


In [5]:
#The script builds a pipeline that processes text and then performs multi-output classification on the 36 categories in the dataset. 
#GridSearchCV is used to find the best parameters for the model.
#The TF-IDF pipeline is only trained with the training data. 
def build_model():

    pipeline = Pipeline([
        ('vect', TfidfVectorizer(tokenizer=tokenize)),
        ('clf', AdaBoostClassifier(random_state=42))
    ])
    
    '''
    parameters = {
        'clf__n_estimators' : [10,25,50],
        'clf__max_leaf_nodes' : [None, 50]
    }
    
    cv = GridSearchCV(pipeline, param_grid=parameters)
    '''
    
    multi_target_forest = MultiOutputClassifier(pipeline,n_jobs=1)
    return multi_target_forest


In [6]:
#The f1 score, precision and recall for the test set is outputted for each category.
def evaluate_model(model, X_test, Y_test, category_names):
    
    Y_pred = model.predict(X_test)
    
    labels = category_names
    #confusion_mat = confusion_matrix(Y_test, Y_pred, labels=labels)
    accuracy = (Y_pred == Y_test).mean()

    print("Labels:", labels)
    #print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)
    #print("\nBest Parameters:", cv.best_params_)


In [7]:
X, Y, category_names = load_data("data/DisasterResponse.db")

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)


In [9]:
model=build_model()

In [10]:
model.fit(X_train,Y_train)

In [12]:
Y_pred = model.predict(X_test)

In [14]:
model.score(X_test,Y_test)

0.2231121281464531

In [27]:
#score is only for full accuracy, need to score based on each category
#F1 = 2 * (precision * recall) / (precision + recall)
#Precision = True Pos / (True Pos + False Pos)
#Recall = True Pos / (True Pos + False Neg)
#compare Y_test to Y_pred by each val
#2 loops first, then vectorize if we can
#create calc function

In [67]:
#init 36 len arrays to store each
true_pos = np.zeros((36,))
false_pos = np.zeros((36,))
false_neg = np.zeros((36,))

#loop through and add up each
for array in range(len(Y_pred)):
    for entry in range(len(Y_pred[array])):
        pred_val = Y_pred[array][entry]
        true_val = Y_test[array][entry]
        
        #condition for true positive
        if pred_val == true_val:
            true_pos[entry] += 1 
        #condition for false pos
        elif (pred_val == 1) & (true_val == 0):
            false_pos[entry] += 1
        #condition for false neg
        elif (pred_val == 0) & (true_val == 1):
            false_neg[entry] += 1

#loop through and define precision and recall for each category
precision = np.zeros((36,))
recall = np.zeros((36,))

for i in range(len(precision)):
    precision[i] = true_pos[i] / (true_pos[i] + false_pos[i])
    recall[i] = true_pos[i] / (true_pos[i] + false_neg[i])

f1 = np.zeros((36,))
#calculate f1 for each
for i in range(len(f1)):
    f1[i] = 2 * (precision[i] * recall[i]) / (precision[i] + recall[i])

In [75]:
#store results in dataframe
results = pd.DataFrame()
results["Category"] = categories
results["f1"] = f1
results["precision"] = precision
results["recall"] = recall
results

Unnamed: 0,Category,f1,precision,recall
0,related,0.873392,0.786563,0.98177
1,request,0.939749,0.968737,0.912446
2,offer,0.996075,0.998273,0.993887
3,aid_related,0.857641,0.89012,0.827449
4,medical_help,0.962098,0.984806,0.940414
5,medical_products,0.979171,0.990938,0.96768
6,search_and_rescue,0.986373,0.995902,0.977025
7,security,0.990471,0.995742,0.985255
8,military,0.986569,0.992997,0.980223
9,child_alone,1.0,1.0,1.0
