In [2]:
#scratch for train_classifier.py

In [10]:
#imports
import sys
import pickle
import nltk
import pandas as pd
import numpy as np
import re
import string
import sqlite3
from sqlalchemy import create_engine

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier

from sklearn.multioutput import MultiOutputClassifier

# Fetch the NLTK resources this notebook relies on (tokenizer models, WordNet data,
# stop-word list, ...); packages that are already installed are reported as up to date.
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords', 'omw-1.4'])


[nltk_data] Downloading package punkt to /home/brendo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/brendo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/brendo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/brendo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/brendo/nltk_data...


True

In [48]:
def load_data(database_filepath):
    """Load the messages and their category labels from a SQLite database.

    The rows are read from a table whose name is identical to
    ``database_filepath`` (the same table the query has always selected from),
    so the file must contain a table with exactly that name.

    Args:
        database_filepath: Path to the SQLite database file; also used as the
            name of the table to read.

    Returns:
        A tuple ``(X, y, categories)``:
        ``X`` is the array of values from the ``message`` column,
        ``y`` is a 2-D array with one row per message and one column per
        category (in ``categories`` order), and
        ``categories`` is the list of category column names.

    Raises:
        KeyError: If one of the expected category columns is missing.
    """
    categories = ['related','request','offer','aid_related', 'medical_help', 'medical_products', 'search_and_rescue',
       'security', 'military', 'child_alone', 'water', 'food', 'shelter',
       'clothing', 'money', 'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report']

    # Quote the table name as an SQL identifier (doubling any embedded double
    # quote) so that paths containing quote characters cannot break the query.
    table_name = '"' + database_filepath.replace('"', '""') + '"'
    sqlquery = "SELECT * FROM " + table_name

    # Always release the connection, even when the query fails.
    conn = sqlite3.connect(database_filepath)
    try:
        df = pd.read_sql(sqlquery, conn)
    finally:
        conn.close()

    X = df.message.values
    # Column selection yields the (n_messages, n_categories) matrix directly.
    y = df[categories].to_numpy()
    return X, y, categories

In [53]:
# Try out load_data: unpack the message array, the label matrix and the category names.
X, y, categories = load_data("data/DisasterResponse.db")

In [84]:
# Custom tokenizer: uses NLTK to case-normalize, tokenize, and lemmatize the text.
# It is passed to the vectorizer in the machine learning pipeline, which applies TF-IDF to the text.
def tokenize(text):
    """Turn a raw message into a list of cleaned, lemmatized tokens.

    The text is lower-cased, every character that is not an ASCII letter or
    digit is replaced by a space, the result is split with NLTK's
    ``word_tokenize``, English stop words are dropped and the remaining
    tokens are lemmatized.

    Args:
        text: The message to tokenize.

    Returns:
        A list of lemmatized tokens with stop words removed.
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # would otherwise be scanned once for every token.
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    # normalize case and strip punctuation / non-alphanumeric characters
    normalized = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # stop words are filtered before lemmatizing, on the raw token
    return [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(normalized)
        if word not in stop_words
    ]


In [94]:
def build_model():
    """Build the unfitted text-classification model.

    The model is a pipeline with two steps:

    * ``vect`` - a ``TfidfVectorizer`` that uses the custom ``tokenize``
      function, fitted once on the training messages;
    * ``clf`` - a ``MultiOutputClassifier`` that trains one
      ``AdaBoostClassifier(random_state=42)`` per label column.

    Wrapping only the classifier (instead of the whole pipeline) in
    ``MultiOutputClassifier`` means the messages are tokenized and vectorized
    once rather than once per label column; each per-column classifier still
    sees the same TF-IDF features.

    No hyper-parameter search is wired in yet. With this layout a grid for
    ``GridSearchCV`` would address the boosted classifier through keys such as
    ``clf__estimator__n_estimators``.

    Returns:
        An unfitted estimator exposing ``fit(X, Y)`` and ``predict(X)``.
    """
    per_category_classifier = MultiOutputClassifier(
        AdaBoostClassifier(random_state=42),
        n_jobs=1,
    )

    return Pipeline([
        ('vect', TfidfVectorizer(tokenizer=tokenize)),
        ('clf', per_category_classifier),
    ])


In [91]:
#The script builds a pipeline that processes text and then performs multi-output classification on the 36 categories in the dataset.
#GridSearchCV is meant to be used to find the best parameters for the model.
#The TF-IDF pipeline is trained only on the training data. The F1 score, precision, and recall for the test set are output for each category.
def evaluate_model(model, X_test, Y_test, category_names):
    """Print overall accuracy plus per-category precision, recall and F1.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X_test: Inputs passed to ``model.predict``.
        Y_test: 2-D array-like of true labels, one column per category.
        category_names: Category names, in the column order of ``Y_test``.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If the predictions and ``Y_test`` differ in shape.

    Note:
        The per-category metrics treat the value 1 as the positive class.
        This assumes each label column is a 0/1 indicator - TODO confirm
        against the dataset.
    """
    Y_true = np.asarray(Y_test)
    Y_pred = np.asarray(model.predict(X_test))

    # Fail loudly instead of letting a shape mismatch surface as an obscure
    # comparison/broadcasting error further down.
    if Y_pred.shape != Y_true.shape:
        raise ValueError(
            "Prediction shape {} does not match label shape {}".format(
                Y_pred.shape, Y_true.shape
            )
        )

    labels = category_names
    # Fraction of individual (message, category) cells predicted correctly.
    accuracy = (Y_pred == Y_true).mean()

    print("Labels:", labels)
    print("Accuracy:", accuracy)

    # Pooled accuracy hides how each category fares, so report every label
    # column on its own.
    for name, pred_col, true_col in zip(labels, Y_pred.T, Y_true.T):
        predicted_pos = pred_col == 1
        actual_pos = true_col == 1
        true_pos = np.count_nonzero(predicted_pos & actual_pos)
        n_predicted = np.count_nonzero(predicted_pos)
        n_actual = np.count_nonzero(actual_pos)

        # Define a metric as 0.0 when its denominator is empty.
        precision = true_pos / n_predicted if n_predicted else 0.0
        recall = true_pos / n_actual if n_actual else 0.0
        if precision + recall:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0

        print("{}: precision={:.3f} recall={:.3f} f1={:.3f}".format(
            name, precision, recall, f1))


In [92]:
# Load the messages (X), the label matrix (Y) and the category column names for training.
X, Y, category_names = load_data("data/DisasterResponse.db")

In [96]:
# Hold out 20% of the messages for evaluation; the fixed seed makes the split
# (and therefore every reported metric) reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [97]:
# Instantiate the (still unfitted) multi-output text classification model.
model=build_model()

In [98]:
# Fit on the training split only, so the held-out test split stays unseen for evaluation.
model.fit(X_train,Y_train)