# ML Pipeline Preparation

### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database 
- Define feature and target variables X and Y

In [1]:
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xiaorong/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /Users/xiaorong/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/xiaorong/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
# import libraries
import numpy as np
import pandas as pd
import re

from nltk.corpus import stopwords
from sqlalchemy import create_engine
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.multioutput import MultiOutputClassifier
import mlsmote

In [3]:
# load data from database
engine = create_engine('sqlite:///messages.db')
conn = engine.connect()
df = pd.read_sql('SELECT * FROM messages', con=conn)
df.head()
df.drop(['child_alone'], axis = 1, inplace = True) # This is because no sample has l for label 'child_alone', all 0's.
X = df['message']
Y = df.iloc[:, 4: ]
#X = X.iloc[:10000] # memory problems
#Y = Y.iloc[:10000, :]

### 2. Tokenization function to process text data

In [4]:
def tokenize(text):
    '''Function normalizes case, removes punctuation, stems, lemmatizes and parse a message into separate words.
    '''
    stop_words = stopwords.words("english")
    lemmatizer = WordNetLemmatizer()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    return tokens

### 3. Machine learning model
Since we are dealing with a multi-label problem, [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) is used.
All the messages are first turned into tf-idf matrix, without differentiating between training or testing part. This is necessary in current work flow. Otherwise, if we first split between training and testing data, then convert them respectively into tf-idf matrices, they will for sure have different number of columns, because the two parts are extremely likely to have different distributions of words! However, this seemed not to be an issue when I used pipeline previously. Not clear why this is the case. Need to dive deeper later. 

In [5]:
#Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, train_size=0.3) #train_size is kept small for memory reason.

vect = CountVectorizer(tokenizer=tokenize)
tfidf = TfidfTransformer()
X_counts = vect.fit_transform(X)
X_tfidf = tfidf.fit_transform(X_counts).toarray()

X_tfidf = pd.DataFrame(X_tfidf)

In [6]:
Xtrain_tfidf, Xtest_tfidf, Ytrain, Ytest = train_test_split(X_tfidf, Y)

In [45]:
#Xtrain_tfidf.index = Xtrain.index  
# Since now the data is first transformed into tfidf matrix and then split, this step is not needed any more.
# This is extremely important!! don't forget that tfidftransformer reset the index of its output. If we don't keep
# Xtrain_tfidf's index in consistancy with Xtrain/Ytrain, the relationship between the Xtrain_tfidf and Ytrain is 
# completely destroyed! They are sent to fit the classifier later.
# A direct consequence if this step is not executed is that, in the next cell, X_sub and Ysub will have different
# number of rows. This is figured out by looking at the source code of get_minority_instace()

In [7]:
# use mlsmote to generate more minority instances
X_sub, Y_sub = mlsmote.get_minority_instace(Xtrain_tfidf, Ytrain)

In [10]:
X_res,Y_res =mlsmote.MLSMOTE(X_sub, Y_sub, 8000)

In [11]:
print(Xtrain_tfidf.shape)
print(Ytrain.shape)
print(X_sub.shape, Y_sub.shape)

(19661, 31967)
(19661, 35)
(2111, 31967) (2111, 35)


In [12]:
print(X_sub.shape[0], ' minor instances found in the data.')
print(X_res.shape[0], ' minor instances generated with mlsmote.')
print(Y_res.shape[0])

2111  minor instances found in the data.
10111  minor instances generated with mlsmote.
10111


In [49]:
# MLSMOTE once more
#X_sub1, Y_sub1 = mlsmote.get_minority_instace(X_res, Y_res)
#X_res1,Y_res1 =mlsmote.MLSMOTE(X_sub1, Y_sub1, 1000)
#print(X_sub1.shape[0], ' minor instances found in the data.')
#print(X_res1.shape[0], ' minor instances generated with mlsmote.')
#print(Y_res1.shape[0])

In [51]:
#X_sub2, Y_sub2 = mlsmote.get_minority_instace(X_res1, Y_res1)
#X_res2,Y_res2 =mlsmote.MLSMOTE(X_sub2, Y_sub2, 1000)

In [13]:
#print(X_sub2.shape[0], ' minor instances found in the data.')
#print(X_res2.shape[0], ' minor instances generated with mlsmote.')
#print(Y_res2.shape[0])

In [14]:
# combine the new generated minority-class instances with the original train data
Xtrain_tfidf = Xtrain_tfidf.append(X_res)
Ytrain = Ytrain.append(Y_res)

In [15]:
# check dimensions again
print(Xtrain_tfidf.shape)
print(Ytrain.shape)

(29772, 31967)
(29772, 35)


### 4. Train the model

In [55]:
#pipeline = Pipeline([
#    ('vect', CountVectorizer(tokenizer=tokenize)),
#    ('tfidf', TfidfTransformer()),
#    ('clf', MultiOutputClassifier(estimator=RandomForestClassifier(n_estimators=10, max_depth=5, \
#                                                                   max_features=0.4)))
#])

#pipeline.get_params()
#pipeline.fit(Xtrain, Ytrain)

In [16]:
clf = MultiOutputClassifier(estimator=RandomForestClassifier(oob_score=True, n_estimators=30, \
                                                             max_depth = 8, max_features = 0.4))
clf.fit(Xtrain_tfidf, Ytrain)

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


MultiOutputClassifier(estimator=RandomForestClassifier(max_depth=8,
                                                       max_features=0.4,
                                                       n_estimators=30,
                                                       oob_score=True))

In [22]:
Xtest_tfidf.shape
Ytest.shape

(6554, 35)

In [27]:
# check performance on train data
Ytrain_pred = clf.predict(Xtrain_tfidf)
report_train = classification_report(Ytrain, Ytrain_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
print(report_train)

              precision    recall  f1-score   support

           0       0.86      0.99      0.92     25156
           1       0.89      0.43      0.58      5297
           2       1.00      0.24      0.38       231
           3       0.85      0.59      0.69     16845
           4       0.87      0.33      0.48      2598
           5       0.89      0.41      0.56      2111
           6       0.97      0.17      0.28       938
           7       0.98      0.26      0.41      1211
           8       0.88      0.33      0.48      1016
           9       0.86      0.61      0.71      2045
          10       0.81      0.77      0.79      3932
          11       0.84      0.54      0.66      3282
          12       0.95      0.72      0.82      1560
          13       1.00      0.25      0.40       593
          14       0.91      0.43      0.58       865
          15       0.87      0.36      0.51      1119
          16       0.89      0.56      0.69      1755
          17       0.95    

### 4. Test the model

In [19]:
# test on test data
Ytest_pred = clf.predict(Xtest_tfidf)

In [20]:
report = classification_report(Ytest, Ytest_pred)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
print(report)

              precision    recall  f1-score   support

           0       0.79      0.98      0.88      5048
           1       0.81      0.42      0.56      1140
           2       0.00      0.00      0.00        31
           3       0.73      0.55      0.63      2744
           4       0.65      0.19      0.30       526
           5       0.69      0.24      0.35       340
           6       0.89      0.10      0.17       177
           7       0.50      0.02      0.03       120
           8       0.56      0.14      0.23       211
           9       0.80      0.59      0.68       398
          10       0.80      0.77      0.79       753
          11       0.82      0.55      0.66       598
          12       0.77      0.64      0.70       104
          13       0.57      0.05      0.09       157
          14       0.75      0.21      0.33        72
          15       0.58      0.21      0.31       225
          16       0.71      0.47      0.56       286
          17       0.65    

*Below are some historical results that are a bit obscure but I still don't want to delete yet. Readers may stop here.*

In [44]:
r1 = classification_report(Ytest.iloc[:,0], Ypred.iloc[:,0])
r1 = classification_report(Ytest['related'], Ypred['related'])

report = classification_report(Ytest, Ypred, target_names = Ytest.columns)

print(r1)
print(report)

             precision    recall  f1-score   support

          0       0.72      0.10      0.17      1547
          1       0.78      0.99      0.87      5007

avg / total       0.77      0.78      0.71      6554

                        precision    recall  f1-score   support

               related       0.78      0.99      0.87      5007
               request       0.81      0.38      0.52      1077
                 offer       0.00      0.00      0.00        34
           aid_related       0.72      0.52      0.60      2694
          medical_help       0.57      0.15      0.24       512
      medical_products       0.78      0.22      0.34       353
     search_and_rescue       0.64      0.16      0.26       178
              security       0.00      0.00      0.00       103
              military       0.66      0.12      0.20       225
           child_alone       0.00      0.00      0.00         0
                 water       0.78      0.56      0.65       445
                



Try cross validation, which is in practice too slow to convey any useful info.

In [None]:
# Note, parameter search with CV is way too slow
parameters = {
    # determine exact parameter names from pipeline.get_params()
    'vect__min_df': [0.005, 0.01]
    'vect__max_df': [0.25, 0.5, 1.0]
    #'clf__estimator__n_estimators': [100, 200],
    #'clf__estimator__max_features': ['sqrt', 'log2'],
    #'clf__estimator__min_samples_leaf': [10,20]
}

#cv = GridSearchCV(pipeline, param_grid=parameters)
#cv.fit(Xtrain, Ytrain)

In [118]:
print(r1oob)
print(reportoob)

             precision    recall  f1-score   support

          0       0.70      0.39      0.50      1562
          1       0.83      0.95      0.89      4992

avg / total       0.80      0.81      0.79      6554

                        precision    recall  f1-score   support

               related       0.83      0.95      0.89      4992
               request       0.83      0.50      0.62      1088
                 offer       0.00      0.00      0.00        23
           aid_related       0.76      0.70      0.73      2723
          medical_help       0.65      0.07      0.12       522
      medical_products       0.72      0.11      0.18       312
     search_and_rescue       0.77      0.06      0.10       179
              security       0.00      0.00      0.00       111
              military       0.64      0.08      0.15       220
           child_alone       0.00      0.00      0.00         0
                 water       0.85      0.37      0.52       425
                

Try XGBoost.

In [21]:
# Try random forests in XGBoost
pipeline_xgb = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator=xgb.XGBClassifier(objective='binary:logistic', colsample_bynode=0.4,\
                                                             learning_rate=1, max_depth=5, num_parallel_tree=10,\
                                                             subsample=0.8, eval_metric='logloss',\
                                                             use_label_encoder=False)))
])
##ATTENTION: how does use_label_encoder affect the result?

In [22]:
pipeline_xgb.fit(Xtrain, Ytrain)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...e,
       use_label_encoder=False, validate_parameters=None, verbosity=None),
           n_jobs=1))])

In [23]:
pkl_filename = 'pickle_XGBRFmodel.pkl'
with open(pkl_filename, 'wb') as file:
    pickle.dump(pipeline_xgb, file)

In [55]:
Ypred = pipeline_xgb.predict(Xtest)

In [57]:
report = classification_report(Ytest, Ypred, target_names = Ytest.columns)
print(report)

                        precision    recall  f1-score   support

               related       0.86      0.93      0.89      5014
               request       0.75      0.60      0.67      1084
                 offer       0.00      0.00      0.00        34
           aid_related       0.76      0.70      0.73      2693
          medical_help       0.59      0.34      0.43       524
      medical_products       0.67      0.39      0.49       340
     search_and_rescue       0.57      0.22      0.32       162
              security       0.15      0.02      0.03       104
              military       0.62      0.41      0.50       205
           child_alone       0.00      0.00      0.00         0
                 water       0.77      0.67      0.71       414
                  food       0.81      0.76      0.78       693
               shelter       0.75      0.60      0.67       606
              clothing       0.76      0.46      0.57       112
                 money       0.52      

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
