In [48]:
# 02/14/2018
#Classify EDI Incidents, within Visibility, into Tier

#load data from db (file)
#data prep, EDA
#fit model
#predict (future!)

In [49]:
#list of libraries

import sys #used in error handling
import time #to track performance time
import numpy as np
import pandas as pd
import pyodbc

%matplotlib inline
import matplotlib.pyplot as plt #to plot data

#part1 - build model
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer #to tokenize dataset, aka bag of words activity
from sklearn.feature_extraction import text #to use stop_words
from sklearn.naive_bayes import MultinomialNB #to run Naive Bayes algorithm for text classification
from sklearn.pipeline import Pipeline #streamline Tokenizing + Classification model
from sklearn import metrics # to evaluate model

#part2 - grid search
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.externals import joblib #to pickle the model


In [50]:
#----------------------------------------------- set CONSTANTS and variables -----------------------------------------------
#prompt to select verbose mode to view progress details or not
while True:
    inp = input("Verbose mode: Y or N? ")
    if inp.lower() not in ('y','n'):
         print("Please enter valid verbose mode: Y or N? ")
    else:
        fn_verbose = lambda x: 0 if x=='N' else 1
        input_verbose_flg=fn_verbose(inp.upper())
        break

#----------------------------------------------- DEFINE GLOBAL FUNCTIONS -----------------------------------------------
### define functions to classify incidents

def fn_classify(x):
    if x !='Tier 1' and x !='Tier 2' and x !='Tier 3':
        return 'not'
    else:
        return x

#reclassify Y output to numeric for SGD (lambda function)
#output aka categories -> data.target equivalent
fn_classify_numeric = lambda x: 0 if x=='Tier 1' else 1 if x=='Tier 2' else 2 if x=='Tier 3' else 3

#reclassify Y numeric output to text from SGD (lambda function)
categories=np.array(['Tier 1','Tier 2','Tier 3','not'])
fn_classify_text = lambda x: categories[x] #doen's work with -1,0,1,2 list.  not becomes 0 which is incorrect
#fn_classify_text = lambda x: "not" if x==-1 else 'Billing' if x==0 else 'OrderMgmt' if x==1 else 'Transportation'


Verbose mode: Y or N? y


## EDI Incidents Queue Classification

### Abstract
This code demonstrates how to build a prediciton model for multiclass text classification.  The goal is to predict queue category (Billing, Order Management, Transportation, and not) given the text contents of the submitted EDI incidents.  

Key libraries:  
a Naive Bayes algorithm from Scikit-learn library. 


### 0. Load Data
Load data from Service Desk database



In [51]:
# --------------------------------------- Step 0. Load Data ------------------------------------------------------------------
# Load data directly from Service Desk database into pandas dataframe
# this query includes an index column, some categorical columns, some free text columns + target column
# -------------------------------------------------------------------------------------------------------------------------------

print("\nStep0: Loading data from file...")

#track performance time
t0=time.time()

data_all=pd.read_csv('dummy.xlsx',index_col=0, delimiter=",", encoding = "ISO-8859-1" , error_bad_lines=False,
                 lineterminator='\n')

print ("All data: {} ".format(data_all.shape))
data_all.head()


Step0: Loading data from file...
All data: (20, 3) 


b'Skipping line 3: expected 4 fields, saw 5\nSkipping line 18: expected 4 fields, saw 5\n'


Unnamed: 0_level_0,à¥ÇDO97*~§ÈÉ¸8ÀOíc|n¦ÑäEøÿöéºóÀBÉÀ!$}íàÈé;{ìÐå[îñé2þ,"8æø(%£¦""Dô¹Ò4jÎ0u2jsÐÊMYÞË´äúSì­´·· )fåÿ¹CÛöyÑó	I< y",ËôïfäÉÇÕß
PK,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
*&$Ú#Äú-j½_¨¬©¸´{ÍKf¾)Dm:´*{\Åô¦©ýLU5@¬D)ì}¼*Ïn¥ÒlUBØ;Ò÷v®þÃvÀôÊU%2­ZÛSö¤_ÅOp@È³ì^çàmH4ÐüN¸Yéè/YEG¬èàF# ­V+1$ï/ÑúGn!×¢äß÷ÒõX]e«T¼»Ìåy°U[þìnêY#J°ÜÃ}Lr¾Ð^Î×¬)í¹£h¶O0¦¥åZ2ËçJZÐá!®ÕÜdØóBÂ½oüg#4Æ}A¬0²,f+sÁlá5ºLPpm xè8­ÇQ2Xp³±ª¨½n?Ð'Ë\z#³ýú...,,
"ãû]Ä6KrèÒ¹3=vØP<sÚLÈÄ~&!EwI|Ù;D=CP²1Ü·\t¶Âýf""¸äj'úeÎ±¼½t°eÚ<¶ØµÍ3;:ó©Ú»StÆ{·>sXÐa3Ëç¹ÑW""`ìJ¬+ÈÎUõ`eªkÖ)r+e÷ñm°goqx(ß¤ùDÝJ]8åTzMà5åäÓ)×è0»¿IëYgzî|]p+~o³Ç`_Þ=í¾|j ö·öÍQk<a",,,
Ýþ\D«ZlîØ6FV½äÅÏ²'üwÊws[ñû:(eçD³\t÷,kzhÜÀp¬sÖyUs^ÕøÿûªfÓ^>¯eÎkóZÆõöõAj¼|Ê&ïòè...,Ü#-ãq&?'2ÚÐZCeÝÀLõTx3& c¤u+Ð­ûNóxÓNg...,
0mxåöàE½åAÚAfçc§´¼®,,,
ÎFz3©Pb/3 tSÙºqyjuiª½E¤-#t³0Ò0á,";ÍûYÆºÔ2O¹b¹r3êkE""'¸&&SÐÄ;nùµj·*#4kù...",OÔ6ømuF8=ñ®'?ÈÌZu@,ëJøúÊÜ¼Õfw<zp8§RèPBo#(úÒÈ6`ÜYß¼9'-ÿ...


### 1. Data Prep
Build a learning set

In [52]:
print("\nStep1: Creating a tidy dataset...")

# ------------------------Step 1. Data Prep-------------------------------------------------------------------------------------------
# prepare a tidy dataset
# -------------------------------------------------------------------------------------------------------------------------------
#Ready for tidy dataset
df_tidy=data_all.copy()
df_tidy.shape



Step1: Creating a tidy dataset...


(20, 3)

In [53]:
#check for duplicate records: varies by project 
print ("Found duplicates: {:d} ".format(sum(df_tidy.duplicated(keep=False))))

#if duplicates found, examine them 
duplicates = df_tidy.duplicated(keep=False)
df_tidy[duplicates].index

df_tidy.loc[data_all[duplicates].index]

#after duplicate analysis, decide what to do with them: keep or drop
#df_tidy.drop_duplicates

#No duplicates found here, based on unique index by Incident ID; 

Found duplicates: 11 


ValueError: cannot index with vector containing NA / NaN values

In [None]:
#select relevant columns: varies by project
#chose option 1: load many columns from db and filter out here
print ("Select relevant columns: ")

#alternative syntax, select columns by number
#tidy_include = [df[7:10]]
tidy_include = ['Summary','Description','Urgency','ReasonForUrgency','AffectedService', 'Tier']

df_tidy = df_tidy[tidy_include]
print (df_tidy.shape)
df_tidy.head()

In [None]:
#check for missing values
#i.e. records with neither summary nor descripton, i.e. 578445

print ("\nCheck for missing values in Description:{:d} ".format(sum(df_tidy['Description'].isnull())))
print ("Check for missing values in Summary: {:d} ".format(sum(df_tidy['Summary'].isnull())))
print ("Check for missing values in both Description and Summary: {:d} ".format(sum(df_tidy['Summary'].isnull() & df_tidy['Summary'].isnull()))) 

#filter out records with missing data
if sum(df_tidy['Description'].isnull() & df_tidy['Summary'].isnull()):
    df_tidy=df_tidy[df_tidy['Description'].notnull() | df_tidy['Summary'].notnull()]
    print ("Filtered data -> removed records with missing data:{} ".format(df_tidy.shape))
    
#plot distribution of categories
df_tidy['Tier'].value_counts().plot.bar()

In [None]:
print ("Perform feature engineering: concatentate text columns")
#after checking for missing data, concatenate relevant fields (Description, Summary, etc) into one text field

#doesn't work when no 'Description'
#data['all_text'] = data['Summary'].str.cat(data['Description'], sep='__HERE__')

df_tidy['all_text'] = df_tidy['Summary'].fillna('') + df_tidy['Description'].fillna('') + df_tidy['Urgency'].fillna('')+ df_tidy['ReasonForUrgency'].fillna('')
    
print ("Total records after concatenation:{} ".format(df_tidy.shape))

#### Dependent Variable, aka Target
Choose a target variable and format it to serve as a classifying "label"

In [None]:
#ensure target variable is categorical
df_tidy['Tier']=df_tidy['Tier'].astype('category')
df_tidy.dtypes

In [None]:
#examine target variable

#how many categories total?
print ("Number of categories: {:d} ".format(len(df_tidy['Tier'].cat.categories)))
print (df_tidy['Tier'].cat.categories) 
#alternative way to count categories
#df_tidy_grpByCategory = df_tidy.groupby(['AffectedService']).size()
#print "Number of categories: ", df_tidy_grpByCategory.shape[0]


### Summary: Data Prep Summary


decide to include some columns but not others
determine what to do with missing values, i.e. nulls and NAs
perform feature engineering, i.e. combine all text fields into one
define final class "labels", i.e. focus on most occuring categories

Tidy data ->
- verified no duplicates
- selected relevant columns: 'Summary','Description','Urgency','ReasonForUrgency','AffectedService'
- dropped records with missing data
- performed feature engineering: concatenated text fields into 'all_text' column
- defined a target variable: 'AffectedService' column 

Ready for next step: where tidy data will be transformed into format ready for ML: X (matrix) and y (vector)


In [None]:
# look at all incidents by category  
print ("Total EDI incidents: {}".format(df_tidy.shape[0]))
print ("Number of categories: {:d}".format(len(df_tidy['Tier'].cat.categories)))
print ("Categories: {}".format(df_tidy['Tier'].cat.categories))


### Ready to proceed to ML!

### 1c. Prep Data for ML
Assemble Data into ML Expected Format.  Scikit-learn expects a Numpy array-like structure. Transform the tidy dataset to a structure acceptable by algorithm: 
- input features X(matrix) and 
- target variable y(vector). 

X - column 'all_text'  
y - column 'AffectedService', reformatted with values  'Billing', 'OrderMgmt', 'Transportation', and 'not'.

Split data into train and test sets.

In [None]:
# --------------------------------------- Step 1c. Prep Data for ML ----------------------S----------------------------------------
print ("\nStep1c: Prepping data for ML...")
start = time.time()

columns_selected=['all_text','Tier']
data=df_tidy[columns_selected]

#rename columns to fit into ML text classification
data.columns = ['text', 'class']
data['class_multi']=data['class'].apply(fn_classify)
print ("Filtered data: {}".format(data.shape))

#create data.data and data.target equivalents without converting to a list
X=data.text
y=data.class_multi

In [None]:
### Train/Test Split
# Randomly split data into two groups: a training set and a validation set
# It's important not to touch the test set when building a classifier. 
#Therefore, we separate X&y into two sets: for training the model and for testing the model accuracy.  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30)

#done - print summary 
#sptodo: print "Done in %0.3fs" % (end - start)    
if input_verbose_flg ==1:
    print ("\nTotal records and fields in learning set: {}".format(data.shape[0]))
    print ("Total files in training set: {}".format(len(X_train)))
    print ("Total files in test set: {}".format(len(X_test)))

In [None]:
# Look for all dataframes
alldfs = [var for var in dir() if isinstance(eval(var), pd.core.frame.DataFrame)]
print(alldfs) # df1, df2


# RELEASE MEMORY
lst = [df,df_tidy]
del df
del df_tidy
del lst   

### 2. Classifying Incidents - Train Model

In [None]:
# ----------------------------------- Step 2. Classifying Incidents - Train Model -----------------------------------------------
# algorithm: MultinomialNB or SGDClassifier
# do grid search to find best hyper-parameters (based on SGD example) 
# note: MultinomialNB takes categories as is, no need to convert to int)
# note: SGDClassifier can't process categories as is, an extra setp to to convert to int)

print ("\nParameters for training a classifier model...")

# prompt to select a classifier
while True:
    inp = input("Select learning algorithm: (NB)-Multinomial Naive Bayes or (SGD)-Stochastic Gradient Descent: ")
    if inp.lower() not in ('nb','sgd'):
         print ("Please select a valid algorithm: (NB)-Multinomial Naive Bayes or (SGD)-Stochastic Gradient Descent ")
    else:
        input_algorithm=inp.upper()
        break
        
# prompt to select unigrams or +bigrams
while True:
    try:
        input_ngram = int(input("Select (1)-unigrams only or (2)-unigrams and bigrams: ")) 
    except ValueError:
        print ("Sorry I didn't understand that")
        continue
    else:
        #file size successfully parsed
        if input_ngram < 1 or input_ngram > 2:
            print ("Please select a valid n-gram range: (1)-unigrams only or (2)-unigrams and bigrams")
        else:
            break

# prepare hyper-parameters for grid search
# Our classifier has a few hyper-parameters. The two most important are:

# The alpha keyword in the Bayesian classifier is a "smoothing parameter" -- increasing the value decreases the sensitivity to any single feature, and tends to pull prediction probabilities closer to 50%.
alphas = (.001, 0.0001,0.00001) #(.001, .01, .1, 1) #sptodo try smaller (0.00001, 0.000001) #was  (.001, .01, .1, 1, 5, 10)
# The min_df keyword in CountVectorizer, which will ignore words which appear in fewer than min_df fraction of reviews. Words that appear only once or twice can lead to overfitting, since words which occur only a few times might correlate very well with document classes by chance in the training dataset.
min_dfs = (1e-5, 1e-4)
max_dfs = (0.7, 0.9)
# ngrams - unigrams only or unigrams and bigrams
if input_ngram==1:
    ngram_range=[(1,1)]
elif input_ngram==2:
    ngram_range=[(1, 1), (1, 2)]
else:
    ngram_range=-1 
    
#if want to use stop_words
#type(text.ENGLISH_STOP_WORDS)
my_additional_stop_words=['hi','hello','dear','helpdesk']
my_stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)
#print (my_stop_words)
            
# prepare pipeline for grid search
if input_algorithm =='NB':
    #the grid of parameters to search over
    parameters = {
        'vect__min_df': min_dfs,
        'vect__max_df': max_dfs,
        'vect__ngram_range': ngram_range,
        #'tfidf__use_idf': (True, False),
        'clf__alpha': alphas
    }
    
    pipeline = Pipeline([
        ('vect', CountVectorizer(stop_words=my_stop_words)),
        ('tfidf', TfidfTransformer()),
        ('clf',  MultinomialNB()),
    ])
elif input_algorithm=='SGD':
    #the grid of parameters to search over
    parameters = {
        'vect__min_df': min_dfs,
        'vect__max_df': max_dfs,
        'vect__ngram_range': ngram_range,
        #'tfidf__use_idf': (True, False),
        'clf__penalty': ('l2', 'elasticnet'),
        'clf__alpha': alphas
        }
    
    pipeline = Pipeline([
        ('vect', CountVectorizer(stop_words=my_stop_words)),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier()),
    ])
    
    #reclassify Y output to numeric for SGD
    y_train_backup=y_train
    y_test_backup=y_test #save and reassign at the end, so this section can be run again without erroring out
    y_train=np.array(y_train.apply(fn_classify_numeric))
    y_test=np.array(y_test.apply(fn_classify_numeric))

# prepare grid search - find the best parameters for both the feature extraction and the classifier
# n_jobs=-1 grid search will detect how many cores are installed and uses them all
# cv defaults to 3 folds
classifier_grid = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=input_verbose_flg) 
if input_algorithm =='NB':
    print ("\nStep2: Training a Naive Bayes model...")
elif input_algorithm =='SGD':
    print ("\nStep2: Training a Stochastic Gradient Descent model...")

# ********************************     FIT MODEL  *****************************************************************************#
t0 = time.time()
print ("Performing grid search... (this may take up to 10 minutes)")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:", parameters)

classifier_grid.fit(X_train, y_train)

#done - print summary 
t1 = time.time()
print ("Done in %0.3fs" % (t1 - t0))

best_score = classifier_grid.best_score_
best_parameters = classifier_grid.best_estimator_.get_params()

print ("Best score: %0.3f" % best_score)
print ("Best parameters set:")
for param_name in sorted(parameters.keys()):
    print ("\t%s: %r" % (param_name, best_parameters[param_name]))
        
print("\nProcessing time to fit model (in min): ", (t1 - t0)/60)


### 3. Evaluate Model Performance

In [None]:
# ----------------------------------------------- 3. Evaluate Model Performance -----------------------------------------------
### 3a. Evaluate High Level Accuracy
# run on a test set  
# Report the accuracy of this model on both the training and testing data.  Are results comparable?

print ("\nStep3: Evaluating model performance...")

y_hat=classifier_grid.predict(X_test)

#done - print summary 
end=time.time()

accuracy_train=classifier_grid.score(X_train, y_train)
accuracy_test=classifier_grid.score(X_test, y_test)

print ("Done in %0.3fs" % (end - start))
print ("\nAccuracy on trainset: %0.4f " % accuracy_train)
print ("Accuracy on cv set: %0.4f (aka best score from grid search)" % best_score )
print ("Accuracy on testset: %0.4f " %  accuracy_test)

### 3c. More Analysis on the Model Accuracy:
if input_verbose_flg == 1:
    print ("Total files and fields in testset:", len(X_test))
    
#confusion matrix
#from sklearn import metrics
    print ("Classified categories: ", best_parameters.get('clf').classes_)
    print ("\nConfusion Matrix" )
    print(metrics.confusion_matrix( y_test, y_hat))

print ("\nClassification Report")
print (metrics.classification_report(y_test, y_hat))

print ("Model built.  Ready to predict.")


### Pickle the Model

In [None]:
joblib.dump(classifier_grid, "models/model_Classify.pkl")