## Agenda
## Build ML models to classify text 'token' into categories and subcategories

In [6]:
import os 
import re 
import nltk 
import random 
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

#### Uploading file genertaed in last code

In [1]:
import pandas as pd
df = pd.read_csv("CleanedData_updated_ml.csv")

In [2]:
from sklearn.cross_validation import train_test_split
train, test = train_test_split(df, train_size = 0.7)



## Part A - Classify tokens into categories

In [3]:
Train_input = train.tokens
Train_output = train.categories
Test_input = test.tokens
Test_output = test.categories

In [4]:
type(Test_output)

pandas.core.series.Series

In [11]:
Train_input.shape

(40096L,)

### Text needs to be convert into numerical form, since algorithm works on number n not on text


#### instantiate the model

In [12]:

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,3), max_df = 0.9, min_df = 2)

#### .fit(train) learns the vocabulary of the training data
#### .transform(train) uses the fitted vocabulary to build a document-term matrix from the training data

In [13]:
#fit the model,,,,it learns the relationsip btw features and response

X_train_dtm_sparse = tfidf_vectorizer.fit_transform(Train_input)

#### transform(on test) uses the fitted vocabulary to build a document-term matrix from the testing data (and ignores tokens it hasn't seen before)

In [14]:
# transform training data into document term matrix
X_test_dtm_sparse = tfidf_vectorizer.transform(Test_input)

In [15]:
type(X_train_dtm_sparse)

scipy.sparse.csr.csr_matrix

In [16]:
# store the vocabulary of X_train
Train_input_tokens = tfidf_vectorizer.get_feature_names()
len(Train_input_tokens)

278736

In [17]:

# examine the first 50 tokens
print(Train_input_tokens[50000:50050])

[u'cvs pharmacy alexander', u'cvs pharmacy battle', u'cvs pharmacy bickett', u'cvs pharmacy call', u'cvs pharmacy capital', u'cvs pharmacy cole', u'cvs pharmacy east', u'cvs pharmacy forest', u'cvs pharmacy garrett', u'cvs pharmacy glenwood', u'cvs pharmacy hillsborough', u'cvs pharmacy hwy', u'cvs pharmacy kildaire', u'cvs pharmacy knighdale', u'cvs pharmacy leesville', u'cvs pharmacy main', u'cvs pharmacy medication', u'cvs pharmacy nash', u'cvs pharmacy nc', u'cvs pharmacy new', u'cvs pharmacy north', u'cvs pharmacy pharmacy', u'cvs pharmacy piney', u'cvs pharmacy reason', u'cvs pharmacy retail', u'cvs pharmacy roxboro', u'cvs pharmacy rx', u'cvs pharmacy south', u'cvs pharmacy sunset', u'cvs pharmacy timber', u'cvs pharmacy tryon', u'cvs pharmacy village', u'cvs pharmacy wake', u'cvs pharmacy west', u'cw', u'cw doppler', u'cw doppler color', u'cwp', u'cwp atp', u'cwp atp follow', u'cwp atp rna', u'cwp compdl', u'cwp please', u'cwp please advise', u'cx', u'cx additional', u'cx addit

In [18]:
print(Train_input_tokens[-50:])

[u'zonegran request', u'zonegran rx', u'zonegran rx called', u'zonegran week', u'zonegran well', u'zonegran went', u'zonegran would', u'zonegran zonisamide', u'zonegran zonisamide red', u'zonergran', u'zonergran mg', u'zoning', u'zonisamide', u'zonisamide bid', u'zonisamide mg', u'zonisamide mg day', u'zonisamide please', u'zonisamide red', u'zonisamide red cpt', u'zonisamide rx', u'zonisamide say', u'zonisamide tab', u'zoniside', u'zoniside cap', u'zoniside cap po', u'zoniside mg', u'zoniside mg cap', u'zoniside mg oral', u'zoniside one', u'zoniside one po', u'zoniside po', u'zoniside po day', u'zoniside po qhs', u'zoniside rx', u'zoniside take', u'zonisimide', u'zoster', u'zoster vzv', u'zoster vzv antibody', u'zpack', u'zyprexa', u'zyprexa additional', u'zyprexa additional follow', u'zyprexa mg', u'zyprexa mg oral', u'zyprexa mg tab', u'zyprexa rx', u'zyprexa rx built', u'zyprexa start', u'zyrtec']


In [None]:
#pd.DataFrame(X_train_dtm_sparse.todense()[35:40,12735:12745], 
#             columns = tfidf_vectorizer.get_feature_names()[12735:12745])

In [None]:
#inorder to make predictions new observation must have the same features as the training obs, 
#both in number n meaning other it will give error, if the order of features is chaged,,the op will be wrong
#sckit acpets its input to be numpy array n numpy arrays are not lableled

### Building model with Logistic 

In [19]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression 

In [20]:
type(Train_output)

pandas.core.series.Series

In [21]:
glm = LogisticRegression()

%time glm.fit(X_train_dtm_sparse, np.array(Train_output))
predictions = glm.predict(X_test_dtm_sparse)

print(confusion_matrix(Test_output, predictions))
accuracy_score(Test_output, predictions)

Wall time: 23.2 s
[[3564  233    0   28  295  112]
 [ 165 2701    0   42  258  317]
 [   3    0    0    0    4    0]
 [  63   68    0  962  171   61]
 [ 285  285    0   70 2772  218]
 [ 107  274    0   28  295 3803]]


0.8031890130353817

### building model with Multinomial Naive bayes

In [22]:
nb = MultinomialNB()
%time  nb.fit(X_train_dtm_sparse, np.array(Train_output))

Wall time: 416 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [23]:
predictions = nb.predict(X_test_dtm_sparse)
print(confusion_matrix(Test_output, predictions))
accuracy_score(Test_output, predictions)

[[3265  597    0    2  272   96]
 [  93 2783    0    2  179  426]
 [   3    0    0    0    4    0]
 [ 104  462    0  324  386   49]
 [ 398  783    0    0 2246  203]
 [ 176  441    0    0  372 3518]]


0.70623836126629425

In [24]:
# Naive Bayes counts the number of times each token appears in each class
nb.feature_count_

array([[  0.        ,   0.21032796,   0.21042853, ...,   0.        ,
          0.        ,   0.        ],
       [  0.08783627,   0.28615352,   0.28629035, ...,   0.17090275,
          0.16891245,   0.55812203],
       [  0.        ,   0.09870161,   0.0987488 , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,   1.91024221,   1.91115558, ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,  25.0028521 ,  24.96286517, ...,   0.        ,
          0.        ,   0.        ],
       [  0.09436082,   1.56264573,   1.56339289, ...,   0.        ,
          0.        ,   0.37127086]])

In [25]:
# Naive Bayes counts the number of observations in each class
nb.class_count_

array([  9640.,   8317.,     14.,   2996.,   8540.,  10589.])

In [26]:
nb.feature_count_.shape

(6L, 278736L)

### Modeling with Random Forest

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
rf = RandomForestClassifier() # initialize

In [29]:
%time  rf.fit(X_train_dtm_sparse, np.array(Train_output)) # fit the data to the algorithm

Wall time: 1min 32s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [30]:
predictions = rf.predict(X_test_dtm_sparse)
print(confusion_matrix(Test_output, predictions))
accuracy_score(Test_output, predictions)

[[3365  341    0   20  428   78]
 [ 368 2380    0   32  355  348]
 [   0    0    2    0    5    0]
 [ 135  143    0  738  278   31]
 [ 536  465    1   77 2358  193]
 [ 208  463    0   19  423 3394]]


0.71211592178770955

#### print message text for the misclassification

In [31]:

Test_input[Test_output != predictions]

35093    cell phone call patient caller name : caller :...
27192    home phone cell phone call patient caller : pa...
5850     cell phone call patient caller name : caller :...
20112    home phone call patient patient want phone cal...
42442    cell phone call patient caller name : patient ...
31812    patient notified appointment ... kg -- -- conv...
43993    : -- call patient caller : daughter law appoin...
20161    home phone cell phone call patient caller name...
104      xxx left message patient call back schedule ap...
4204     cell phone call patient prescri patient ion \f...
3814     \margl\margr\margt\margb\headery\footery\forms...
7436     home phone cell phone call patient caller : pa...
45582    tbl n mal mshade incoming call caller name : c...
14662    home phone call patient caller : patient patie...
40661    cell phone call patient appointment oberlin rd...
27460    left message patient call back schedule appoin...
14947    home phone call patient caller name : caller :.

In [33]:
Test_input[35093]

"cell phone call patient caller name : caller : dad patient 's dad called state went pharmcy told patien'ts medication changed state impression medication changed kmr . please call back advise @ april , : rna follow-up follow-up detail : spoke dad . told lamictal hold since wanted try wait summertime try new med . dad v/u . follow-up : lori brown rn , april , : clinical list change medication updated : changed medication lictal mg chew ( lotrigine ) po bid lictal mg chew ( lotrigine ) hold - po bid - signed rx lictal mg chew ( lotrigine ) hold - po bid # tablet x signed entered : lori brown rn authorizedd : historicalplease cosign - new medical pharmacy"

## Part B - Classify tokens into sub_categories

In [34]:
Train_input1 = train.tokens
Train_output1 = train.sub_categories
Test_input1 = test.tokens
Test_output1 = test.sub_categories

In [35]:
#instantiate the model
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,3), max_df = 0.9, min_df = 2)

In [36]:
X_train_dtm_sparse1 = tfidf_vectorizer.fit_transform(Train_input1)

In [37]:
# transform training data into document term matrix
X_test_dtm_sparse1 = tfidf_vectorizer.transform(Test_input1)

In [38]:
type(X_train_dtm_sparse1)

scipy.sparse.csr.csr_matrix

### logistic Reg

In [40]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression 

In [41]:
type(Train_output1)

pandas.core.series.Series

In [53]:
glm = LogisticRegression()

%time glm.fit(X_train_dtm_sparse1, np.array(Train_output1))
predictions1 = glm.predict(X_test_dtm_sparse1)

#print(confusion_matrix(Test_output1, predictions1))
accuracy_score(Test_output1, predictions1)

Wall time: 1min 12s


0.71665502793296088

### Multinomial Naive bayes

In [54]:
nb = MultinomialNB()
%time  nb.fit(X_train_dtm_sparse1, np.array(Train_output1))

Wall time: 879 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [55]:
predictions1 = nb.predict(X_test_dtm_sparse1)
#print(confusion_matrix(Test_output, predictions1))
accuracy_score(Test_output1, predictions1)

0.53171554934823095

### Random Forest

In [56]:
from sklearn.ensemble import RandomForestClassifier

In [57]:
rf = RandomForestClassifier() # initialize

In [58]:
%time  rf.fit(X_train_dtm_sparse1, np.array(Train_output1)) # fit the data to the algorithm

Wall time: 1min 39s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [60]:
predictions1 = rf.predict(X_test_dtm_sparse1)
#print(confusion_matrix(Test_output1, predictions1))
accuracy_score(Test_output1, predictions1)

0.62383612662942267