## Combined Classifiers Solution

                                           
### Environment Setup

In [None]:
import re
import nltk
import numpy as np
import pandas as pd
import pickle as pkl
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_curve
from nltk.corpus import stopwords
from sklearn.svm import SVC
%matplotlib inline 
import matplotlib.pyplot as plt

### Syntactic NLP Processing

#### We will define some Python functions that will perform some syntactic work on our corpus. 

In [None]:
def tokenize(text):
    """Break *text* into word tokens, keeping only alphabetic ones.

    The text is split into sentences with NLTK, each sentence is split into
    word tokens, and any token that does not consist solely of the letters
    a-z / A-Z (numbers, punctuation, mixed tokens) is discarded.

    Returns the surviving tokens as a list, in their original order.
    """
    letters_only = re.compile('(^[a-zA-Z]+$)')
    words = []
    for sentence in nltk.sent_tokenize(text):
        words.extend(nltk.word_tokenize(sentence))
    return [word for word in words if letters_only.search(word)]

# Words excluded from the vocabulary: NLTK's English stop-word list followed
# by a hand-picked set of extra terms.
cachedStopWords = [
    *stopwords.words("english"),
    'year', 'old', 'man', 'woman', 'ap', 'am', 'pm', 'portable', 'pa', 'lat',
    'admitting', 'diagnosis', 'lateral',
]


### Retrieving our Corpus

#### Let's pull in our corpus that we had serialized out to disk.  

In [None]:
# Load the corpus that was previously pickled to disk.  `corpus` is treated
# as a mapping below: its keys and its values are both read.
# An alternative corpus file is 'classification-corpus.pkl'.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load corpus files that come from a trusted source.
with open('corpus.pkl', 'rb') as file:  # closed even if unpickling fails
    corpus = pkl.load(file)
corpusList = list(corpus.values())  # the documents fed to the vectorizer
labels = list(corpus.keys())        # the matching keys, in the same order

### Generate Document-Term Frequency Counts

#### In this step we tokenize our text and remove stop words in addition to generating our frequency counts.

#### 1) How many documents are we working with and how many features (unigrams & bigrams)?

#### 2) Can you figure out what max_df and min_df are doing to our feature count?

In [None]:
# Build the document-term count matrix over unigrams and bigrams.  With float
# values, max_df / min_df are document-frequency proportions: terms appearing
# in more than 80% of the documents, or in fewer than 3.3% of them, are
# dropped from the vocabulary.
cv = CountVectorizer(lowercase=True, max_df=0.80, max_features=None, min_df=0.033,
                     ngram_range=(1, 2), preprocessor=None, stop_words=cachedStopWords,
                     strip_accents=None, tokenizer=tokenize, vocabulary=None)
X = cv.fit_transform(corpusList)
print(X.shape)
print()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement when available and fall back on older versions.
# .tolist() keeps `lexicon` a plain list of str, as before.
if hasattr(cv, "get_feature_names_out"):
    lexicon = cv.get_feature_names_out().tolist()
else:
    lexicon = cv.get_feature_names()
print (lexicon)
print()

### Construct our Classes

#### We need to assign a class for each classification. We typically assign numeric values to classes.

In [None]:
# Binary target vector, one entry per corpus key: 1 for keys prefixed 'PNA',
# 0 for keys prefixed 'COPD' or 'CHF'.
Y = []
for key in corpus:
    if key.startswith(('COPD', 'CHF')):
        Y.append(0)
    elif key.startswith('PNA'):
        Y.append(1)
    else:
        # Skipping the key would leave Y shorter than the rows of X (which
        # has one row per corpus entry) and silently misalign the labels.
        raise ValueError('Cannot assign a class to corpus key: %r' % (key,))
Y = np.array(Y)

### Let's Run It!

#### We will generate models and evaluate the models using bootstrapping.

In [None]:
# Bootstrap evaluation.  Each of the 10 rounds draws N row indices with
# replacement (the in-the-bag sample, ITB), fits every classifier on those
# rows and predicts the rows that were never drawn (out-of-bag, OOB).  The OOB
# labels and predictions of all rounds are concatenated at the end.
truth = []
knn_list = [1, 5, 10]
knn_prediction = [[] for _ in knn_list]  # one prediction list per value of k
linsvm_prediction = []
rbfsvm_prediction = []
mnb_prediction = []
dt_prediction = []
bdt_prediction = []
rf_prediction = []

N, D = X.shape
# Dense copy used to slice out the OOB rows; computed once because it does
# not change between rounds.  toarray() replaces the `.A` shorthand, which
# recent SciPy releases no longer provide.
X_dense = X.toarray()

for iteration in range(10):
    print('Iteration: ' + str(iteration + 1))
    ITB = np.random.choice(N, N, replace=True)
    X_ITB = X[ITB, :]
    Y_ITB = Y[ITB]
    in_bag = np.unique(ITB)  # distinct rows that were drawn at least once
    X_OOB = np.delete(X_dense, in_bag, 0)
    Y_OOB = np.delete(Y, in_bag, 0)
    truth.append(Y_OOB)

    for k, k_predictions in zip(knn_list, knn_prediction):
        knn = KNeighborsClassifier(n_neighbors=k, weights='uniform', algorithm='brute',
                                   leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=1)
        knn.fit(X_ITB, Y_ITB)
        k_predictions.append(knn.predict(X_OOB))

    linsvm = SVC(C=1.0, gamma='auto', kernel='linear', max_iter=-1, verbose=False)
    linsvm.fit(X_ITB, Y_ITB)
    linsvm_prediction.append(linsvm.predict(X_OOB))

    rbfsvm = SVC(C=1.0, gamma='auto', kernel='rbf', max_iter=-1, verbose=False)
    rbfsvm.fit(X_ITB, Y_ITB)
    rbfsvm_prediction.append(rbfsvm.predict(X_OOB))

    mnb = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
    mnb.fit(X_ITB, Y_ITB)
    mnb_prediction.append(mnb.predict(X_OOB))

    dt = DecisionTreeClassifier(random_state=0)
    dt.fit(X_ITB, Y_ITB)
    dt_prediction.append(dt.predict(X_OOB))

    bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=201)
    bdt.fit(X_ITB, Y_ITB)
    bdt_prediction.append(bdt.predict(X_OOB))

    # "sqrt" is what "auto" meant for classifiers; "auto" itself is rejected
    # by recent scikit-learn releases.
    rf = RandomForestClassifier(n_estimators=101, oob_score=True, n_jobs=-1, max_features="sqrt")
    rf.fit(X_ITB, Y_ITB)
    rf_prediction.append(rf.predict(X_OOB))


# Flatten the per-round arrays into one long vector per classifier, aligned
# element-for-element with `truth`.
truth = np.concatenate(truth, axis=0)
knn_prediction = [np.concatenate(rounds, axis=0) for rounds in knn_prediction]
linsvm_prediction = np.concatenate(linsvm_prediction, axis=0)
rbfsvm_prediction = np.concatenate(rbfsvm_prediction, axis=0)
mnb_prediction = np.concatenate(mnb_prediction, axis=0)
dt_prediction = np.concatenate(dt_prediction, axis=0)
bdt_prediction = np.concatenate(bdt_prediction, axis=0)
rf_prediction = np.concatenate(rf_prediction, axis=0)

### Contingency Tables

#### Let's look at the contingency tables

In [None]:
def contingency_table(prediction, truth):
    """Return the 3x3 contingency table of *prediction* against *truth*.

    Rows are the predicted class and columns the true class, each labelled
    "Other" (class 0), "PNA" (class 1) and "Total" (the margins).

    Both classes always appear on both axes: a classifier that never
    predicts one of the classes gets a row of zeros for it instead of a
    table with a missing row.
    """
    class_codes = [0, 1]
    names = ["Other", "PNA", "Total"]
    counts = (pd.crosstab(prediction, truth)
              .reindex(index=class_codes, columns=class_codes, fill_value=0)
              .values)
    table = np.zeros((3, 3), dtype=np.int64)
    table[:2, :2] = counts
    table[:2, 2] = counts.sum(axis=1)   # row totals
    table[2, :2] = counts.sum(axis=0)   # column totals
    table[2, 2] = counts.sum()          # grand total
    return pd.DataFrame(table, index=names, columns=names)


def print_contingency_report(title, prediction, truth):
    """Print *title*, the contingency table and the derived metrics.

    The metrics are sensitivity, specificity, positive / negative predictive
    value and accuracy, with "PNA" (class 1) as the positive class.  A zero
    denominator yields nan/inf (with a NumPy warning) rather than an error.
    """
    ct = contingency_table(prediction, truth)
    print(title)
    print(ct)
    print()
    # Single-step positional indexing: chained ct.iloc[r][c] relies on
    # integer keys being treated as positions on a label-indexed Series,
    # which pandas has deprecated.
    Sens = ct.iloc[1, 1] / ct.iloc[2, 1]
    Spec = ct.iloc[0, 0] / ct.iloc[2, 0]
    PPV = ct.iloc[1, 1] / ct.iloc[1, 2]
    NPV = ct.iloc[0, 0] / ct.iloc[0, 2]
    ACC = (ct.iloc[0, 0] + ct.iloc[1, 1]) / ct.iloc[2, 2]
    print("Sensitivity: %.5f Specificity: %.5f PPV: %.5f NPV: %.5f Accuracy: %.5f" % (Sens, Spec, PPV, NPV, ACC))
    print()
    print()


for k, prediction in zip(knn_list, knn_prediction):
    print_contingency_report('KNN (K = %d)' % k, prediction, truth)

print_contingency_report('Linear SVM', linsvm_prediction, truth)
print_contingency_report('RBF Kernel SVM', rbfsvm_prediction, truth)
print_contingency_report('Multinomial Naive Bayes', mnb_prediction, truth)
print_contingency_report("Decision Tree", dt_prediction, truth)
print_contingency_report("Boosting Decision Stumps", bdt_prediction, truth)
print_contingency_report("Random Forest", rf_prediction, truth)


### ROC Curve

#### 1) So which classifier do you think is better?

In [None]:
# Plot one ROC curve per classifier from the pooled out-of-bag predictions.
# NOTE(review): the inputs are hard class labels from .predict(), not
# continuous scores, so each curve is expected to have a single operating
# point between (0, 0) and (1, 1) — confirm this is the intended comparison.
roc_inputs = [
    # (legend label, predictions) in plotting order, which fixes line colours.
    ('KNN-1', knn_prediction[0]),
    ('KNN-5', knn_prediction[1]),
    ('KNN-10', knn_prediction[2]),
    ('DecisionTree', dt_prediction),
    ('BoostingDecisionStumps', bdt_prediction),
    ('RandomForest', rf_prediction),
    ('Linear SVM', linsvm_prediction),
    ('RBF SVM', rbfsvm_prediction),
    ('Multi NB', mnb_prediction),
]

# Select the figure first so that every later call targets it.
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
for curve_label, prediction in roc_inputs:
    fpr, tpr, _ = roc_curve(truth, prediction, pos_label=1)
    plt.plot(fpr, tpr, label=curve_label)
# Zoom into the top-left corner of the ROC plot.
plt.xlim(0, 0.2)
plt.ylim(0.8, 1)
plt.xlabel('False positive rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()