In [1]:
import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from NLPutils import NLPutils as NLP
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.snowball import EnglishStemmer # load the stemmer module from NLTK
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import make_scorer
import pickle
import math
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import time
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import py_trees
#import behaviours as be
from py_trees.blackboard import Blackboard
from scipy import spatial
from pandas import DataFrame
import re
import matplotlib.pyplot as plt
%matplotlib notebook

In [2]:
class RAAdata(object):
    def __init__(self,text,vital,inter):
        self.text = text
        self.vital = vital
        self.inter = inter
        
def load_RAA_data(path, cv = True): 
    df = pd.read_excel(path)
    interset = set()
    interdict = dict()
    narratives = df['Narrative']
    narratives = [i for i in narratives]
    inters = df['Interventions']
    vitals = df['Vitals']
    vitals = [i for i in vitals]
    interventions = []
    for item in inters:
        inter = item.strip('{}').split('}{')
        inter = [i.split(':')[-1].strip().lower() for i in inter]
        c_int = []
        for j in inter:
            interset.add(j)
            if j in interdict:
                interdict[j] += 1
            else:
                interdict[j] = 1
            c_int.append(j)
        interventions.append(c_int)
    for inter in list(interdict):
        if cv and interdict[inter] < 20: del interdict[inter]
    data = [RAAdata(item,vitals[idx],interventions[idx]) for idx,item in enumerate(narratives)]
    
    return data,interdict

def fullmatch(regex, string, flags=0):
    """Emulate python-3.4 re.fullmatch()."""
    return re.match("(?:" + regex + r")\Z", string, flags=flags)

# preprocess utils
def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext

def cleanPunc(sentence):
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned

def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

stemmer = SnowballStemmer("english")
def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

def weighted_precision_recall_f1_util (y_test, y_pre, weight = None):
    tp, fp, fn = [0. for _ in range(len(y_pre[0]))], [0. for _ in range(len(y_pre[0]))], \
    [0. for _ in range(len(y_pre[0]))]
    for idx in range(len(y_pre)):
        for i in range(len(y_pre[idx])):
            if y_pre[idx][i] == 1 and y_test[idx][i] == 1: tp[i] += 1
            elif y_pre[idx][i] == 1 and y_test[idx][i] == 0: fp[i] += 1
            elif y_pre[idx][i] == 0 and y_test[idx][i] == 1: fn[i] += 1
    precision = [tp[i] / (tp[i] + fp[i]) if tp[i] > 0 or fp[i] > 0 else 0. for i in range(len(tp))]
    recall = [tp[i] / (tp[i] + fn[i]) if tp[i] > 0 or fn[i] > 0 else 0. for i in range(len(tp))]
    f1 = [2 * precision[i] * recall[i] / (precision[i] + recall[i]) \
         if precision[i] > 0 or recall[i] > 0 else 0. for i in range(len(tp))]
    return np.average(precision, weights = weight), np.average(recall, weights = weight), \
np.average(f1, weights = weight)

def weighted_precision (y_test, y_pre, weight = None):
    precision, _, _ = weighted_precision_recall_f1_util (y_test, y_pre, weight)
    return precision

def weighted_recall (y_test, y_pre, weight = None):
    _, recall, _ = weighted_precision_recall_f1_util (y_test, y_pre, weight)
    return recall

def weighted_f1 (y_test, y_pre, weight = None):
    _, _, f1 = weighted_precision_recall_f1_util (y_test, y_pre, weight)
    return f1

def show_results(scores):
    metrics = ['test_precision_weighted','test_recall_weighted', 'test_f1_weighted',\
            'test_precision_micro', 'test_recall_micro', 'test_f1_micro']
    for metric in metrics:
        print metric + ':' + '%.2f' % np.average(scores[metric])
        
def risk_factor(gt, probs, preds):
    risk = []
    for idx,case in enumerate(probs):
        r = 0
        for i,prob in enumerate(case):
            if preds[idx][i] == 1 and gt[idx][i] == 0:
                r += prob * int2fp_score[num2int[i]] / sum(gt[idx])
            if preds[idx][i] == 0 and gt[idx][i] == 1:
                r += prob * int2fn_score[num2int[i]] / sum(gt[idx])
        risk.append(r)
    return sum(risk) / len(risk)

def trans_prob(probs):
    transed_prob = [[0.] * len(probs) for _ in range(len(probs[0]))]
    for idx, res in enumerate(probs):
        for i, p in enumerate(res):
            if len(p) < 2: transed_prob[i][idx] = 1. - p[0]
            else: transed_prob[i][idx] = p[1]
                
    return transed_prob

def show_test_results(gt, res, prob, class_weight):
    print "precision_micro" + ':' + '%.2f' % precision_score(gt, res, average = 'micro')
    print "recall_micro" + ':' + '%.2f' % recall_score(gt, res, average = 'micro')
    print "f1_micro" + ':' + '%.2f' % f1_score(gt, res, average = 'micro')
    print "precision_weighted" + ':' + '%.2f' % weighted_precision(gt, res, class_weight)
    print "recall_weighted" + ':' + '%.2f' % weighted_recall(gt, res, class_weight)
    print "f1_weighted" + ':' + '%.2f' % weighted_f1(gt, res, class_weight)
    print "risk_factor" + ':' + '%.4f' % risk_factor(gt, prob, res)
    
def filtering(res, prob, threshold):
    for idx, case in enumerate(res):
        for i in range(len(case)):
            if prob[idx][i] < threshold:
                res[idx][i] = 0
                prob[idx][i] = 0.
    return res, prob

In [3]:
# read labeled cases
docu = './RAA_train.xlsx'
df = pd.read_excel(docu)
train_narratives = df['Narrative']

In [4]:
docu = './RAA_1000_test.xlsx'
df = pd.read_excel(docu)
test_narratives = df['Narrative']

In [6]:
##SKIP##with open('test_vec_list.txr','w') as fo:
    pickle.dump(test_vec, fo)

NameError: name 'test_vec' is not defined

In [10]:
##SKIP##with open('test_vec_list.txr') as fo:
    test_vec = pickle.load(fo)

IndentationError: unexpected indent (<ipython-input-10-94f7e295b585>, line 2)

In [11]:
##SKIP##len(test_vec)

In [6]:
narra,intdict = load_RAA_data('./RAA_train.xlsx')
test_narra, _ = load_RAA_data('./RAA_1000_test.xlsx', cv = False)
risk_route = './Intervention Safety Sheet.xlsx'
df_risk = pd.read_excel(risk_route)
int2fn_score = dict()
int2fp_score = dict()
for row in df_risk.iterrows():
    name = row[1]['Intervention'].split('\'')[1]
    FN_score, FP_score = 0, 0
    if not pd.isnull(row[1]['If NOT Done When Indicated']):
        FN_score = int(row[1]['If NOT Done When Indicated'])
    if not pd.isnull(row[1]['If Done When NOT Indicated']):
        FP_score = int(row[1]['If Done When NOT Indicated'])
    if not FN_score or not FP_score or (name not in intdict):
        continue
    int2fn_score[name] = FN_score
    int2fp_score[name] = FP_score
int2num = dict()
num2int = dict()
for i,key in enumerate(int2fn_score):
    int2num[key] = i
    num2int[i] = key
#n = NLP()
# load technical n-grams
fo = open('ngrams.txt')
ngrams = set()
for line in fo:
    if line == '\n': continue
    ngrams.add(line.strip('\n'))
fo.close()

In [7]:
inter_safety = [dict() for _ in range(len(int2num))]
for idx in range(len(num2int)):
    inter_safety[idx][0] = 1. / int2fn_score[num2int[idx]]
    inter_safety[idx][1] = 1. / int2fp_score[num2int[idx]]
inter_safety_dic = dict()
for idx in range(len(num2int)):
    inter_safety_dic[idx] = int2fp_score[num2int[idx]]

In [8]:
total_text = [i.text for i in narra]
total_inter = [i.inter for i in narra]

In [9]:
train,_ = train_test_split(narra, random_state=46, test_size=.2, shuffle=True)
train_text = [i.text for i in train]
train_inter = [i.inter for i in train]
test_text = [i.text for i in test_narra]
test_inter = [i.inter for i in test_narra]

In [10]:
def preprocess(text):
    text = text.lower()
    text = cleanPunc(text)
    text = keepAlpha(text)
    text = stemming(text)
    return text

vectorizer = TfidfVectorizer(ngram_range=(1,1), preprocessor = preprocess, stop_words = 'english', norm='l2')
vectorizer.fit(train_text)
x_train = vectorizer.transform(train_text)
y_train = [[int2num[inter] for inter in case if inter in int2num] for case in train_inter]
encoded_y_train = np.array([[int(num in case) for num in range(len(int2num))] for case in y_train])
x_test = vectorizer.transform(test_text)
y_test = [[int2num[inter] for inter in case if inter in int2num] for case in test_inter]
encoded_y_test = np.array([[int(num in case) for num in range(len(int2num))] for case in y_test])

  'stop_words.' % sorted(inconsistent))


In [11]:
vectorizer = TfidfVectorizer(ngram_range=(1,1), preprocessor = preprocess, stop_words = 'english', norm='l2')
vectorizer.fit(total_text)
x_total = vectorizer.transform(total_text)
y_total = [[int2num[inter] for inter in case if inter in int2num] for case in total_inter]
encoded_y_total = np.array([[int(num in case) for num in range(len(int2num))] for case in y_total])
class_weight = np.sum(encoded_y_total, axis = 0)

In [12]:
vectorizer = TfidfVectorizer(ngram_range=(1,4),vocabulary = ngrams,\
                             preprocessor = preprocess, stop_words = 'english', norm='l2')
vectorizer.fit(train_text)
x_train_ngram = vectorizer.transform(train_text)
y_train = [[int2num[inter] for inter in case if inter in int2num] for case in train_inter]
encoded_y_train = np.array([[int(num in case) for num in range(len(int2num))] for case in y_train])
x_test_ngram = vectorizer.transform(test_text)
y_test = [[int2num[inter] for inter in case if inter in int2num] for case in test_inter]
encoded_y_test = np.array([[int(num in case) for num in range(len(int2num))] for case in y_test])

In [13]:
vectorizer = TfidfVectorizer(ngram_range=(1,4),vocabulary = ngrams,\
                             preprocessor = preprocess, stop_words = 'english', norm='l2')
vectorizer.fit(total_text)
x_total_ngram = vectorizer.transform(total_text)
y_total = [[int2num[inter] for inter in case if inter in int2num] for case in total_inter]
encoded_y_total = np.array([[int(num in case) for num in range(len(int2num))] for case in y_total])

In [14]:
scoring = {'precision_weighted': make_scorer(weighted_precision, weight = class_weight),
           'recall_weighted': make_scorer(weighted_recall, weight = class_weight),
           'f1_weighted': make_scorer(weighted_f1, weight = class_weight),
           'precision_micro': 'precision_micro',
           'recall_micro': 'recall_micro',
           'f1_micro': 'f1_micro'}

In [2]:
##SKIP for now##
###look into incremental training#
#Look into sklearn option to spread work across processors#
clf = OneVsRestClassifier(SVC(kernel = 'linear', probability=True))
scores_1 = cross_validate(clf, x_total, encoded_y_total, scoring=scoring,
                         cv=5, n_jobs=-1, return_train_score=False, return_estimator=True)
show_results(scores_1)

NameError: name 'x_total' is not defined

In [70]:
##SKIP for now##
###look into incremental training#
#Look into sklearn option to spread work across processors#
clf = OneVsRestClassifier(SVC(kernel = 'linear', probability=True))
clf.fit(x_train, encoded_y_train)
start_time = time.time()
y_pre_1 = clf.predict(x_test)
elapsed_time = time.time() - start_time
print elapsed_time
y_pos_1 = clf.predict_proba(x_test)
show_test_results(encoded_y_test, y_pre_1, y_pos_1, class_weight)

62.9412081242
precision_micro:0.92
recall_micro:0.88
f1_micro:0.90
precision_weighted:0.88
recall_weighted:0.86
f1_weighted:0.87
risk_factor:0.2360


In [17]:
clf_rf = RandomForestClassifier()
scores_2 = cross_validate(clf_rf, x_total, encoded_y_total, scoring=scoring,
                         cv=5, n_jobs=-1, return_train_score=False, return_estimator=True)

In [18]:
show_results(scores_2)

test_precision_weighted:0.86
test_recall_weighted:0.70
test_f1_weighted:0.72
test_precision_micro:0.89
test_recall_micro:0.69
test_f1_micro:0.78


In [19]:
clf_rf = RandomForestClassifier()
clf_rf.fit(x_train, encoded_y_train)
start_time = time.time()
y_pre_2 = clf_rf.predict(x_test)
elapsed_time = time.time() - start_time
print elapsed_time
y_pos_2 = clf_rf.predict_proba(x_test)
show_test_results(encoded_y_test, y_pre_2, trans_prob(y_pos_2), class_weight)



0.104833841324
precision_micro:0.90
recall_micro:0.71
f1_micro:0.79
precision_weighted:0.79
recall_weighted:0.68
f1_weighted:0.70
risk_factor:0.4160


In [14]:
clf_dt = DecisionTreeClassifier()
scores_3 = cross_validate(clf_dt, x_total, encoded_y_total, scoring=scoring,
                         cv=5, n_jobs=-1, return_train_score=False, return_estimator=True)

In [15]:
show_results(scores_3)

test_precision_weighted:0.80
test_recall_weighted:0.81
test_f1_weighted:0.80
test_precision_micro:0.82
test_recall_micro:0.81
test_f1_micro:0.81


In [16]:
clf_dt = DecisionTreeClassifier()
clf_dt.fit(x_train, encoded_y_train)
start_time = time.time()
y_pre_3 = clf_dt.predict(x_test)
elapsed_time = time.time() - start_time
print elapsed_time
y_pos_3 = clf_dt.predict_proba(x_test)
show_test_results(encoded_y_test, y_pre_3, trans_prob(y_pos_3), class_weight)

0.0083110332489
precision_micro:0.84
recall_micro:0.82
f1_micro:0.83
precision_weighted:0.81
recall_weighted:0.80
f1_weighted:0.80
risk_factor:0.2785


In [28]:
##TAP##
##K Nearest Neighbor##
#Distance#
from sklearn import neighbors
n_neighbors = 15
weights = 'distance'

for i_n_neighbors in range(5,n_neighbors+1):
    clf_knn = neighbors.KNeighborsClassifier(i_n_neighbors, weights=weights)
    scores_3 = cross_validate(clf_knn, x_total, encoded_y_total, scoring=scoring,
                         cv=5, n_jobs=-1, return_train_score=False, return_estimator=True)
    print('------------------------------------->')
    print('Distance: n_neigh',i_n_neighbors)
    show_results(scores_3)
    

------------------------------------->
('Distance: n_neigh', 5)
test_precision_weighted:0.78
test_recall_weighted:0.77
test_f1_weighted:0.75
test_precision_micro:0.82
test_recall_micro:0.76
test_f1_micro:0.79
------------------------------------->
('Distance: n_neigh', 6)
test_precision_weighted:0.79
test_recall_weighted:0.77
test_f1_weighted:0.76
test_precision_micro:0.82
test_recall_micro:0.77
test_f1_micro:0.79
------------------------------------->
('Distance: n_neigh', 7)
test_precision_weighted:0.79
test_recall_weighted:0.76
test_f1_weighted:0.75
test_precision_micro:0.82
test_recall_micro:0.75
test_f1_micro:0.78
------------------------------------->
('Distance: n_neigh', 8)
test_precision_weighted:0.79
test_recall_weighted:0.77
test_f1_weighted:0.75
test_precision_micro:0.83
test_recall_micro:0.75
test_f1_micro:0.79
------------------------------------->
('Distance: n_neigh', 9)
test_precision_weighted:0.79
test_recall_weighted:0.75
test_f1_weighted:0.74
test_precision_micro:0.

In [21]:
clf_knn = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
clf_knn.fit(x_train, encoded_y_train)
start_time = time.time()
y_pre_3 = clf_knn.predict(x_test)
elapsed_time = time.time() - start_time
print elapsed_time
y_pos_3 = clf_knn.predict_proba(x_test)
show_test_results(encoded_y_test, y_pre_3, trans_prob(y_pos_3), class_weight)

1.08958005905
precision_micro:0.86
recall_micro:0.74
f1_micro:0.80
precision_weighted:0.80
recall_weighted:0.73
f1_weighted:0.74
risk_factor:0.3889


In [18]:
from sklearn import neighbors
n_neighbors = 15
weights = 'uniform'

for i_n_neighbors in range(5,n_neighbors+1):
    clf_knn = neighbors.KNeighborsClassifier(i_n_neighbors, weights=weights)
    scores_3 = cross_validate(clf_knn, x_total, encoded_y_total, scoring=scoring,
                         cv=5, n_jobs=-1, return_train_score=False, return_estimator=True)
    print('------------------------------------->')
    print('Uniform: n_neigh',i_n_neighbors)
    show_results(scores_3)

------------------------------------->
('Uniform: n_neigh', 5)
test_precision_weighted:0.77
test_recall_weighted:0.75
test_f1_weighted:0.74
test_precision_micro:0.81
test_recall_micro:0.75
test_f1_micro:0.78
------------------------------------->
('Uniform: n_neigh', 6)
test_precision_weighted:0.79
test_recall_weighted:0.71
test_f1_weighted:0.72
test_precision_micro:0.85
test_recall_micro:0.70
test_f1_micro:0.77
------------------------------------->
('Uniform: n_neigh', 7)
test_precision_weighted:0.77
test_recall_weighted:0.74
test_f1_weighted:0.73
test_precision_micro:0.81
test_recall_micro:0.74
test_f1_micro:0.77
------------------------------------->
('Uniform: n_neigh', 8)
test_precision_weighted:0.79
test_recall_weighted:0.71
test_f1_weighted:0.71
test_precision_micro:0.84
test_recall_micro:0.70
test_f1_micro:0.76
------------------------------------->
('Uniform: n_neigh', 9)
test_precision_weighted:0.77
test_recall_weighted:0.73
test_f1_weighted:0.72
test_precision_micro:0.81
te

In [19]:
show_results(scores_3)

test_precision_weighted:0.80
test_recall_weighted:0.74
test_f1_weighted:0.72
test_precision_micro:0.82
test_recall_micro:0.72
test_f1_micro:0.77


In [20]:
clf_knn = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform')
clf_knn.fit(x_train, encoded_y_train)
start_time = time.time()
y_pre_3 = clf_knn.predict(x_test)
elapsed_time = time.time() - start_time
print elapsed_time
y_pos_3 = clf_knn.predict_proba(x_test)
show_test_results(encoded_y_test, y_pre_3, trans_prob(y_pos_3), class_weight)

3.68601703644
precision_micro:0.86
recall_micro:0.74
f1_micro:0.80
precision_weighted:0.80
recall_weighted:0.73
f1_weighted:0.74
risk_factor:0.3871


In [17]:
heap = []
for idx, importance in enumerate(clf_dt.feature_importances_):
    heap.append((importance, idx))
heap.sort(reverse = True)
for item in heap[:30]:
    print vectorizer.get_feature_names()[item[1]]
    print item[0]

benadryl en route
0.06767252225826034
golf ball
0.06209289895747574
address store
0.05219661975393536
front driver side corner
0.042514874406786744
chin lift
0.034479024135152206
patient lying unresponsive
0.0312960987037582
breath away
0.029438043227958434
drank cold water
0.02500171368974516
look well
0.017344479974113754
ankle pain
0.016592926589528927
hospitalized multiple
0.014676397430999638
child care issue
0.014098247445898708
oxygen use
0.013201628921830426
past feel
0.010114745036779107
equal mobility
0.008374562373298294
house yesterday
0.007889074885254187
ambulatory onscene
0.007828201796429762
hallway laying
0.0074822465419892
medication administraton
0.007366061866284456
free other
0.007141535147213692
current medicaiton
0.006911205931570831
insulin pump
0.00658760343591258
confused behavior
0.005364718922628631
initial fall
0.005299895046551835
inhaler use
0.0051942454992976615
black female lying supine
0.005153685338033144
anterior part
0.00464826572332659
intubated nu

In [23]:
clf_4 = OneVsRestClassifier(SVC(kernel = 'linear', probability=True))
scores_4 = cross_validate(clf_4, x_total_ngram, encoded_y_total, scoring=scoring,
                         cv=5, n_jobs=-1, return_train_score=False, return_estimator=True)

In [24]:
show_results(scores_4)

test_precision_weighted:0.82
test_recall_weighted:0.75
test_f1_weighted:0.76
test_precision_micro:0.88
test_recall_micro:0.75
test_f1_micro:0.81


In [26]:
clf_4 = OneVsRestClassifier(SVC(kernel = 'linear', probability=True))
clf_4.fit(x_train_ngram, encoded_y_train)
y_pre_4 = clf_4.predict(x_test_ngram)
y_pos_4 = clf_4.predict_proba(x_test_ngram)
show_test_results(encoded_y_test, y_pre_4, y_pos_4, class_weight)

precision_micro:0.89
recall_micro:0.77
f1_micro:0.83
precision_weighted:0.83
recall_weighted:0.77
f1_weighted:0.78
risk_factor:0.3177


In [153]:
clf_rf_1 = RandomForestClassifier()
scores_5 = cross_validate(clf_rf_1, x_total_ngram, encoded_y_total, scoring=scoring,
                         cv=5, n_jobs=-1, return_train_score=False, return_estimator=True)

In [154]:
show_results(scores_5)

test_precision_weighted:0.83
test_recall_weighted:0.66
test_f1_weighted:0.69
test_precision_micro:0.88
test_recall_micro:0.66
test_f1_micro:0.75


In [183]:
clf_rf_1 = RandomForestClassifier()
clf_rf_1.fit(x_train_ngram,encoded_y_train)
y_pre_5 = clf_rf_1.predict(x_test_ngram)
y_pos_5 = clf_rf_1.predict_proba(x_test_ngram)
show_test_results(encoded_y_test, y_pre_5, trans_prob(y_pos_5), class_weight)

precision_micro:0.88
recall_micro:0.66
f1_micro:0.76
precision_weighted:0.82
recall_weighted:0.67
f1_weighted:0.69
risk_factor:0.4570


In [118]:
y_pos_5_c = [[0.] * len(y_pos_5) for _ in range(len(y_pos_5[0]))]
for idx, res in enumerate(y_pos_5):
    for i, p in enumerate(res):
        if len(p) < 2: y_pos_5_c[i][idx] = 1. - p[0]
        else: y_pos_5_c[i][idx] = p[1]

risk = []
for case in y_pos_5_c:
    r = 0
    for i,pos in enumerate(case):
        r += pos * num2risk[num2int[i]]
    risk.append(r)
print sum(risk)/len(risk)

8.03353867214


In [18]:
clf_dt_1 = DecisionTreeClassifier()
scores_6 = cross_validate(clf_dt_1, x_total_ngram, encoded_y_total, scoring=scoring,
                         cv=5, n_jobs=-1, return_train_score=False, return_estimator=True)

In [19]:
show_results(scores_6)

test_precision_weighted:0.73
test_recall_weighted:0.74
test_f1_weighted:0.73
test_precision_micro:0.75
test_recall_micro:0.74
test_f1_micro:0.75


In [20]:
clf_dt_1 = DecisionTreeClassifier()
clf_dt_1.fit(x_train_ngram,encoded_y_train)
y_pre_6 = clf_dt_1.predict(x_test_ngram)
y_pos_6 = clf_dt_1.predict_proba(x_test_ngram)

show_test_results(encoded_y_test, y_pre_6, trans_prob(y_pos_6), class_weight)

precision_micro:0.77
recall_micro:0.75
f1_micro:0.76
precision_weighted:0.73
recall_weighted:0.72
f1_weighted:0.72
risk_factor:0.4762


In [121]:
y_pos_6_c = [[0.] * len(y_pos_6) for _ in range(len(y_pos_6[0]))]
for idx, res in enumerate(y_pos_6):
    for i, p in enumerate(res):
        if len(p) < 2: y_pos_6_c[i][idx] = 1. - p[0]
        else: y_pos_6_c[i][idx] = p[1]

risk = []
for case in y_pos_6_c:
    r = 0
    for i,pos in enumerate(case):
        r += pos * num2risk[num2int[i]]
    risk.append(r)
print sum(risk)/len(risk)

7.92539356605


In [89]:
clf_7 = OneVsRestClassifier(SVC(probability=True, class_weight = inter_safety_dic))
# cv_results = cross_validate(clf, x_train, y_train, cv=5)
clf_7.fit(x_train,encoded_y_train)
y_pre_7 = clf_7.predict(x_test)
y_pos_7 = clf_7.predict_proba(x_test)

print precision_score(encoded_y_test,y_pre_7,average = 'weighted')
print precision_score(encoded_y_test,y_pre_7,average = 'micro')
print recall_score(encoded_y_test,y_pre_7,average = 'weighted')
print recall_score(encoded_y_test,y_pre_7,average = 'micro')
print f1_score(encoded_y_test,y_pre_7,average = 'weighted')
print f1_score(encoded_y_test,y_pre_7,average = 'micro')

ValueError: Class label 2 not present.

In [156]:
clf_rf_2 = RandomForestClassifier(class_weight = inter_safety)
scores_7 = cross_validate(clf_rf_2, x_total, encoded_y_total, scoring=scoring,
                         cv=5, n_jobs=-1, return_train_score=False, return_estimator=True)

In [157]:
show_results(scores_7)

test_precision_weighted:0.84
test_recall_weighted:0.65
test_f1_weighted:0.67
test_precision_micro:0.88
test_recall_micro:0.64
test_f1_micro:0.74


In [185]:
clf_rf_2 = RandomForestClassifier(class_weight = inter_safety)
clf_rf_2.fit(x_train,encoded_y_train)
y_pre_8 = clf_rf_2.predict(x_test)
y_pos_8 = clf_rf_2.predict_proba(x_test)
show_test_results(encoded_y_test, y_pre_8, trans_prob(y_pos_8), class_weight)

precision_micro:0.87
recall_micro:0.64
f1_micro:0.74
precision_weighted:0.79
recall_weighted:0.65
f1_weighted:0.66
risk_factor:0.4941


In [21]:
clf_dt_2 = DecisionTreeClassifier(class_weight = inter_safety)
scores_8 = cross_validate(clf_dt_2, x_total, encoded_y_total, scoring=scoring,
                         cv=5, n_jobs=-1, return_train_score=False, return_estimator=True)

In [22]:
show_results(scores_8)

test_precision_weighted:0.77
test_recall_weighted:0.78
test_f1_weighted:0.77
test_precision_micro:0.79
test_recall_micro:0.78
test_f1_micro:0.78


In [23]:
clf_dt_2 = DecisionTreeClassifier(class_weight = inter_safety)
clf_dt_2.fit(x_train,encoded_y_train)
y_pre_9 = clf_dt_2.predict(x_test)
y_pos_9 = clf_dt_2.predict_proba(x_test)
show_test_results(encoded_y_test, y_pre_9, trans_prob(y_pos_9), class_weight)

precision_micro:0.80
recall_micro:0.79
f1_micro:0.80
precision_weighted:0.77
recall_weighted:0.78
f1_weighted:0.77
risk_factor:0.3803


In [163]:
clf_rf_3 = RandomForestClassifier(class_weight = inter_safety)
scores_9 = cross_validate(clf_rf_3, x_total_ngram, encoded_y_total, scoring=scoring,
                         cv=5, n_jobs=-1, return_train_score=False, return_estimator=True)

In [164]:
show_results(scores_9)

test_precision_weighted:0.82
test_recall_weighted:0.62
test_f1_weighted:0.64
test_precision_micro:0.88
test_recall_micro:0.61
test_f1_micro:0.72


In [24]:
clf_rf_3 = RandomForestClassifier(class_weight = inter_safety)
clf_rf_3.fit(x_train_ngram,encoded_y_train)
y_pre_10 = clf_rf_3.predict(x_test_ngram)
y_pos_10 = clf_rf_3.predict_proba(x_test_ngram)
show_test_results(encoded_y_test, y_pre_10, trans_prob(y_pos_10), class_weight)



precision_micro:0.87
recall_micro:0.61
f1_micro:0.72
precision_weighted:0.73
recall_weighted:0.61
f1_weighted:0.62
risk_factor:0.4827


In [25]:
clf_dt_3 = DecisionTreeClassifier(class_weight = inter_safety)
scores_10 = cross_validate(clf_dt_3, x_total_ngram, encoded_y_total, scoring=scoring,
                         cv=5, n_jobs=-1, return_train_score=False, return_estimator=True)

In [26]:
show_results(scores_10)

test_precision_weighted:0.71
test_recall_weighted:0.73
test_f1_weighted:0.71
test_precision_micro:0.74
test_recall_micro:0.72
test_f1_micro:0.73


In [27]:
clf_dt_3 = DecisionTreeClassifier(class_weight = inter_safety)
clf_dt_3.fit(x_train_ngram,encoded_y_train)
y_pre_11 = clf_dt_3.predict(x_test_ngram)
y_pos_11 = clf_dt_3.predict_proba(x_test_ngram)
show_test_results(encoded_y_test, y_pre_11, trans_prob(y_pos_11), class_weight)

precision_micro:0.75
recall_micro:0.73
f1_micro:0.74
precision_weighted:0.71
recall_weighted:0.70
f1_weighted:0.70
risk_factor:0.5455


In [28]:
clf_dt_4 = DecisionTreeClassifier()
clf_dt_4.fit(x_train_ngram,encoded_y_train)
y_pre_12 = clf_dt_4.predict(x_test_ngram)
y_pos_12 = clf_dt_4.predict_proba(x_test_ngram)
y_pre_12, y_pos_12 = filtering(y_pre_12, trans_prob(y_pos_12), .11)
show_test_results(encoded_y_test, y_pre_12, y_pos_12, class_weight)


precision_micro:0.78
recall_micro:0.74
f1_micro:0.76
precision_weighted:0.73
recall_weighted:0.72
f1_weighted:0.72
risk_factor:0.4641


In [29]:
clf_dt_5 = DecisionTreeClassifier()
clf_dt_5.fit(x_train,encoded_y_train)
y_pre_13 = clf_dt_5.predict(x_test)
y_pos_13 = clf_dt_5.predict_proba(x_test)
y_pre_13, y_pos_13 = filtering(y_pre_13, trans_prob(y_pos_13), .11)
show_test_results(encoded_y_test, y_pre_13, y_pos_13, class_weight)

precision_micro:0.84
recall_micro:0.81
f1_micro:0.83
precision_weighted:0.81
recall_weighted:0.80
f1_weighted:0.80
risk_factor:0.2903


In [191]:
clf_rf_4 = RandomForestClassifier()
clf_rf_4.fit(x_train_ngram,encoded_y_train)
y_pre_14 = clf_rf_4.predict(x_test_ngram)
y_pos_14 = clf_rf_4.predict_proba(x_test_ngram)
y_pre_14, y_pos_14 = filtering(y_pre_14, trans_prob(y_pos_14), .11)
show_test_results(encoded_y_test, y_pre_14, y_pos_14, class_weight)

precision_micro:0.88
recall_micro:0.65
f1_micro:0.75
precision_weighted:0.78
recall_weighted:0.65
f1_weighted:0.66
risk_factor:0.4262


In [192]:
clf_rf_5 = RandomForestClassifier()
clf_rf_5.fit(x_train,encoded_y_train)
y_pre_15 = clf_rf_5.predict(x_test)
y_pos_15 = clf_rf_5.predict_proba(x_test)
y_pre_15, y_pos_15 = filtering(y_pre_15, trans_prob(y_pos_15), .11)
show_test_results(encoded_y_test, y_pre_15, y_pos_15, class_weight)

precision_micro:0.90
recall_micro:0.72
f1_micro:0.80
precision_weighted:0.83
recall_weighted:0.72
f1_weighted:0.74
risk_factor:0.3941


In [65]:
# clfs using feature vectors
clf_16 = OneVsRestClassifier(SVC(kernel = 'linear', probability=True))
scores_16 = cross_validate(clf_16, train_vec, encoded_y_total, scoring=scoring,
                         cv=5, n_jobs=-1, return_train_score=False, return_estimator=True)
show_results(scores_16)

test_precision_weighted:0.64
test_recall_weighted:0.64
test_f1_weighted:0.62
test_precision_micro:0.83
test_recall_micro:0.65
test_f1_micro:0.73


In [17]:
clf_16 = OneVsRestClassifier(SVC(kernel = 'linear', probability=True))
clf_16.fit(train_vec, encoded_y_total)
y_pre_16 = clf_16.predict(test_vec)
y_pos_16 = clf_16.predict_proba(test_vec)
show_test_results(encoded_y_test, y_pre_16, y_pos_16, class_weight)

precision_micro:0.81
recall_micro:0.64
f1_micro:0.72
precision_weighted:0.62
recall_weighted:0.63
f1_weighted:0.61
risk_factor:0.4308


In [66]:
clf_rf_6 = RandomForestClassifier()
scores_17 = cross_validate(clf_rf_6, train_vec, encoded_y_total, scoring=scoring,
                         cv=5, n_jobs=-1, return_train_score=False, return_estimator=True)
show_results(scores_17)

test_precision_weighted:0.72
test_recall_weighted:0.66
test_f1_weighted:0.67
test_precision_micro:0.80
test_recall_micro:0.66
test_f1_micro:0.72


In [77]:
clf_rf_6 = RandomForestClassifier()
clf_rf_6.fit(train_vec, encoded_y_total)
y_pre_17 = clf_rf_6.predict(test_vec)
y_pos_17 = clf_rf_6.predict_proba(test_vec)
show_test_results(encoded_y_test, y_pre_17, trans_prob(y_pos_17), class_weight)

precision_micro:0.76
recall_micro:0.60
f1_micro:0.67
precision_weighted:0.65
recall_weighted:0.60
f1_weighted:0.60
risk_factor:0.5956


In [30]:
clf_dt_6 = DecisionTreeClassifier()
scores_18 = cross_validate(clf_dt_6, train_vec, encoded_y_total, scoring=scoring,
                         cv=5, n_jobs=-1, return_train_score=False, return_estimator=True)
show_results(scores_18)

NameError: name 'train_vec' is not defined

In [31]:
clf_dt_6 = DecisionTreeClassifier()
clf_dt_6.fit(train_vec, encoded_y_total)
y_pre_18 = clf_dt_6.predict(test_vec)
y_pos_18 = clf_dt_6.predict_proba(test_vec)
show_test_results(encoded_y_test, y_pre_18, trans_prob(y_pos_18), class_weight)

NameError: name 'train_vec' is not defined

In [91]:
clf_19 = OneVsRestClassifier(SVC(kernel = 'linear', probability=True))
clf_19.fit(x_train, encoded_y_train)
y_pre_19 = clf_19.predict(x_test)
y_pos_19 = clf_19.predict_proba(x_test)
y_pre_19, y_pos_19 = filtering(y_pre_19, y_pos_19, .11)
show_test_results(encoded_y_test, y_pre_19, y_pos_19, class_weight)

precision_micro:0.92
recall_micro:0.88
f1_micro:0.90
precision_weighted:0.88
recall_weighted:0.86
f1_weighted:0.87
risk_factor:0.2294


In [92]:
clf_20 = OneVsRestClassifier(SVC(kernel = 'linear', probability=True))
clf_20.fit(x_train_ngram, encoded_y_train)
y_pre_20 = clf_20.predict(x_test_ngram)
y_pos_20 = clf_20.predict_proba(x_test_ngram)
y_pre_20, y_pos_20 = filtering(y_pre_20, y_pos_20, .11)
show_test_results(encoded_y_test, y_pre_20, y_pos_20, class_weight)

precision_micro:0.89
recall_micro:0.77
f1_micro:0.83
precision_weighted:0.83
recall_weighted:0.77
f1_weighted:0.78
risk_factor:0.3084


In [93]:
vectorizer

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function preprocess at 0x1a509c9b18>,
        smooth_idf=True, stop_words='english', strip_accents=None,
        sublinear_tf=False, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, use_idf=True, vocabulary=None)

In [32]:
with open('unigram_vectorizer.txt', 'w') as uv:
    pickle.dump(vectorizer, uv)

In [33]:
with open('linear_svmclf_unigram.txt', 'w') as ls:
    pickle.dump(clf, ls)

NameError: name 'clf' is not defined

In [34]:
with open('linear_svmclf_feat_vec.txt', 'w') as ls_vec:
    pickle.dump(clf_16, ls_vec)

NameError: name 'clf_16' is not defined