In [1]:
import pandas as pd
import numpy as np
import string
import re

from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
path = './../data/'

In [3]:
data = pd.read_csv(f"{path}AnonymizedClinicalAbbreviationsAndAcronymsDataSet.txt", 
                   encoding='cp1252', 
                   sep="|", 
                   header=None,
                   na_filter=False)
data.columns = ["abbrev", "sense", "represntaion", "start_pos", "end_pos", "section_info", "sample"]

In [4]:
data.head()

Unnamed: 0,abbrev,sense,represntaion,start_pos,end_pos,section_info,sample
0,AB,abortion,AB.,231,233,,_%#NAME#%_ _%#NAME#%_ is a 29-year-old gravida...
1,AB,abortion,AB.,249,251,,She is now bleeding quite heavily. Ultrasound ...
2,AB,abortion,AB,223,224,PAST OB HISTORY,ALLERGIES: Heparin and Imitrex. PAST OB HISTOR...
3,AB,abortion,AB.,194,196,HISTORY OF THE PRESENT ILLNESS,She had a pelvic ultrasound at Park Nicollet o...
4,AB,abortion,AB,114,115,PAST OB-GYN HISTORY,"On _%#MMDD2007#%_, normal anatomy with anterio..."


In [5]:
unique_abbrev = np.unique(data.abbrev)

In [6]:
empty_dict = dict.fromkeys(['abbrev','number_sense'])

In [7]:
abbrev_freq = pd.DataFrame(columns=["abbrev", "number_sense", "sense", "freq", "percentage"])
#abbrev_freq_dict = dict.fromkeys(['abbrev','number_sense'])

count_list = []
for abbrev in unique_abbrev:
    piece = data.loc[data.abbrev == abbrev]
    senses = np.unique(piece.sense)
    count = len(np.unique(piece.sense))
    for sense in senses:
        count_sense = piece.loc[piece.sense == sense].shape[0]
        percentage = count_sense/piece.shape[0]
        new = pd.DataFrame({"abbrev" : [abbrev],
                            "number_sense" : [count],
                            "sense" : [sense],
                            "freq" : [count_sense],
                            "percentage" : [percentage]})
        
        abbrev_freq = pd.concat([abbrev_freq, new])
    #count_list.append(count)
    
#abbrev_freq = pd.DataFrame({"abbrev" : unique_abbrev, "number_sense" : count_list})

In [8]:
np.unique(abbrev_freq.abbrev)

array(['AB', 'AC', 'ALD', 'AMA', 'ASA', 'AV', 'AVR', 'BAL', 'BK', 'BM',
       'BMP', 'C&S', 'C3', 'C4', 'CA', 'CDI', 'CEA', 'CR', 'CTA', 'CVA',
       'CVP', 'CVS', 'DC', 'DIP', 'DM', 'DT', 'EC', 'ER', 'ES', 'ET',
       'FISH', 'FSH', 'GT', 'IA', 'IB', 'IM', 'IR', 'IT', 'ITP', 'IVF',
       'LA', 'LE', 'MOM', 'MP', 'MR', 'MS', 'MSSA', 'NA', 'NAD', 'NP',
       'OP', 'OR', 'OTC', 'PA', 'PAC', 'PCP', 'PD', 'PDA', 'PE', 'PM',
       'PR', 'PT', 'RA', 'RT', 'SA', 'SBP', 'SMA', 'SS', 'T1', 'T2', 'T3',
       'T4', 'US', 'VAD', 'VBG'], dtype=object)

In [9]:
abbrev_freq.loc[abbrev_freq.abbrev == "AMA"]

Unnamed: 0,abbrev,number_sense,sense,freq,percentage
0,AMA,3,advanced maternal age,31,0.062
0,AMA,3,against medical advice,444,0.888
0,AMA,3,antimitochondrial antibody,25,0.05


In [10]:
# feature extraction
# if select AC

In [11]:
def derive_features(abbrev, window_size, replace = True):  # replace = True, replace numbers with zero
    samples = data.loc[data.abbrev == abbrev, ]
    sample_num = 1
    
    original_features = pd.DataFrame(columns=["id", "features", "sense"])
    direction_features = pd.DataFrame(columns=["id", "features", "sense"])
    direction_num_features = pd.DataFrame(columns=["id", "features", "sense"])
    
    for i in range(samples.shape[0]):  # for each data point
        sentence_num = 1
        #target_word = samples.iloc[i, 2]  # should avoid using this, as word tokenization would split AC. to AC ., etc.
        #target_word = abbrev
        
        text = samples.iloc[i, 6]
        id = i+1  # this is the row number of selected abbreviation
        sense = samples.iloc[i, 1]
        
        target_word = samples.iloc[i, 2]
        sentence_length = 0
        start_pos = int(samples.iloc[i, 3])
        end_pos = int(samples.iloc[i, 4])
        
        target_word = text[start_pos : end_pos+1]
        detect = True
        #print(target_word)
        # sentence boundary
        # one sample can have multiple abbreviations in different sentences.
        # before sentence boundary, replace all the numbers with 0 
        #print(text)
        
        
        #if replace:
        #    text = re.sub("\d+", "0", text)
        #    text = re.sub("\d+\.\d+", "0", text)

        sents = sent_tokenize(text)
        exclude = set(string.punctuation)
        #s = ''.join(ch for ch in s if ch not in exclude)
        for sent in sents:  # for each sentence
            #words = [token.lower() for token in word_tokenize(sent)] # word tokenization
            # remove punctuations from the words list
            sentence_length += (len(sent)+1)
            words = [word for word in word_tokenize(sent) if word not in string.punctuation]
            
#             print(id, "____________________")
#             print(sent)
#             print(words)
#             print(target_word)
#             print(sentence_length, end_pos)
            if sentence_length >= end_pos and (abbrev in words or target_word in words) and detect:
                #print(words)
                #print(sent)
                detect = False
                left_features = []
                right_features = []
                
                left_features_direction = []
                right_features_direction = []
                
                left_features_direction_num = []
                right_features_direction_num = []
                
                index = words.index(abbrev)
                # find the targeted word
                # 1. See if the window-size exceeds the front and back limit
                    # If yes, start from the zero-th element, towards right till find the target (features on the left)
                            # start from the max-th element, towards left till find the target (features on the right)
                    # If no, start from the (index - 5)-th element, towards right till find the target
                           # start from the (index - 5)-th element, towards left till find the target
                        
                # extract features on the left
                if index - window_size < 0:
                    j = 0
                    starting = index
                    #while words[j] != target_word:
                    while words[j] != abbrev:
                        words[j] = ''.join(ch for ch in words[j] if ch not in exclude)
                        if replace:
                            words[j] = re.sub("\d+", "0", words[j])
                            words[j] = re.sub("\d+\.\d+", "0", words[j])
                        # remove punctuation from the word, meant to fix problem in word tokenization
                        # but may cause problem, e.g. p.r.n --> prn
                        # may remove later
                        
                        left_features.append(words[j].lower())
                        left_features_direction.append("L-" + words[j].lower())
                        left_features_direction_num.append("L" + str(starting) + "-" + words[j].lower())
                        j += 1
                        starting -= 1
                else:
                    j = index-window_size
                    starting = 0
                    for k in range(window_size):
                        words[j] = ''.join(ch for ch in words[j] if ch not in exclude)
                        if replace:
                            words[j] = re.sub("\d+", "0", words[j])
                            words[j] = re.sub("\d+\.\d+", "0", words[j])
                        left_features.append(words[j].lower())
                        left_features_direction.append("L-" + words[j].lower())
                        left_features_direction_num.append("L" + str(window_size-starting) + "-" + words[j].lower())
                        j += 1
                        starting += 1
                
                # extract feature on the right
                if index + window_size >= len(words):
                    #j = len(words)-1
                    j = index+1
                    starting = 1
                    while j != len(words):
                        words[j] = ''.join(ch for ch in words[j] if ch not in exclude)
                        if replace:
                            words[j] = re.sub("\d+", "0", words[j])
                            words[j] = re.sub("\d+\.\d+", "0", words[j])
                        right_features.append(words[j].lower())
                        right_features_direction.append("R-" + words[j].lower())
                        right_features_direction_num.append("R" + str(starting) + "-" + words[j].lower())
                        j += 1
                        starting += 1
                else: 
                    j = index+1
                    starting = 1
                    for k in range(window_size):
                        words[j] = ''.join(ch for ch in words[j] if ch not in exclude)
                        if replace:
                            words[j] = re.sub("\d+", "0", words[j])
                            words[j] = re.sub("\d+\.\d+", "0", words[j])
                        right_features.append(words[j].lower())
                        right_features_direction.append("R-" + words[j].lower())
                        right_features_direction_num.append("R" + str(starting) + "-" + words[j].lower())
                        j += 1
                        starting += 1
                   
                left_features_str = " ".join(left_features)
                right_features_str = " ".join(right_features)
                features = left_features_str + " " + right_features_str
                
#                 print(features)
                
                left_features_direction_str = " ".join(left_features_direction)
                right_features_direction_str = " ".join(right_features_direction)
                features_direction = left_features_direction_str + " " + right_features_direction_str
                
                left_features_direction_num_str = " ".join(left_features_direction_num)
                right_features_direction_num_str = " ".join(right_features_direction_num)
                features_direction_num = left_features_direction_num_str + " " + right_features_direction_num_str
                #print(features_direction_num)
                
#                 if replace:
#                     features = re.sub("\d+", "0", features)
#                     features = re.sub("\d+\.\d+", "0", features)
                    
#                     features_direction = re.sub("\d+", "0", features_direction)
#                     features_direction = re.sub("\d+\.\d+", "0", features_direction)
                    
#                     features_direction_num = features_direction_num+re.sub("\d+", "0", features_direction_num)
#                     features_direction_num = features_direction_num+re.sub("\d+\.\d+", "0", features_direction_num)
#                     print(features_direction_num)
                original_features = pd.concat([original_features, pd.DataFrame({"id" : [id],
                                                                               "features" : [features], 
                                                                               "sense" : [sense]})])
                
                direction_features = pd.concat([direction_features, pd.DataFrame({"id" : [id],
                                                                               "features" : [features_direction], 
                                                                               "sense" : [sense]})])
                
                direction_num_features = pd.concat([direction_num_features, pd.DataFrame({"id" : [id],
                                                                               "features" : [features_direction_num], 
                                                                               "sense" : [sense]})])
            sentence_num += 1
        sample_num += 1
    return([original_features, direction_features, direction_num_features])
                

In [12]:
ar1, br1, cr1 = derive_features('AMA', 5, replace=True)

# ONE-HOT

In [13]:
def one_hot_features(abbrev, window_size, replace = True):
    ar1, br1, cr1 = derive_features(abbrev, window_size, replace = True)
    one_hot_vector=[]
    for k in cr1['features'].values:
        features_direction_num= [item for item in k.split(' ') if item !='']
        one_hot_vector=one_hot_vector+features_direction_num
    for k in ar1['features'].values:
        features= [item for item in k.split(' ') if item !='']
        one_hot_vector=one_hot_vector+features
    for k in br1['features'].values:
        features_num= [item for item in k.split(' ') if item !='']
        one_hot_vector=one_hot_vector+features_num
    one_hot_vector=sorted(list(set(one_hot_vector)))
    char_to_int = dict((c, i) for i, c in enumerate(one_hot_vector))
    
    onehot_encoded = list()
    for k in cr1['features'].values:
        features_direction_num= [item for item in k.split(' ') if item !='']
        integer_encoded = [char_to_int[char] for char in features_direction_num]
        letter = [0 for _ in range(len(char_to_int))]
        for value in integer_encoded:
            letter[value] = 1
        onehot_encoded.append(letter)
    for k,v in enumerate(ar1['features'].values):
        features= [item for item in v.split(' ') if item !='']
        integer_encoded = [char_to_int[char] for char in features]
        for value in integer_encoded:
            onehot_encoded[k][value] = 1
    for k,v in enumerate(br1['features'].values):
        features_num= [item for item in v.split(' ') if item !='']
        integer_encoded = [char_to_int[char] for char in features_num]
        for value in integer_encoded:
            onehot_encoded[k][value] = 1
    onehot_encoded=np.array(onehot_encoded)
    return onehot_encoded,one_hot_vector

In [14]:
# one hot features for single features
def one_hot_features_single(abbrev, window_size, feature_no = 1, replace = True):
    # feature_no = 1: original features
    # feature_no = 2: directional features
    # feature_no = 3: directional features with number
    
    ar1, br1, cr1 = derive_features(abbrev, window_size, replace = True)
    one_hot_vector=[]

    if feature_no == 1:
        feature_used = ar1.copy()
    elif feature_no == 2:
        feature_used = br1.copy()
    elif feature_no == 3:
        feature_used = cr1.copy()
    else:
        print("Error input: [feature_no]!")
    
    for k in feature_used["features"].values:
        features= [item for item in k.split(' ') if item !='']
        one_hot_vector=one_hot_vector+features
        
    one_hot_vector=sorted(list(set(one_hot_vector)))
    char_to_int = dict((c, i) for i, c in enumerate(one_hot_vector))
    
    onehot_encoded = list()
    for k in feature_used["features"].values:
        features_direction_num= [item for item in k.split(' ') if item !='']
        integer_encoded = [char_to_int[char] for char in features_direction_num]
        letter = [0 for _ in range(len(char_to_int))]
        for value in integer_encoded:
            letter[value] = 1
        onehot_encoded.append(letter)
    
    onehot_encoded=np.array(onehot_encoded)
    
    return onehot_encoded,one_hot_vector

In [15]:
# one hot features for two features(one feature,two feature and three feature can use same function, not change that as others have finished)
def one_hot_features_double(abbrev, window_size, feature_no = 1,feature_no_2 = 2, replace = True):
    # feature_no = 1: original features
    # feature_no = 2: directional features
    # feature_no = 3: directional features with number
    
    ar1, br1, cr1 = derive_features(abbrev, window_size, replace = True)
    one_hot_vector=[]

    if feature_no == 1:
        feature_used = ar1.copy()
    elif feature_no == 2:
        feature_used = br1.copy()
    elif feature_no == 3:
        feature_used = cr1.copy()
    else:
        print("Error input: [feature_no]!")
    if feature_no_2 == 1:
        feature_used_2 = ar1.copy()
    elif feature_no_2 == 2:
        feature_used_2 = br1.copy()
    elif feature_no_2 == 3:
        feature_used_2 = cr1.copy()
    else:
        print("Error input: [feature_no_2]!")   
    for k in feature_used["features"].values:
        features= [item for item in k.split(' ') if item !='']
        one_hot_vector=one_hot_vector+features
    for k in feature_used_2["features"].values:
        features= [item for item in k.split(' ') if item !='']
        one_hot_vector=one_hot_vector+features    
    one_hot_vector=sorted(list(set(one_hot_vector)))
    char_to_int = dict((c, i) for i, c in enumerate(one_hot_vector))
    
    onehot_encoded = list()
    for k in feature_used["features"].values:
        features_direction_num= [item for item in k.split(' ') if item !='']
        integer_encoded = [char_to_int[char] for char in features_direction_num]
        letter = [0 for _ in range(len(char_to_int))]
        for value in integer_encoded:
            letter[value] = 1
        onehot_encoded.append(letter)
    for k,v in enumerate(feature_used_2['features'].values):
        features= [item for item in v.split(' ') if item !='']
        integer_encoded = [char_to_int[char] for char in features]
        for value in integer_encoded:
            onehot_encoded[k][value] = 1
    onehot_encoded=np.array(onehot_encoded)
    
    return onehot_encoded,one_hot_vector

In [16]:
features = {}
names = {}

for i in range(1,6):
    features[i],names[i]=one_hot_features('AMA', i, replace=True)#not use ald as they have number of sense=1


In [17]:
len(names[1])

505

In [18]:
# fea_5,name_5=one_hot_features('AMA', 5, replace=True)#not use ald as they have number of sense=1
# fea_4,name_4=one_hot_features('AMA', 4, replace=True)
# fea_3,name_3=one_hot_features('AMA', 3, replace=True)
# fea_2,name_2=one_hot_features('AMA', 2, replace=True)
# fea_1,name_1=one_hot_features('AMA', 1, replace=True)

# machine learning to decide window size

In [19]:
from sklearn.model_selection import train_test_split

y=cr1['sense'].values

train_X = {} 
test_X = {}
train_y = {}
test_y = {}

for i in range(1,6):
    train_X[i],test_X[i],train_y[i],test_y[i] = train_test_split(features[i], y, test_size=0.2, random_state=13, shuffle=True, stratify=y)
    

In [20]:
# train_X_5, test_X_5, train_y, test_y = train_test_split(fea_5, y, test_size=0.2, random_state=13, shuffle=True, stratify=y)
# train_X_4, test_X_4, train_y, test_y = train_test_split(fea_4, y, test_size=0.2, random_state=13, shuffle=True, stratify=y)
# train_X_3, test_X_3, train_y, test_y = train_test_split(fea_3, y, test_size=0.2, random_state=13, shuffle=True, stratify=y)
# train_X_2, test_X_2, train_y, test_y = train_test_split(fea_2, y, test_size=0.2, random_state=13, shuffle=True, stratify=y)
# train_X_1, test_X_1, train_y, test_y = train_test_split(fea_1, y, test_size=0.2, random_state=13, shuffle=True, stratify=y)

In [21]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

def five_fold_CV(clf, params, dx, dy):
    cv_model = RandomizedSearchCV(clf, params, scoring='f1_weighted', n_jobs=-1, 
                                  cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=13), 
                                  verbose=1, n_iter=50, refit=True)
    
    cv_model.fit(dx, dy)
    
    return cv_model.best_estimator_

In [22]:
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

def logistic_regression_pred(X_train, Y_train, X_test):
    lr = LogisticRegression()
    tuned_parameters = {'max_iter': [10, 50, 100, 200, 500, 750, 1000], 
                    'tol': [0.0001, 0.001, 0.01, 0.1],
                    'C': [0.01, 0.1, 1.0, 5.0, 10.0, 25.0, 50.0, 100.0],
                    'solver': ['lbfgs', 'liblinear', 'newton-cg'], 
                    'class_weight': [None, 'balanced']}
    
    best_lr_model = five_fold_CV(lr, tuned_parameters, X_train, Y_train)
    lr_y_test = best_lr_model.predict(X_test)
    return lr_y_test

def svm_pred(X_train, Y_train, X_test):
    svm = SVC()
    tuned_parameters = {"C": [0.001, 0.01, 0.1, 2, 8, 32, 64, 128, 512, 1024, 2048],
                    'gamma':['scale', 'auto'],
                    'probability':[True], 
                    'tol': [0.1, 0.01, 0.001, 0.0001]}
    best_svm_model = five_fold_CV(svm, tuned_parameters, X_train, Y_train)
    svm_y_test = best_svm_model.predict(X_test)
    return svm_y_test

def knn_pred(X_train, Y_train, X_test):
    neigh = KNeighborsClassifier()
    tuned_parameters = dict(n_neighbors=range(1, 30), weights=['uniform', 'distance'])
    best_knn_model = five_fold_CV(neigh, tuned_parameters, X_train, Y_train)
    neigh_y_pred = best_knn_model.predict(X_test)
    return neigh_y_pred

def randomforest_pred(X_train, Y_train, X_test):
    rf = RandomForestClassifier()
    
    tuned_parameters = {'n_estimators':[50, 100, 250,  500, 750, 1000, 1250, 1500, 2000], 
                    'criterion':['gini', 'entropy'], 
                    'max_features':['log2', 'auto', None], 
                    'min_samples_split':[2, 3, 4], 
                    'max_depth': [3, 6, 9, 12, 15, 18, 21, 24, 32, None], 
                    'min_samples_leaf':[1, 2], 
                    'max_leaf_nodes': [None, 5, 10],
                    'min_impurity_decrease':[0.1, 0.01, 0.001, 0.0001, 0.00001],
                    'bootstrap': [True, False],
                    'class_weight': [None, 'balanced', 'balanced_subsample']}
    best_rf_model = five_fold_CV(rf, tuned_parameters, X_train, Y_train)

    forest_y_pred = best_rf_model.predict(X_test)
    return forest_y_pred 

def classification_metrics(Y_pred, Y_true):
    #TODO: Calculate the above mentioned metrics
    #NOTE: It is important to provide the output in the same order
    precision = precision_score(Y_pred, Y_true, average='micro')
    recall = recall_score(Y_pred, Y_true, average='micro')
    f1score = f1_score(Y_pred, Y_true, average='micro')
    report = classification_report(Y_pred, Y_true)
    return precision, recall, f1score, report

def display_metrics(classifierName, Y_pred, Y_true):
    print("______________________________________________")
    print(("Classifier: "+classifierName))
    precision, recall, f1score, report = classification_metrics(Y_pred,Y_true)
    print(("Precision: "+str(precision)))
    print(("Recall: "+str(recall)))
    print(("F1-score: "+str(f1score)))
    print("______________________________________________")
    print(report)
    print("")

## For combined one-hot features: 3 in 1

In [23]:
# LR - window size (1,5)
for i in range(1,6):
    print('***********************************************************')
    print('Window Size:', i)
    display_metrics("Logistic Regression",logistic_regression_pred(train_X[i],train_y[i],test_X[i]),test_y[i])

***********************************************************
Window Size: 1
Fitting 5 folds for each of 50 candidates, totalling 250 fits
______________________________________________
Classifier: Logistic Regression
Precision: 0.97
Recall: 0.97
F1-score: 0.97
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       1.00      0.86      0.92         7
    against medical advice       0.98      0.99      0.98        88
antimitochondrial antibody       0.80      0.80      0.80         5

                  accuracy                           0.97       100
                 macro avg       0.93      0.88      0.90       100
              weighted avg       0.97      0.97      0.97       100


***********************************************************
Window Size: 2
Fitting 5 folds for each of 50 candidates, totalling 250 fits
______________________________________________
Classifier: Logistic Regressi

In [24]:
for i in range(1,6):
    display_metrics("SVM",svm_pred(train_X[i],train_y[i],test_X[i]),test_y[i])

Fitting 5 folds for each of 50 candidates, totalling 250 fits
______________________________________________
Classifier: SVM
Precision: 0.91
Recall: 0.91
F1-score: 0.91
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.00      0.00      0.00         0
    against medical advice       0.99      0.92      0.95        96
antimitochondrial antibody       0.60      0.75      0.67         4

                  accuracy                           0.91       100
                 macro avg       0.53      0.56      0.54       100
              weighted avg       0.97      0.91      0.94       100


Fitting 5 folds for each of 50 candidates, totalling 250 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


______________________________________________
Classifier: SVM
Precision: 0.97
Recall: 0.97
F1-score: 0.97
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.67      0.80      0.73         5
    against medical advice       1.00      0.98      0.99        91
antimitochondrial antibody       0.80      1.00      0.89         4

                  accuracy                           0.97       100
                 macro avg       0.82      0.93      0.87       100
              weighted avg       0.98      0.97      0.97       100


Fitting 5 folds for each of 50 candidates, totalling 250 fits
______________________________________________
Classifier: SVM
Precision: 0.96
Recall: 0.96
F1-score: 0.96
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.67      0.67      0.67         6
    against m

In [25]:
for i in range(1,6):
    display_metrics("K Nearest Neighbor",knn_pred(train_X[i],train_y[i],test_X[i]),test_y[i])

Fitting 5 folds for each of 50 candidates, totalling 250 fits
______________________________________________
Classifier: K Nearest Neighbor
Precision: 0.91
Recall: 0.91
F1-score: 0.91
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.50      0.38      0.43         8
    against medical advice       0.99      0.96      0.97        92
antimitochondrial antibody       0.00      0.00      0.00         0

                  accuracy                           0.91       100
                 macro avg       0.50      0.44      0.47       100
              weighted avg       0.95      0.91      0.93       100


Fitting 5 folds for each of 50 candidates, totalling 250 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


______________________________________________
Classifier: K Nearest Neighbor
Precision: 0.9
Recall: 0.9
F1-score: 0.9
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.33      0.25      0.29         8
    against medical advice       0.99      0.96      0.97        92
antimitochondrial antibody       0.00      0.00      0.00         0

                  accuracy                           0.90       100
                 macro avg       0.44      0.40      0.42       100
              weighted avg       0.94      0.90      0.92       100


Fitting 5 folds for each of 50 candidates, totalling 250 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


______________________________________________
Classifier: K Nearest Neighbor
Precision: 0.68
Recall: 0.68
F1-score: 0.68
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.83      0.14      0.24        36
    against medical advice       0.71      0.98      0.82        64
antimitochondrial antibody       0.00      0.00      0.00         0

                  accuracy                           0.68       100
                 macro avg       0.51      0.37      0.35       100
              weighted avg       0.75      0.68      0.61       100


Fitting 5 folds for each of 50 candidates, totalling 250 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


______________________________________________
Classifier: K Nearest Neighbor
Precision: 0.84
Recall: 0.84
F1-score: 0.8399999999999999
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       1.00      0.27      0.43        22
    against medical advice       0.88      1.00      0.93        78
antimitochondrial antibody       0.00      0.00      0.00         0

                  accuracy                           0.84       100
                 macro avg       0.63      0.42      0.45       100
              weighted avg       0.90      0.84      0.82       100


Fitting 5 folds for each of 50 candidates, totalling 250 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


______________________________________________
Classifier: K Nearest Neighbor
Precision: 0.83
Recall: 0.83
F1-score: 0.83
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.83      0.24      0.37        21
    against medical advice       0.88      0.99      0.93        79
antimitochondrial antibody       0.00      0.00      0.00         0

                  accuracy                           0.83       100
                 macro avg       0.57      0.41      0.43       100
              weighted avg       0.87      0.83      0.81       100




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
# for i in range(1,6):
#     display_metrics("RF",randomforest_pred(train_X[i],train_y[i],test_X[i]),test_y[1])

# For original features

In [27]:
features = {}
names = {}

for i in range(1,6):
    features[i],names[i]=one_hot_features_single('AMA', i, feature_no=1, replace=True)#not use ald as they have number of sense=1


In [28]:
from sklearn.model_selection import train_test_split

y=cr1['sense'].values

train_X = {} 
test_X = {}
train_y = {}
test_y = {}

for i in range(1,6):
    train_X[i],test_X[i],train_y[i],test_y[i] = train_test_split(features[i], y, test_size=0.2, random_state=13, shuffle=True, stratify=y)
    

In [29]:
# LR - window size (1,5)
for i in range(1,6):
    print('Window Size:', i)
    display_metrics("Logistic Regression",logistic_regression_pred(train_X[i],train_y[i],test_X[i]),test_y[i])

print('***********************************************************')
for i in range(1,6):
    print('Window Size:', i)
    display_metrics("SVM",svm_pred(train_X[i],train_y[i],test_X[i]),test_y[i])

print('***********************************************************')
for i in range(1,6):
    print('Window Size:', i)
    display_metrics("K Nearest Neighbor",knn_pred(train_X[i],train_y[i],test_X[i]),test_y[i])

Window Size: 1
Fitting 5 folds for each of 50 candidates, totalling 250 fits
______________________________________________
Classifier: Logistic Regression
Precision: 0.96
Recall: 0.96
F1-score: 0.96
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.67      0.80      0.73         5
    against medical advice       0.99      0.98      0.98        90
antimitochondrial antibody       0.80      0.80      0.80         5

                  accuracy                           0.96       100
                 macro avg       0.82      0.86      0.84       100
              weighted avg       0.96      0.96      0.96       100


Window Size: 2
Fitting 5 folds for each of 50 candidates, totalling 250 fits




______________________________________________
Classifier: Logistic Regression
Precision: 0.97
Recall: 0.97
F1-score: 0.97
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.67      1.00      0.80         4
    against medical advice       1.00      0.97      0.98        92
antimitochondrial antibody       0.80      1.00      0.89         4

                  accuracy                           0.97       100
                 macro avg       0.82      0.99      0.89       100
              weighted avg       0.98      0.97      0.97       100


Window Size: 3
Fitting 5 folds for each of 50 candidates, totalling 250 fits
______________________________________________
Classifier: Logistic Regression
Precision: 0.96
Recall: 0.96
F1-score: 0.96
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


______________________________________________
Classifier: SVM
Precision: 0.96
Recall: 0.96
F1-score: 0.96
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.67      0.67      0.67         6
    against medical advice       1.00      0.98      0.99        91
antimitochondrial antibody       0.60      1.00      0.75         3

                  accuracy                           0.96       100
                 macro avg       0.76      0.88      0.80       100
              weighted avg       0.97      0.96      0.96       100


Window Size: 3
Fitting 5 folds for each of 50 candidates, totalling 250 fits
______________________________________________
Classifier: SVM
Precision: 0.96
Recall: 0.96
F1-score: 0.96
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.67      0.67      0.67         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


______________________________________________
Classifier: K Nearest Neighbor
Precision: 0.91
Recall: 0.91
F1-score: 0.91
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.67      0.36      0.47        11
    against medical advice       0.98      0.98      0.98        89
antimitochondrial antibody       0.00      0.00      0.00         0

                  accuracy                           0.91       100
                 macro avg       0.55      0.45      0.48       100
              weighted avg       0.94      0.91      0.92       100


Window Size: 4
Fitting 5 folds for each of 50 candidates, totalling 250 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


______________________________________________
Classifier: K Nearest Neighbor
Precision: 0.86
Recall: 0.86
F1-score: 0.8599999999999999
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.83      0.28      0.42        18
    against medical advice       0.91      0.99      0.95        82
antimitochondrial antibody       0.00      0.00      0.00         0

                  accuracy                           0.86       100
                 macro avg       0.58      0.42      0.45       100
              weighted avg       0.90      0.86      0.85       100


Window Size: 5
Fitting 5 folds for each of 50 candidates, totalling 250 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


______________________________________________
Classifier: K Nearest Neighbor
Precision: 0.63
Recall: 0.63
F1-score: 0.63
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       1.00      0.14      0.24        43
    against medical advice       0.64      1.00      0.78        57
antimitochondrial antibody       0.00      0.00      0.00         0

                  accuracy                           0.63       100
                 macro avg       0.55      0.38      0.34       100
              weighted avg       0.80      0.63      0.55       100




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# For direction features

In [30]:
features = {}
names = {}

for i in range(1,6):
    features[i],names[i]=one_hot_features_single('AMA', i, feature_no=2, replace=True)#not use ald as they have number of sense=1

# split train and test

y=cr1['sense'].values

train_X = {} 
test_X = {}
train_y = {}
test_y = {}

for i in range(1,6):
    train_X[i],test_X[i],train_y[i],test_y[i] = train_test_split(features[i], y, test_size=0.2, random_state=13, shuffle=True, stratify=y)
    
# Train model
for i in range(1,6):
    print('Window Size:', i)
    display_metrics("Logistic Regression",logistic_regression_pred(train_X[i],train_y[i],test_X[i]),test_y[i])

print('***********************************************************')
for i in range(1,6):
    print('Window Size:', i)
    display_metrics("SVM",svm_pred(train_X[i],train_y[i],test_X[i]),test_y[i])

print('***********************************************************')
for i in range(1,6):
    print('Window Size:', i)
    display_metrics("K Nearest Neighbor",knn_pred(train_X[i],train_y[i],test_X[i]),test_y[i])

Window Size: 1
Fitting 5 folds for each of 50 candidates, totalling 250 fits
______________________________________________
Classifier: Logistic Regression
Precision: 0.98
Recall: 0.98
F1-score: 0.98
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       1.00      0.86      0.92         7
    against medical advice       0.99      0.99      0.99        89
antimitochondrial antibody       0.80      1.00      0.89         4

                  accuracy                           0.98       100
                 macro avg       0.93      0.95      0.93       100
              weighted avg       0.98      0.98      0.98       100


Window Size: 2
Fitting 5 folds for each of 50 candidates, totalling 250 fits
______________________________________________
Classifier: Logistic Regression
Precision: 0.96
Recall: 0.96
F1-score: 0.96
______________________________________________
                          

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


______________________________________________
Classifier: SVM
Precision: 0.97
Recall: 0.97
F1-score: 0.97
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.67      0.80      0.73         5
    against medical advice       1.00      0.98      0.99        91
antimitochondrial antibody       0.80      1.00      0.89         4

                  accuracy                           0.97       100
                 macro avg       0.82      0.93      0.87       100
              weighted avg       0.98      0.97      0.97       100


Window Size: 3
Fitting 5 folds for each of 50 candidates, totalling 250 fits
______________________________________________
Classifier: SVM
Precision: 0.94
Recall: 0.94
F1-score: 0.94
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.67      0.57      0.62         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


______________________________________________
Classifier: K Nearest Neighbor
Precision: 0.9
Recall: 0.9
F1-score: 0.9
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.33      0.33      0.33         6
    against medical advice       0.99      0.94      0.96        94
antimitochondrial antibody       0.00      0.00      0.00         0

                  accuracy                           0.90       100
                 macro avg       0.44      0.42      0.43       100
              weighted avg       0.95      0.90      0.92       100


Window Size: 3
Fitting 5 folds for each of 50 candidates, totalling 250 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


______________________________________________
Classifier: K Nearest Neighbor
Precision: 0.75
Recall: 0.75
F1-score: 0.75
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.83      0.17      0.29        29
    against medical advice       0.79      0.99      0.87        71
antimitochondrial antibody       0.00      0.00      0.00         0

                  accuracy                           0.75       100
                 macro avg       0.54      0.39      0.39       100
              weighted avg       0.80      0.75      0.70       100


Window Size: 4
Fitting 5 folds for each of 50 candidates, totalling 250 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


______________________________________________
Classifier: K Nearest Neighbor
Precision: 0.64
Recall: 0.64
F1-score: 0.64
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       1.00      0.14      0.25        42
    against medical advice       0.65      1.00      0.79        58
antimitochondrial antibody       0.00      0.00      0.00         0

                  accuracy                           0.64       100
                 macro avg       0.55      0.38      0.35       100
              weighted avg       0.80      0.64      0.56       100


Window Size: 5
Fitting 5 folds for each of 50 candidates, totalling 250 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


______________________________________________
Classifier: K Nearest Neighbor
Precision: 0.48
Recall: 0.48
F1-score: 0.48
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       1.00      0.10      0.19        58
    against medical advice       0.47      1.00      0.64        42
antimitochondrial antibody       0.00      0.00      0.00         0

                  accuracy                           0.48       100
                 macro avg       0.49      0.37      0.28       100
              weighted avg       0.78      0.48      0.38       100




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# For direction features with number

In [31]:
features = {}
names = {}

for i in range(1,6):
    features[i],names[i]=one_hot_features_single('AMA', i, feature_no=3, replace=True)#not use ald as they have number of sense=1

# split train and test

y=cr1['sense'].values

train_X = {} 
test_X = {}
train_y = {}
test_y = {}

for i in range(1,6):
    train_X[i],test_X[i],train_y[i],test_y[i] = train_test_split(features[i], y, test_size=0.2, random_state=13, shuffle=True, stratify=y)
    
# Train model
for i in range(1,6):
    print('Window Size:', i)
    display_metrics("Logistic Regression",logistic_regression_pred(train_X[i],train_y[i],test_X[i]),test_y[i])

print('***********************************************************')
for i in range(1,6):
    print('Window Size:', i)
    display_metrics("SVM",svm_pred(train_X[i],train_y[i],test_X[i]),test_y[i])

print('***********************************************************')
for i in range(1,6):
    print('Window Size:', i)
    display_metrics("K Nearest Neighbor",knn_pred(train_X[i],train_y[i],test_X[i]),test_y[i])

Window Size: 1
Fitting 5 folds for each of 50 candidates, totalling 250 fits
______________________________________________
Classifier: Logistic Regression
Precision: 0.98
Recall: 0.98
F1-score: 0.98
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       1.00      0.86      0.92         7
    against medical advice       0.99      0.99      0.99        89
antimitochondrial antibody       0.80      1.00      0.89         4

                  accuracy                           0.98       100
                 macro avg       0.93      0.95      0.93       100
              weighted avg       0.98      0.98      0.98       100


Window Size: 2
Fitting 5 folds for each of 50 candidates, totalling 250 fits
______________________________________________
Classifier: Logistic Regression
Precision: 0.96
Recall: 0.96
F1-score: 0.96
______________________________________________
                          

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


______________________________________________
Classifier: SVM
Precision: 0.95
Recall: 0.95
F1-score: 0.9500000000000001
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.67      0.67      0.67         6
    against medical advice       0.99      0.97      0.98        91
antimitochondrial antibody       0.60      1.00      0.75         3

                  accuracy                           0.95       100
                 macro avg       0.75      0.88      0.80       100
              weighted avg       0.96      0.95      0.95       100


Window Size: 3
Fitting 5 folds for each of 50 candidates, totalling 250 fits
______________________________________________
Classifier: SVM
Precision: 0.95
Recall: 0.95
F1-score: 0.9500000000000001
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.67 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


______________________________________________
Classifier: K Nearest Neighbor
Precision: 0.88
Recall: 0.88
F1-score: 0.88
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.67      0.29      0.40        14
    against medical advice       0.94      0.98      0.96        86
antimitochondrial antibody       0.00      0.00      0.00         0

                  accuracy                           0.88       100
                 macro avg       0.54      0.42      0.45       100
              weighted avg       0.91      0.88      0.88       100


Window Size: 3
Fitting 5 folds for each of 50 candidates, totalling 250 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


______________________________________________
Classifier: K Nearest Neighbor
Precision: 0.66
Recall: 0.66
F1-score: 0.66
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       1.00      0.15      0.26        40
    against medical advice       0.67      1.00      0.81        60
antimitochondrial antibody       0.00      0.00      0.00         0

                  accuracy                           0.66       100
                 macro avg       0.56      0.38      0.36       100
              weighted avg       0.80      0.66      0.59       100


Window Size: 4
Fitting 5 folds for each of 50 candidates, totalling 250 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


______________________________________________
Classifier: K Nearest Neighbor
Precision: 0.79
Recall: 0.79
F1-score: 0.79
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       1.00      0.22      0.36        27
    against medical advice       0.82      1.00      0.90        73
antimitochondrial antibody       0.00      0.00      0.00         0

                  accuracy                           0.79       100
                 macro avg       0.61      0.41      0.42       100
              weighted avg       0.87      0.79      0.76       100


Window Size: 5
Fitting 5 folds for each of 50 candidates, totalling 250 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


______________________________________________
Classifier: K Nearest Neighbor
Precision: 0.85
Recall: 0.85
F1-score: 0.85
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.33      0.20      0.25        10
    against medical advice       0.93      0.92      0.93        90
antimitochondrial antibody       0.00      0.00      0.00         0

                  accuracy                           0.85       100
                 macro avg       0.42      0.37      0.39       100
              weighted avg       0.87      0.85      0.86       100




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# For combined feature model(already decide windowsize=2)

In [32]:
features = {}
names = {}
num_used=[[1,2],[1,3],[2,3]]
features_used_double=[0,'original+direction','original+dir_num','direction+dir_num']
for i in range(1,4):#use 2 as window size
    features[i],names[i]=one_hot_features_double('AMA', 2, feature_no=num_used[i-1][0],feature_no_2=num_used[i-1][1], replace=True)#not use ald as they have number of sense=1

# split train and test

y=cr1['sense'].values

train_X = {} 
test_X = {}
train_y = {}
test_y = {}

for i in range(1,4):
    train_X[i],test_X[i],train_y[i],test_y[i] = train_test_split(features[i], y, test_size=0.2, random_state=13, shuffle=True, stratify=y)
    
# Train model
for i in range(1,4):
    print('Features used:', features_used_double[i])
    display_metrics("Logistic Regression",logistic_regression_pred(train_X[i],train_y[i],test_X[i]),test_y[i])

print('***********************************************************')
for i in range(1,4):
    print('Features used:', features_used_double[i])
    display_metrics("SVM",svm_pred(train_X[i],train_y[i],test_X[i]),test_y[i])

print('***********************************************************')
for i in range(1,4):
    print('Features used:', features_used_double[i])
    display_metrics("K Nearest Neighbor",knn_pred(train_X[i],train_y[i],test_X[i]),test_y[i])

Features used: original+direction
Fitting 5 folds for each of 50 candidates, totalling 250 fits
______________________________________________
Classifier: Logistic Regression
Precision: 0.97
Recall: 0.97
F1-score: 0.97
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.67      1.00      0.80         4
    against medical advice       1.00      0.97      0.98        92
antimitochondrial antibody       0.80      1.00      0.89         4

                  accuracy                           0.97       100
                 macro avg       0.82      0.99      0.89       100
              weighted avg       0.98      0.97      0.97       100


Features used: original+dir_num
Fitting 5 folds for each of 50 candidates, totalling 250 fits
______________________________________________
Classifier: Logistic Regression
Precision: 0.97
Recall: 0.97
F1-score: 0.97
_____________________________________

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


______________________________________________
Classifier: K Nearest Neighbor
Precision: 0.9
Recall: 0.9
F1-score: 0.9
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.33      0.29      0.31         7
    against medical advice       0.99      0.95      0.97        93
antimitochondrial antibody       0.00      0.00      0.00         0

                  accuracy                           0.90       100
                 macro avg       0.44      0.41      0.42       100
              weighted avg       0.94      0.90      0.92       100


Features used: direction+dir_num
Fitting 5 folds for each of 50 candidates, totalling 250 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


______________________________________________
Classifier: K Nearest Neighbor
Precision: 0.83
Recall: 0.83
F1-score: 0.83
______________________________________________
                            precision    recall  f1-score   support

     advanced maternal age       0.67      0.21      0.32        19
    against medical advice       0.89      0.98      0.93        81
antimitochondrial antibody       0.00      0.00      0.00         0

                  accuracy                           0.83       100
                 macro avg       0.52      0.40      0.42       100
              weighted avg       0.85      0.83      0.81       100




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# error analysis in LR with 3 kinds features

In [33]:

features,names=one_hot_features('AMA', 2, replace=True)
train_X,test_X,train_y,test_y = train_test_split(features, y, test_size=0.2, random_state=13, shuffle=True, stratify=y)
y_pred=logistic_regression_pred(train_X, train_y, test_X)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [34]:
X_error_an=cr1['features'].values

In [35]:
train_X_err,test_X_err,train_y,test_y = train_test_split(X_error_an, y, test_size=0.2, random_state=13, shuffle=True, stratify=y)


In [36]:
for k,v in enumerate(y_pred):
    if v!=test_y[k]:
        print('test_x')
        print(test_X_err[k])
        print('ture value:')
        print(test_y[k])
        print('pred value:')
        print(y_pred[k])
        

test_x
L5-sclerosing L4-cholangitis L3-but L2-if L1-an R1-has R2-not R3-been R4-done R5-this
ture value:
antimitochondrial antibody
pred value:
against medical advice
test_x
L5-weeks L4-ega L3-secondary L2-to L1-postdates R1-and R2-gestational R3-diabetes R4-diet R5-controlled
ture value:
advanced maternal age
pred value:
against medical advice
test_x
L2-she L1-is 
ture value:
advanced maternal age
pred value:
against medical advice


In [37]:
abbrev_freq.loc[abbrev_freq.abbrev == "AMA"]

Unnamed: 0,abbrev,number_sense,sense,freq,percentage
0,AMA,3,advanced maternal age,31,0.062
0,AMA,3,against medical advice,444,0.888
0,AMA,3,antimitochondrial antibody,25,0.05
