# Installing needed libraries

In [8]:
!pip install nlpre



In [None]:
!pip install flashtext

In [None]:
!pip install unidecode

In [None]:
!pip install pyparsing

In [None]:
!pip install pattern

In [None]:
!pip install scrubadub

# Reading  all files from the frolder

In [18]:
from os.path import join
import numpy as np
import pandas as pd
from os import listdir
from os.path import isfile

In [19]:
path='train/'

files = [f for f in listdir(path) if isfile(join(path, f))]

len(files)

202

In [20]:
for line in open(join(path, files[0])).readlines(): print(line.strip())

<?xml version="1.0" encoding="UTF-8" ?>
<PatientMatching>
<TEXT><![CDATA[

Record date: 2106-02-12

Campbell Orthopedic Associates
4 Madera Circle
Omak, GA 28172

Habib Valenzuela, M.D.


Valdez, Harlan Jr.
845-41-54-4
February 12, 2106
Har is a 43 year old 6' 214 pound gentleman who is referred for
consultation by Dr. Harlan Oneil.  About a week ago he slipped on
the driveway at home and sustained an injury to his left ankle.
He was seen at Tri-City Hospital and was told he had a
fracture.  He was placed in an air splint and advised to be
partial weight bearing, and he is using a cane.  He is here for
routine follow-up.
Past medical history is notable for no ankle injuries previously.
He has a history of diabetes and sleep apnea.  He takes Prozac,
Cardizem, Glucophage and Amaryl.  He is also followed by Dr. Harold
Nutter for an arrhythmia.  He does not smoke.  He drinks
minimally.  He is a set designer at Columbia Pictures.

On examination today he has slight tenderness of the left an

# Data Cleaner function

This function pars the XML tree and get the text and labels and clean the text.

To clean the text:
* Remove the \*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\* string that sprates the records (For Now!)
* Remove the dates (For Now!)
* Remove all the other words that are not verb,noun,adjective
* Remove all the names
* Replace acronyms 


In [33]:
import xml.etree.ElementTree as ET
import re
from nlpre import titlecaps, dedash, identify_parenthetical_phrases
from nlpre import replace_acronyms, replace_from_dictionary, pos_tokenizer
import scrubadub


def xml2df(xml_data):

    #xml_data=re.sub(u"(\u2018|\u2019)", "'", xml_data)
    tree = ET.parse(xml_data) #Initiates the tree Ex: <user-agents>
    
    all_records = [] #This is our record list which we will convert into a dataframe
    headers = [] #Subchildren tags will be parsed and appended here
    record = []
    attrib=[]
    values=[]
    label=[]
    tex=""
    for elem in tree.iter(): #Begin looping through our root tree
        if(elem.tag!='PatientMatching' and elem.tag!='TAGS'):
            headers.append(elem.tag)
            record.append(elem.text)
            attrib.append(elem.attrib.get('met'))
            
    text = re.sub("\n\*+\n", "", record[0])
    text = re.sub("Record date: \d\d\d\d-\d\d-\d\d", "", text)

    v = ['pronoun','quote', 'symbol', 'adverb', 'unknown', 'punctuation', 'connector', 'modal_verb', 'cardinal', 'w_word']
    ABBR = identify_parenthetical_phrases()(text)
    parsers = [dedash(), titlecaps(), replace_acronyms(ABBR),pos_tokenizer(v)]

    for f in parsers:
        text = f(text)
    res=text.text
    res=scrubadub.clean(res, replace_with='identifier')
    
    res = re.sub("{{NAME-\d+}}", "", res)
    
    all_records.append(res)
    for a in attrib[1:]:
        if(a == 'met'):
            label.append(1)
        else:
             label.append(0)
    all_records.append(label)
    return pd.DataFrame([all_records], columns=['text','label']) 

## Read files and clean them

In [22]:
from __future__ import print_function

data = None
i=0
for file in files:
    i=i+1
    print(str(i)+"/"+str(len(files)), end='\r')
    df = xml2df(join(path,file))
    if data is None: data = df
    else: data = data.append(df,ignore_index=True)
    
data = data.convert_objects(convert_numeric=True)

202/202

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  if sys.path[0] == '':


In [23]:
data[:3]

Unnamed: 0,text,label
0,associate circle .D. Jr. 845-41-54-4 ...,"[0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1]"
1,MERCY care CENTER associate AR 72985 M...,"[0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0]"
2,Personal Overall be 81 yr\nold male present...,"[1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0]"


# Split the text into Words

In [24]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True) 

data['split'] = data.text.map(lambda x: tknzr.tokenize(x.lower()))

### Remove all the other charachter that are not words (Removing numbers)

In [25]:
data.split=data.split.map(lambda x: [w for w in x if w.isalpha()])

### Remove all 2 char words

In [26]:
data.split=data.split.map(lambda x: [w for w in x if len(w)>2])

# Stem and Encod the Words

In [27]:
import nltk
from sklearn import preprocessing
we = preprocessing.LabelEncoder()
pstemmer = nltk.lancaster.LancasterStemmer()

vocab  = set([pstemmer.stem(i) for x in data.split for i in x])

In [28]:
len(vocab)

9426

In [29]:
we.fit(list(vocab)) 

LabelEncoder()

In [30]:
data['ind'] = data.split.map(lambda x: [we.transform([pstemmer.stem(i)])[0] for i in x])

In [31]:
data[:3]

Unnamed: 0,text,label,split,ind
0,associate circle .D. Jr. 845-41-54-4 ...,"[0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1]","[associate, circle, year, old, pound, gentlema...","[552, 1419, 9356, 5815, 6504, 3301, 7026, 1691..."
1,MERCY care CENTER associate AR 72985 M...,"[0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0]","[mercy, care, center, associate, come, further...","[5059, 1169, 1289, 552, 1575, 3213, 2786, 4334..."
2,Personal Overall be 81 yr\nold male present...,"[1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0]","[personal, overall, old, male, present, multip...","[6245, 5963, 5815, 4901, 6576, 5334, 8531, 286..."


# Create a pickle from the clean data

In [32]:
data.to_pickle('data-vna.pkl')

## Labels Distribution

In [34]:
met=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
notmet=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
for x in data.label:
    for i in range(0,len(met)):
        if(x[i]==1):
            met[i]=met[i]+1
        else:
            notmet[i]=notmet[i]+1

In [35]:
hd=headers[1:14]

NameError: name 'headers' is not defined

In [None]:
for i in range(0,len(met)):
    print('{:d}{:-<20}{:6.2f}'.format(i,". "+hd[i],(met[i]/200)*100))