In [66]:
import json
import re
import operator

# Frequency threshold for the corpus-wide n-gram tables: an n-gram is kept only
# when its total count is strictly greater than this value.
GRAM_CUT_OFF = 5

In [49]:
def setVariable(variable, typeOf):
    """
    Substitute a type-appropriate default when a value is missing.

    Params:
        variable (some_variable) : value to inspect; returned untouched unless it is None
        typeOf             (str) : one of "str", "int", "bool", "dict", "arr", "float"

    Returns:
        variable when it is not None; otherwise a freshly built default for
        typeOf ("" / 0 / False / {} / [] / 0.0), or None when typeOf is not
        one of the recognised names.
    """
    if variable is not None:
        return variable

    # Factories rather than literal values, so every call hands back a brand
    # new dict/list instead of a shared mutable object.
    defaultFactories = (
        ("str", str),
        ("int", int),
        ("bool", bool),
        ("dict", dict),
        ("arr", list),
        ("float", float),
    )
    for typeName, makeDefault in defaultFactories:
        if typeOf == typeName:
            return makeDefault()

    # Unrecognised type name: no default is defined for it.
    return None

class Publication(object):
    """
    One publication record together with the n-gram data derived from it.

    Params:
        pid      (int) : publication id (defaults to 0 when None)
        title    (str) : publication title (defaults to "" when None)
        journal  (str) : journal name (defaults to "" when None)
        abstract (str) : abstract text (defaults to "" when None)
        year     (int) : publication year (defaults to 0 when None)
        names    (arr) : author name records (defaults to [] when None)
    """
    def __init__(self, pid, title, journal, abstract, year, names):
        # Missing (None) fields are replaced by a typed default so later
        # string concatenation / iteration never has to special-case None.
        self.id = setVariable(pid, "int")
        self.title = setVariable(title, "str")
        self.journal = setVariable(journal, "str")
        self.abstract = setVariable(abstract, "str")
        self.year = setVariable(year, "int")
        self.names = setVariable(names, "arr")
        # Per-publication n-gram -> count tables, filled in after construction.
        self.oneGrams = {}
        self.biGrams = {}
        self.triGrams = {}
        self.quadGrams = {}
        # Placeholders; nothing in this class assigns them a real value.
        self.naiveBayesLabel = -1
        self.languageOfJournal = ""
        # Cleaned title + abstract; empty until setFullCleanedText() is called.
        self.text = ""

    def setFullCleanedText(self):
        """
        Build self.text from the cleaned title and abstract.

        The two fields are joined with a space so that the last word of the
        title and the first word of the abstract are not fused into a single
        token (which would produce n-grams that never occur in the text).
        """
        text = self.title + " " + self.abstract
        self.text = cleanText(text)

    def __repr__(self):
        """
        Returns:
            str : string form of a dict holding the raw record fields
                  (derived n-gram data and labels are not included)
        """
        output = {
            "id" : self.id,
            "title" : self.title,
            "journal" : self.journal,
            "abstract" : self.abstract,
            "year" : self.year,
            "names" : self.names
        }

        return str(output)
        

In [14]:
# Parse asset-test.json into pubs_raw (presumably the full data set, next to
# the "mini" sample loaded separately -- NOTE(review): nothing visible in this
# notebook reads pubs_raw afterwards; confirm it is still needed).
# The handle is deliberately not called `file`, which would shadow the
# Python 2 builtin of the same name.
with open("disambiguation_challenge/asset-test.json") as raw_json:
    pubs_raw = json.load(raw_json)

In [35]:
# Parse mini-asset-test.json into mini_pubs_raw; this is the list of records
# the Publication objects are built from.
# The handle is deliberately not called `file`, which would shadow the
# Python 2 builtin of the same name.
with open("disambiguation_challenge/mini-asset-test.json") as mini_json:
    mini_pubs_raw = json.load(mini_json)

In [52]:
# Index every raw record as a Publication, keyed by the record's
# publication_id exactly as it appears in the JSON (the Publication itself
# stores the id converted to int).
pubs = {}
for record in mini_pubs_raw:
    rawId = record["publication_id"]
    pubs[rawId] = Publication(
        int(rawId),
        record["title"],
        record["journal"],
        record["abstract"],
        int(record["year"]),
        record["names"],
    )

In [53]:
# Sanity check: show one publication (the first key in dict iteration order).
# Written so it runs unchanged on Python 2 and Python 3: print with a single
# parenthesised argument, and no indexing into pubs.keys().
print(pubs[next(iter(pubs))])

{'title': u'Peripartum cardiomyopathy--Obstetric emergency for mother and child.', 'journal': u'Zeitschrift fur Geburtshilfe und Neonatologie', 'abstract': u'This case report describes the diagnosis, treatment as well as maternal and fetal outcome of a pregnancy complicated by peripartum cardiomyopathy (PPCM). The article demonstrates criteria that define peripartum cardiomyopathy using clinical and echocardiographic features. In absence of preexisting heart disease an acute left ventricular dysfunction of the mother led to fetal bradycardia and immediate delivery. We discuss possible causes, clinical management and long term outcome in respect of the available literature. Future pregnancies should be avoided with at least 50 % risk of recurrence.', 'names': [{u'firstinitial': u'V', u'firstname': None, u'middlename': None, u'lastname': u'BRUCK', u'middleinitial': u'A', u'email': None}, {u'firstinitial': u'M', u'firstname': None, u'middlename': None, u'lastname': u'BUTTERWEGGE', u'middl

In [54]:
def regexCleanText(text):
    """
    First-pass punctuation clean-up.

    Turns an escaped apostrophe (backslash + ') into a plain apostrophe,
    replaces every run of two or more identical punctuation marks with a
    single space, and spells out "&" as " and ".

    Arguments:
        text (str) : text to be cleaned

    Returns:
        str : cleaned text
    """
    text = text.replace("\\'", "'")

    # Each mark is handled on its own, so only runs of the SAME character are
    # collapsed; a mixed sequence such as ".-" is left alone.
    repeatedMarks = ".-!?/><'\":*|,+_;"
    for mark in repeatedMarks:
        text = re.sub(re.escape(mark) + "{2,}", " ", text)

    return text.replace("&", " and ")

def cleanText(text,uni=False,lower=True):
    """
    Function that cleans messy text

    Runs regexCleanText, replaces every character that is not an ASCII letter,
    digit, space or apostrophe with a space, collapses whitespace, and
    optionally lower-cases the result.

    Arguments:
        text   (str) : Text to be cleaned
        uni   (bool) : True if cleaned text should be returned as unicode, False otherwise
        lower (bool) : True if cleaned text should be lower-cased, False to keep the original case

    Returns:
        (str | unicode) : cleaned text with single spaces between tokens and
                          no leading or trailing whitespace
    """
    # some regex stuff
    text = regexCleanText(text)
    text = re.sub(r'[^a-zA-Z0-9 \']',' ',text)
    # split() + join collapses every whitespace run to one space and trims
    # both ends, so no separate strip() is needed afterwards.
    text = " ".join(text.split())
    # making lower case
    if lower:
        text = text.lower()
    # Only byte strings need decoding; text that is already unicode (any
    # Python 3 str, or a Python 2 unicode object) is returned as it is instead
    # of failing on a missing / implicit-ASCII .decode().
    if uni and isinstance(text, bytes):
        return text.decode('utf-8')
    return text

In [58]:
# Corpus-wide n-gram -> count tables (n = 1..4), one independent dict each;
# they start out empty every time this cell runs.
oneGrams, biGrams, triGrams, quadGrams = {}, {}, {}, {}

def getNgramCounts(text, n):
    """
    Count every length-n slice of text (overlapping windows, stride 1).

    Params:
        text (str) : sequence to slide the window over
        n    (int) : window length

    Returns:
        dict : slice -> number of occurrences; empty when text is shorter than n
    """
    tally = {}
    for start in range(len(text) - n + 1):
        piece = text[start:start + n]
        tally[piece] = tally.get(piece, 0) + 1

    return tally
        

def _countAndAccumulate(text, n, totals):
    """
    Count the n-grams of one text and fold them into a running total.

    Params:
        text    (str) : cleaned publication text
        n       (int) : n-gram length
        totals (dict) : corpus-wide n-gram -> count table, updated in place

    Returns:
        dict : n-gram -> count for this text alone
    """
    counts = getNgramCounts(text, n)
    for gram, count in counts.items():
        totals[gram] = totals.get(gram, 0) + count
    return counts


# For every publication: build its cleaned text, store its own 1- to 4-gram
# counts on the object, and add those counts to the corpus-wide tables.
for key in pubs:
    pub = pubs[key]
    pub.setFullCleanedText()
    text = pub.text

    pub.oneGrams = _countAndAccumulate(text, 1, oneGrams)
    pub.biGrams = _countAndAccumulate(text, 2, biGrams)
    pub.triGrams = _countAndAccumulate(text, 3, triGrams)
    pub.quadGrams = _countAndAccumulate(text, 4, quadGrams)
    
    
    
    


In [59]:
# Display the corpus-wide single-character counts.
oneGrams

{u' ': 2345,
 u"'": 3,
 u'0': 18,
 u'1': 37,
 u'2': 32,
 u'3': 19,
 u'4': 7,
 u'5': 15,
 u'6': 8,
 u'7': 11,
 u'8': 21,
 u'9': 17,
 u'a': 1013,
 u'b': 160,
 u'c': 588,
 u'd': 497,
 u'e': 1554,
 u'f': 269,
 u'g': 223,
 u'h': 492,
 u'i': 1107,
 u'j': 10,
 u'k': 57,
 u'l': 546,
 u'm': 329,
 u'n': 956,
 u'o': 900,
 u'p': 368,
 u'q': 14,
 u'r': 790,
 u's': 971,
 u't': 1198,
 u'u': 335,
 u'v': 137,
 u'w': 135,
 u'x': 42,
 u'y': 218,
 u'z': 30}

In [60]:
# Display the corpus-wide two-character counts.
biGrams

{u' 1': 28,
 u' 2': 13,
 u' 3': 8,
 u' 4': 3,
 u' 5': 4,
 u' 6': 3,
 u' 7': 6,
 u' 8': 6,
 u' 9': 4,
 u' a': 241,
 u' b': 74,
 u' c': 167,
 u' d': 95,
 u' e': 80,
 u' f': 86,
 u' g': 22,
 u' h': 50,
 u' i': 174,
 u' j': 1,
 u' k': 9,
 u' l': 64,
 u' m': 85,
 u' n': 37,
 u' o': 159,
 u' p': 157,
 u' q': 7,
 u' r': 87,
 u' s': 188,
 u' t': 324,
 u' u': 15,
 u' v': 18,
 u' w': 101,
 u' x': 7,
 u' y': 9,
 u' z': 13,
 u"' ": 1,
 u"'s": 2,
 u'0 ': 12,
 u'00': 1,
 u'02': 1,
 u'03': 1,
 u'0s': 2,
 u'0t': 1,
 u'1 ': 6,
 u'10': 1,
 u'11': 1,
 u'12': 1,
 u'13': 6,
 u'14': 1,
 u'15': 4,
 u'16': 1,
 u'18': 6,
 u'19': 6,
 u'1c': 1,
 u'1h': 1,
 u'1n': 2,
 u'2 ': 17,
 u'20': 4,
 u'21': 2,
 u'23': 1,
 u'25': 2,
 u'26': 1,
 u'28': 2,
 u'2c': 1,
 u'2n': 1,
 u'2p': 1,
 u'3 ': 6,
 u'30': 3,
 u'32': 1,
 u'34': 2,
 u'36': 2,
 u'37': 1,
 u'39': 1,
 u'3c': 2,
 u'3n': 1,
 u'4 ': 3,
 u'42': 1,
 u'45': 1,
 u'47': 1,
 u'48': 1,
 u'5 ': 7,
 u'50': 2,
 u'53': 3,
 u'57': 1,
 u'5a': 1,
 u'5c': 1,
 u'6 ': 3,
 u'60': 2,

In [77]:
def _filterAndRank(gramCounts):
    """
    Drop rare n-grams and rank the survivors.

    Params:
        gramCounts (dict) : n-gram -> count

    Returns:
        (dict, list) : the entries whose count is strictly greater than
                       GRAM_CUT_OFF, and the same entries as (gram, count)
                       pairs sorted by count, highest first
    """
    kept = { k:v for k, v in gramCounts.items() if v > GRAM_CUT_OFF }
    ranked = sorted(kept.items(), key=operator.itemgetter(1), reverse=True)
    return kept, ranked


# The corpus-wide tables are replaced by their filtered versions.
oneGrams, sortedUnigrams = _filterAndRank(oneGrams)
biGrams, sortedBigrams = _filterAndRank(biGrams)
triGrams, sortedTrigrams = _filterAndRank(triGrams)
quadGrams, sortedQuadgrams = _filterAndRank(quadGrams)

# Keep the second half of each ranking. The lists are sorted highest count
# first, so this is the LESS frequent half of the surviving n-grams.
# `//` keeps the slice index an integer on Python 3 as well as Python 2.
# NOTE(review): unigrams get no such slice, and keeping the rarer half may or
# may not be the intent -- confirm.
bigrams = sortedBigrams[len(sortedBigrams) // 2 : ]
trigrams = sortedTrigrams[len(sortedTrigrams) // 2 : ]
quadgrams = sortedQuadgrams[len(sortedQuadgrams) // 2 :]

In [78]:
# Number of trigrams that passed the cut-off vs. number kept after halving.
len(sortedTrigrams), len(trigrams)

(743, 372)