In [4]:
# Standard library
import re
from collections import OrderedDict

# Third-party
import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize

# Make sure the 'punkt' NLTK data package is available locally.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Read the spreadsheet and count the rows per mutation_consequence value.
data = pd.read_excel('Functional_Consequences_v1.xlsx')
data['mutation_consequence'].value_counts()

LOF     100
DN      100
GOF     100
none    100
HI      100
Name: mutation_consequence, dtype: int64

In [6]:
# Remove every row that has a missing value in any column, then recount.
data.dropna(axis=0, how='any', inplace=True)
data['mutation_consequence'].value_counts()

LOF     100
DN      100
GOF      99
HI       99
none     84
Name: mutation_consequence, dtype: int64

In [7]:
# Phrases (all lowercase) whose presence marks a sentence as relevant.
search_terms = [
    'gain-of-function',
    'gain of function',
    'toxic gain of function',
    'activating mutation',
    'constitutively active',
    'hypermorph',
    'ectopic expression',
    'neomorph',
    'gain of interaction',
    'function protein',
    'fusion transcript',
    'haploinsufficiency',
    'haploinsufficient',
    'hypomorph',
    'amorph',
    'null mutation',
    'hemizygous',
    'dominant-negative',
    'dominant negative',
    'antimorph',
    'loss of function',
    'loss-of-function',
]

In [15]:
def extract_data(sentences, scale=2, search_terms = search_terms):
    """Return the sentences that mention a search term, plus their neighbours.

    The text is lowercased, line breaks are replaced by spaces, and the result
    is split into sentences with ``sent_tokenize``. Every sentence in which at
    least one search term occurs is kept together with up to ``scale``
    sentences before and after it. The kept sentences are returned in document
    order, each distinct sentence once, separated by single spaces.

    Parameters
    ----------
    sentences : str
        Raw text to search (one cell of the dataframe's text column).
    scale : int, optional
        Number of neighbouring sentences to keep on either side of a matching
        sentence. Default 2; ``0`` keeps only the matching sentences.
    search_terms : iterable of str, optional
        Terms to look for. Each term is used as a regular-expression pattern
        and matched against the lowercased text. Defaults to the module-level
        ``search_terms`` list.

    Returns
    -------
    str
        The extracted sentences joined by spaces. If no term occurs anywhere,
        all sentences of the (lowercased, whitespace-normalised) text are
        returned instead.
    """
    text = sentences.lower()
    # Collapse line breaks / carriage returns into single spaces.
    text = re.sub(r'[\r\n]+', ' ', text)
    # Drop only the period of "et al." so the tokenizer does not break the
    # sentence there. A blanket replace of 'al.' would also mangle words such
    # as "clinical." and erase genuine sentence boundaries.
    text = re.sub(r'\bet al\.', 'et al', text)

    sentence_list = sent_tokenize(text)

    # Indices of matching sentences, in ascending (document) order.
    hits = [
        position
        for position, sentence in enumerate(sentence_list)
        if any(re.search(term, sentence) for term in search_terms)
    ]

    if not hits:  # no search term found: fall back to the whole text
        return ' '.join(sentence_list)

    extract = []
    for index in hits:
        # Explicit lower bound: a "[-scale:]" slice would return *all*
        # preceding sentences when scale == 0.
        start = max(0, index - scale)
        extract += sentence_list[start:index + scale + 1]

    # Overlapping windows repeat sentences; keep the first occurrence of each
    # while preserving order.
    return ' '.join(OrderedDict.fromkeys(extract))
            

    

In [16]:
# Keep one neighbouring sentence on each side of every matching sentence.
data['extracted_text'] = data['text'].map(lambda text: extract_data(text, scale=1))

In [17]:
# Preview the first five rows, including the new extracted_text column.
data.head(n=5)

Unnamed: 0,phenotype_mim_num,gene_mim_num,Entrez Gene ID (NCBI),Approved Gene Symbol (HGNC),mutation_consequence,text,extracted_text
0,300983,300838,9758.0,FRMPD4,HI,In 5 affected males from a family (P58) with X...,in 5 affected males from a family (p58) with x...
1,618394,605394,60468.0,BACH2,DN,In a 19-year-old woman (family A) with immunod...,the l24p mutant protein was insoluble in solut...
2,229100,606806,10841.0,FTCD,HI,In 2 sibs with glutamate formiminotransferase ...,in 2 sibs with glutamate formiminotransferase ...
3,616324,100725,1145.0,CHRNE,HI,For discussion of the ser143-to-leu (S143L) mu...,one patient also had a heterozygous c.-24g-a t...
4,104300,104760,351.0,APP,none,In a cohort of 65 families with autosomal domi...,in a cohort of 65 families with autosomal domi...


In [18]:
# Persist the dataframe, including the row index, to a new workbook.
# NOTE(review): the frame already carries an 'Unnamed: 0' column; writing the
# index again presumably adds another such column on reload - confirm whether
# index=False is wanted by downstream consumers.
data.to_excel('Data_v2.xlsx', index=True)