# Exploring the data from the NCBI test dataset

In [1]:
import pandas as pd
from collections import defaultdict 
import ast

In [2]:
# Location of the NCBI test CSV; a bare file name, so it is resolved against the current working directory.
ncbi_file_path = "ncbi_test.csv"

def convert_to_list(string):
    """Turn the textual form of a Python literal (e.g. ``"['a', 'b']"``) back into an object.

    Args:
        string: Text to evaluate with ``ast.literal_eval``.

    Returns:
        The evaluated literal, or ``None`` when evaluation raises
        ``SyntaxError`` or ``ValueError``.
    """
    try:
        parsed = ast.literal_eval(string)
    except (SyntaxError, ValueError):
        # Unparsable cells are mapped to None rather than aborting the load.
        parsed = None
    return parsed

# Load the CSV: first row is the header, first column becomes the index, and the
# two list-valued columns are run through convert_to_list while parsing.
ncbi_dataframe = pd.read_csv(
    ncbi_file_path,
    header=0,
    index_col=0,
    converters=dict.fromkeys(("tokens", "original_tags"), convert_to_list),
)

In [3]:
# Show the loaded frame as the cell output (the rendering below is truncated with "..." rows).
ncbi_dataframe

Unnamed: 0,id,tokens,original_tags
0,0,"[Clustering, of, missense, mutations, in, the,...","[O, O, O, O, O, O, B-DIS, I-DIS, I-DIS, O, O, ..."
1,1,"[Ataxia, -, telangiectasia, (, A, -, T, ), is,...","[B-DIS, I-DIS, I-DIS, O, B-DIS, I-DIS, I-DIS, ..."
2,2,"[The, risk, of, cancer, ,, especially, lymphoi...","[O, O, O, B-DIS, O, O, B-DIS, I-DIS, O, O, O, ..."
3,3,"[By, analysing, tumour, DNA, from, patients, w...","[O, O, B-DIS, O, O, O, O, B-DIS, I-DIS, I-DIS,..."
4,4,"[In, marked, contrast, to, the, ATM, mutation,...","[O, O, O, O, O, O, O, O, O, B-DIS, I-DIS, I-DI..."
...,...,...,...
936,936,"[In, an, attempt, to, resolve, this, issue, ,,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
937,937,"[These, reagents, detect, a, 220, -, kD, prote...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
938,938,"[Immunohistochemical, staining, of, human, bre...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
939,939,"[Conversely, ,, BRCA1, expression, was, reduce...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [4]:
# Count how many times each tag occurs over all rows.
# The three known tags are seeded so they are always reported, in this order,
# even when their count is zero; any other tag found in the data is counted as
# well instead of raising KeyError.
counts = {"O": 0, "B-DIS": 0, "I-DIS": 0}
# Walk the column directly: iterrows() would build a full Series for every row
# only to read one field from it.
for tags in ncbi_dataframe["original_tags"]:
    if tags is None:  # convert_to_list yields None for cells it could not parse
        continue
    for tag in tags:
        counts[tag] = counts.get(tag, 0) + 1
print(counts)

{'O': 22450, 'B-DIS': 960, 'I-DIS': 1087}


In [5]:
# Total number of tagged tokens: add up every per-tag count instead of naming
# each tag key again, so the total stays right if the tag set changes.
count_sum = sum(counts.values())
count_sum

24497

In [6]:
def count_entity_words(dataframe):
    """Count entity mentions by entity type and length in tokens.

    Every value of the ``original_tags`` column is walked as one tag
    sequence: a ``B-<type>`` tag opens a mention of that type, each following
    ``I-`` tag lengthens it by one token, and any other tag, a new ``B-`` tag
    or the end of the row closes it. The type suffix of ``I-`` tags is not
    compared with the open mention, and ``I-`` tags outside a mention are
    ignored.

    Args:
        dataframe: Object whose ``"original_tags"`` column yields one iterable
            of tag strings per row (here a pandas DataFrame). Rows holding
            ``None`` -- what ``convert_to_list`` returns for a cell it cannot
            parse -- are skipped.

    Returns:
        A ``defaultdict(int)`` mapping ``"<type>-<token count>"`` (for example
        ``"DIS-3"``) to the number of mentions of that type and length.
    """
    entity_counts = defaultdict(int)

    # Iterate the column directly; iterrows() would build a Series per row
    # just to read this one field.
    for tags in dataframe["original_tags"]:
        if tags is None:
            continue

        current_entity = None
        word_count = 0
        # The trailing "O" sentinel closes a mention that runs to the end of
        # the row, so the bookkeeping below lives in a single place.
        for tag in (*tags, "O"):
            if tag.startswith("I-"):
                word_count += 1
                continue

            # Any non-"I-" tag ends the mention that is currently open.
            if current_entity is not None:
                entity_counts[current_entity + "-" + str(word_count)] += 1

            if tag.startswith("B-"):
                current_entity = tag[2:]
                word_count = 1
            else:
                current_entity = None
                word_count = 0

    return entity_counts

In [7]:
# Mention-length distribution for the loaded frame; keys read "<entity type>-<number of tokens>".
count_entity_words(ncbi_dataframe)

defaultdict(int,
            {'DIS-3': 133,
             'DIS-5': 30,
             'DIS-1': 423,
             'DIS-2': 270,
             'DIS-6': 12,
             'DIS-7': 6,
             'DIS-4': 72,
             'DIS-8': 6,
             'DIS-9': 4,
             'DIS-13': 1,
             'DIS-11': 2,
             'DIS-14': 1})