In [9]:
import ast
import os

import pandas as pd
from sklearn.model_selection import train_test_split

## Full data analysis

In [2]:
# Load the geneticsQA corpus parquet file from the Hugging Face Hub into a DataFrame.
corpus_parquet_url = "hf://datasets/nirantk/geneticsQA-corpus/data/train-00000-of-00001.parquet"
corpus_data = pd.read_parquet(corpus_parquet_url)

In [3]:
# Show two randomly chosen corpus rows with their text untruncated.
# option_context restores whatever 'display.max_colwidth' was before the block, even if
# display() raises; the previous set_option/reset_option pair left the option changed on
# error and always fell back to the pandas default instead of the prior value.
# Note: sample() is unseeded here, so the rows shown differ from run to run.
with pd.option_context('display.max_colwidth', None):
    display(corpus_data.sample(2))

Unnamed: 0,text
29387,"After removing IFs by calcination, electron microscopy revealed hollow silica nanotubes several micrometers long, with outer diameters of 35-55 nm and an average inner diameter of 10 nm (comparable to that of IFs)"
32684,"Lysyl oxidase-like 1, a crosslinking enzyme implicated in collagen and elastin biogenesis"


## Labeled data analysis

In [4]:
# Load the labeled question/answer set from the Hugging Face Hub and report its size and columns.
labeled_data = pd.read_parquet("hf://datasets/nirantk/geneticsQA-train/data/train-00000-of-00001.parquet")
print(f'Number of samples in the labeled_data: {len(labeled_data)}')
print(f"Columns in the labeled_data: {labeled_data.columns.tolist()}")

# 'contexts' arrives as the string form of a Python list; turn each value back into a list.
# ast.literal_eval only accepts Python literals, so a crafted value in the downloaded data
# cannot execute code the way the built-in eval() would.
labeled_data['contexts'] = labeled_data['contexts'].apply(ast.literal_eval)


Number of samples in the labeled_data: 1659
Columns in the labeled_data: ['question', 'contexts', 'ground_truth', 'exact_answer']


In [5]:
# Preview the first five labeled rows.
labeled_data.head(n=5)

Unnamed: 0,question,contexts,ground_truth,exact_answer
0,What is Snord116?,[Further analysis with array-CGH identified a ...,['SNORD116 is a small nucleolar (sno) RNA gene...,[]
1,Are ultraconserved elements often transcribed?,[Starting from a genome-wide expression profil...,"['Yes. Especially, a large fraction of non-ex...",['yes']
2,List metalloenzyme inhibitors.,[ Clinically approved inhibitors were selected...,['Foscarnet\nVT-1129\nVT-1161 \nBB-3497\nhydro...,"['VT-1129', 'VT-1161', 'BB-3497', 'hydroxamate..."
3,Which protein phosphatase has been found to in...,"[ Moreover, protein phosphatase-1 activity is ...",['Protein phosphatase-1 activity is regulated ...,"['Protein phosphatase 1', 'PP1']"
4,Do DNA double-strand breaks play a causal role...,[The DNA non-homologous end-joining repair gen...,['Yes. It has been demonstrated that induction...,['yes']


In [6]:
# Sample row: show the row at position 21 in full and count its contexts.
sample_row = labeled_data[21:22]

# option_context restores the previous 'display.max_colwidth' on exit, even if display()
# raises; set_option/reset_option would leave the option changed on error and would
# discard any non-default value configured earlier.
with pd.option_context('display.max_colwidth', None):
    display(sample_row)

# .item() extracts the single 'contexts' value (a list) from the one-row slice.
print(f"Number of contexts: {len(sample_row['contexts'].item())}")


Unnamed: 0,question,contexts,ground_truth,exact_answer
21,"What is the causative agent of the ""Panama disease"" affecting bananas?","[Fusarium oxysporum f. sp. cubense (Foc), the causal agent of Fusarium wilt (Panama disease), is one of the most devastating diseases of banana (Musa spp.), avendish, the most widely grown banana cultivar, is relatively resistant to Race 1 of Fusarium oxysporum f. sp. cubense (Foc1) which caused widespread Panama disease during the first half of the 20th century but is susceptible to Tropical Race 4 of Foc (Foc TR4) which is threatening world banana production. , Fusarium oxysporum f. sp. cubense race 4 (FOC), the causal agent of Panama disease in banana,, Fusarium oxysporum f.sp. cubense, a causative agent of Panama disease, Fusarium wilt of banana (also known as Panama disease) is caused by Fusarium oxysporum f. sp. cubense, inoculated with Fusarium oxysporum f.sp. cubense (FOC), Race 4, the causal agent of Panama disease, the fungus causing Panama disease of banana, Panama disease of banana, caused by the fungus Fusarium oxysporum f. sp. cubense, is a serious constraint both to the commercial production of banana and cultivation for subsistence agriculture]",['Panama disease of banana is caused by the fungus Fusarium oxysporum f. sp. cubense.'],['Fusarium oxysporum f. sp. cubense']


Number of contexts: 8


In [7]:
# Sample row: show the row at position 17 in full and count its contexts.
sample_row = labeled_data[17:18]

# option_context restores the previous 'display.max_colwidth' on exit, even if display()
# raises; set_option/reset_option would leave the option changed on error and would
# discard any non-default value configured earlier.
with pd.option_context('display.max_colwidth', None):
    display(sample_row)

# .item() extracts the single 'contexts' value (a list) from the one-row slice.
print(f"Number of contexts: {len(sample_row['contexts'].item())}")

Unnamed: 0,question,contexts,ground_truth,exact_answer
17,What organism causes woolsorter's disease,"[Today, woolsorters' disease and other industrial manifestations of anthrax are extremely rare, , Working independently of their more famous counterparts (Robert Koch and Louis Pasteur), Anglo-American anthrax investigators used visual representations of anthrax bacilli to persuade their peers that a specific, identifiable cause produced all forms of anthrax-malignant pustule (cutaneous anthrax), intestinal anthrax, and woolsorter's disease (pneumonic anthrax). , Today, woolsorters' disease and other industrial manifestations of anthrax are extremely rare, but the increasing threat of bioterrorism means that the international dread and historical lessons of this significant condition should never be forgotten. , Today, woolsorters' disease and other industrial manifestations of anthrax are extremely rare, but the increasing threat of bioterrorism means that the international dread and historical lessons of this significant condition should never be forgotten.]","[""Woolsorter's disease is caused by the same organism as Anthrax, bacillus Anthrax. ""]",['Bacillus Anthracis']


Number of contexts: 4


## Split the labeled data into train, validation, and test sets

In [8]:
# Partition the labeled data 80/10/10: hold out 20% first, then cut that hold-out in half
# to obtain the validation and test sets. Both calls use a fixed seed of 42.
train_data, temp_data = train_test_split(
    labeled_data,
    test_size=0.2,
    random_state=42,
)
valid_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
)

# Report the size of each split.
for split_name, split_frame in (
    ('train_data', train_data),
    ('valid_data', valid_data),
    ('test_data', test_data),
):
    print(f'Number of samples in the {split_name}: {len(split_frame)}')

Number of samples in the train_data: 1327
Number of samples in the valid_data: 166
Number of samples in the test_data: 166


## Export the corpus, the labeled dataset, and the splits

In [11]:
# Destination directory for the exported CSV files. The original absolute path stays the
# default so existing runs behave the same; set the BIO_DATA_DIR environment variable to
# write somewhere else without editing the notebook.
data_directory = os.environ.get('BIO_DATA_DIR', '/home/ubuntu/search-course/data/bio')
os.makedirs(data_directory, exist_ok=True)

# Frames to export, keyed by the name used for both the CSV file and the summary line:
# the full labeled data, its train/validation/test splits, and the corpus.
exports = {
    'labeled_data': labeled_data,
    'train_data': train_data,
    'valid_data': valid_data,
    'test_data': test_data,
    'corpus_data': corpus_data,
}

# Write each frame to <data_directory>/<name>.csv without the index column.
# NOTE(review): 'contexts' holds Python lists at this point, so the CSV presumably stores
# their string form and readers will need to parse it back into lists — confirm downstream.
for export_name, export_frame in exports.items():
    export_frame.to_csv(os.path.join(data_directory, f'{export_name}.csv'), index=False)
print(f'Labeled data, splits, and corpus data have been exported to {data_directory}')

# Summarize how many rows each exported frame contains.
for export_name, export_frame in exports.items():
    print(f'Number of samples in the {export_name}: {len(export_frame)}')


Labeled data, splits, and corpus data have been exported to /home/ubuntu/search-course/data/bio
Number of samples in the labeled_data: 1659
Number of samples in the train_data: 1327
Number of samples in the valid_data: 166
Number of samples in the test_data: 166
Number of samples in the corpus_data: 35523
