In [48]:
from datasets import load_dataset
data_files = {"train": "/home/qxy699/Data/WHO_representative_random/who_dataset_training.csv", 
              "test": "/home/qxy699/Data/WHO_representative_random/who_dataset_test.csv"}
# \t is the tab character in Python
sars_dataset = load_dataset("csv", data_files=data_files)

In [49]:
sars_sample = sars_dataset["train"].shuffle(seed=42).select(range(1000))
# Peek at the first few examples
sars_sample[:1]

{'Unnamed: 0': [24473],
 'ID': ['EPI_ISL_4071265'],
 'sequence': ["['--------------------------------------actttcgatctcttgtagatctgttctctaaacgaactttaaaatctgtgtggctgtcactcggctgcatgcttagtgcactcacgcagtataattaataactaattactgtcgttgacaggacacgagtaactcgtctatcttctgcaggctgcttacggtttcgtccgttttgcagccgatcatcagcacatctaggttttgtccgggtgtgaccgaaaggtaagatggagagccttgtccctggtttcaacgagaaaacacacgtccaactcagtttgcctgttttacaggttcgcgacgtgctcgtacgtggctttggagactccgtggaggaggtcttatcagaggcacgtcaacatcttaaagatggcacttgtggcttagtagaagttgaaaaaggcgttttgcctcaacttgaacagccctatgtgttcatcaaacgttcggatgctcgaactgcacctcatggtcatgttatggttgagctggtagcagaactcgaaggcattcagtacggtcgtagtggtgagacacttggtgtccttgtccctcatgtgggcgaaataccagtggcttaccgcaaggttcttcttcgtaagaacggtaataaaggagctggtggccatagttacggcgccgatctaaagtcatttgacttaggcgacgagcttggcactgatccttatgaagattttcaagaaaactggaacactaaacatagcagtggtgttacccgtgaactcatgcgtgagcttaacggaggggcatacactcgctatgtcgataacaacttctgtggccctgatggctaccctcttgagtgcattaaagaccttctagcacgtgctggtaaagcttcatgcactttgtccgaacaactggactttatt

In [50]:
for split in sars_dataset.keys():
    assert len(sars_dataset[split]) == len(sars_dataset[split].unique("Unnamed: 0"))

In [51]:
sars_dataset = sars_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)
sars_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'ID', 'sequence', 'Variant', 'Variant_VOC'],
        num_rows: 213974
    })
    test: Dataset({
        features: ['patient_id', 'ID', 'sequence', 'Variant', 'Variant_VOC'],
        num_rows: 213813
    })
})

In [52]:
len(sars_dataset["train"]["sequence"][0])

29895

In [47]:
len(sars_dataset["train"].unique("Variant_VOC"))

5

In [6]:
##without speeding up:
# def seperatre_spike(x):
#     return {"sequence": x["sequence"][21564:25386]}
# drug_dataset = drug_dataset.map(seperatre_spike) #spike region

Map: 100%|██████████| 213974/213974 [00:11<00:00, 18321.44 examples/s]
Map: 100%|██████████| 213813/213813 [00:12<00:00, 17732.32 examples/s]


In [53]:
#with speeding up: but each value is now a list of values, and not just a single value.
def seperatre_spike(x):
    return {"sequence": [o[21564:25386] for o in x["sequence"]]}
sars_dataset = sars_dataset.map(seperatre_spike, batched=True) #spike region:: WOW! it was done so fast!

In [54]:
len(sars_dataset["train"]["sequence"][0])

3822

In [28]:
len(sars_dataset.unique("sequence")) #variant

2

In [55]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["sequence"], truncation=True)

tokenized_dataset = sars_dataset.map(tokenize_function, batched=True)

In [56]:
tokenized_dataset["train"]["token_type_ids"][1]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [57]:
'''
In machine learning, an example is usually defined as the set of features that we feed to the model. In some contexts,
 these features will be the set of columns in a Dataset, but in others (like here and for question answering), multiple 
 features can be extracted from a single example and belong to a single column.

 Let’s have a look at how it works! Here we will tokenize our examples and truncate them to a maximum length of 128, 
 but we will ask the tokenizer to return all the chunks of the texts instead of just the first one. This can be done 
 with return_overflowing_tokens=True:
'''
def tokenize_and_split(examples):
    return tokenizer(
        examples["sequence"],
        truncation=True,
        max_length=8,
        return_overflowing_tokens=True,
    )

result = tokenize_and_split(sars_dataset["train"][0])
# [len(inp) for inp in result["input_ids"]]

In [58]:
result["input_ids"]

[[101, 100, 118, 118, 118, 118, 118, 102], [101, 118, 100, 102]]

our first example in the training set became "two features" because it was tokenized to more than the maximum number of tokens we specified: the first one of length 8 and the second one of length 4.

In [31]:
result

{'input_ids': [[101, 100, 118, 118, 118, 118, 118, 102], [101, 118, 100, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]], 'overflow_to_sample_mapping': [0, 0]}

In [66]:
def tokenize_and_split(examples):
    return tokenizer(
        examples["sequence"],
        truncation=True,
        max_length=8,
        return_overflowing_tokens=False,
    )

tokenized_dataset = sars_dataset.map(
    tokenize_and_split, batched=True
)
tokenized_dataset

Map: 100%|██████████| 213974/213974 [00:13<00:00, 16235.41 examples/s]
Map: 100%|██████████| 213813/213813 [00:13<00:00, 16257.75 examples/s]


DatasetDict({
    train: Dataset({
        features: ['patient_id', 'ID', 'sequence', 'Variant', 'Variant_VOC', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 213974
    })
    test: Dataset({
        features: ['patient_id', 'ID', 'sequence', 'Variant', 'Variant_VOC', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 213813
    })
})

In [59]:
tokenized_dataset = sars_dataset.map(
    tokenize_and_split, batched=True, remove_columns=sars_dataset["train"].column_names
)

In [60]:
len(tokenized_dataset["train"]), len(sars_dataset["train"])

(619831, 213974)

In [61]:
sars_dataset["train"]["sequence"][0] == sars_dataset["train"]["sequence"][100]

False

# From Dataset s to DataFrame s and back

In [62]:
sars_dataset.set_format("pandas")
sars_dataset["train"][:3]

Unnamed: 0,patient_id,ID,sequence,Variant,Variant_VOC
0,0,EPI_ISL_6330960,atgtttgtttttcttgttttattgccactagtctctagtcagtgtg...,VOC Delta GK (B.1.617.2+AY.*) first detected i...,Delta
1,1,EPI_ISL_7565629,atgtttgtttttcttgttttattgccactagtctctagtcagtgtg...,VOC Delta GK (B.1.617.2+AY.*) first detected i...,Delta
2,2,EPI_ISL_9077162,atgtttgtttttcttgttttattgccactagtctctagtcagtgtg...,VOC Omicron GRA (B.1.1.529+BA.*) first detecte...,Omicron


In [23]:
train_df = sars_dataset["train"][:]
train_df

Unnamed: 0,patient_id,ID,sequence,Variant,Variant_VOC
0,0,EPI_ISL_6330960,atgtttgtttttcttgttttattgccactagtctctagtcagtgtg...,VOC Delta GK (B.1.617.2+AY.*) first detected i...,Delta
1,1,EPI_ISL_7565629,atgtttgtttttcttgttttattgccactagtctctagtcagtgtg...,VOC Delta GK (B.1.617.2+AY.*) first detected i...,Delta
2,2,EPI_ISL_9077162,atgtttgtttttcttgttttattgccactagtctctagtcagtgtg...,VOC Omicron GRA (B.1.1.529+BA.*) first detecte...,Omicron
3,3,EPI_ISL_10009962,atgtttgttttttttgttttattgccactagtctctagtcagtgtg...,VOC Omicron GRA (B.1.1.529+BA.*) first detecte...,Omicron
4,4,EPI_ISL_10299306,atgtttgtttttcttgttttattgccactagtttctagtcagtgtg...,VOC Omicron GRA (B.1.1.529+BA.*) first detecte...,Omicron
...,...,...,...,...,...
213969,225495,EPI_ISL_9082936,atgtttgtttttcttgttttattgccactagtctctagtcagtgtg...,VOC Omicron GRA (B.1.1.529+BA.*) first detecte...,Omicron
213970,225496,EPI_ISL_4134970,atgtttgtttttcttgttttattgccactagtctctagtcagtgtg...,VOC Delta GK (B.1.617.2+AY.*) first detected i...,Delta
213971,225497,EPI_ISL_8213470,atgtttgtttttcttgttttattgccactagtctctagtcagtgtg...,VOC Omicron GRA (B.1.1.529+BA.*) first detecte...,Omicron
213972,225498,EPI_ISL_9793508,atgtttgtttttcttgttttattgccactagtttctagtcagtgtg...,VOC Omicron GRA (B.1.1.529+BA.*) first detecte...,Omicron


In [63]:
 #reset the output format of sars_dataset from "pandas" to "arrow":
sars_dataset.reset_format()

In [64]:
#Creating a validation set
sars_dataset_clean = sars_dataset["train"].train_test_split(train_size=0.8, seed=42)
# Rename the default "test" split to "validation"
sars_dataset_clean["validation"] = sars_dataset_clean.pop("test")
# Add the "test" set to our `DatasetDict`
sars_dataset_clean["test"] = sars_dataset["test"]
sars_dataset_clean


DatasetDict({
    train: Dataset({
        features: ['patient_id', 'ID', 'sequence', 'Variant', 'Variant_VOC'],
        num_rows: 171179
    })
    validation: Dataset({
        features: ['patient_id', 'ID', 'sequence', 'Variant', 'Variant_VOC'],
        num_rows: 42795
    })
    test: Dataset({
        features: ['patient_id', 'ID', 'sequence', 'Variant', 'Variant_VOC'],
        num_rows: 213813
    })
})

In [None]:
#save dataset
# sars_dataset_clean.save_to_disk("drug-reviews")