In [1]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

--2024-06-17 10:48:02--  https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘drugsCom_raw.zip’

drugsCom_raw.zip        [   <=>              ]  41.00M   957KB/s    in 64s     

2024-06-17 10:49:08 (653 KB/s) - ‘drugsCom_raw.zip’ saved [42989872]

Archive:  drugsCom_raw.zip
  inflating: drugsComTest_raw.tsv    
  inflating: drugsComTrain_raw.tsv   


In [2]:
from datasets import load_dataset

data_files = {"train": "drugsComTrain_raw.tsv", "test": "drugsComTest_raw.tsv"}
# \t is the tab character in Python
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 161297 examples [00:00, 211097.17 examples/s]
Generating test split: 53766 examples [00:00, 219018.58 examples/s]


In [3]:
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [5]:
def filter_nones(x):
    return x["condition"] is not None
drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)


Filter: 100%|██████████| 161297/161297 [00:00<00:00, 192175.99 examples/s]
Filter: 100%|██████████| 53766/53766 [00:00<00:00, 198658.48 examples/s]


In [6]:
def compute_review_length(example):
    return {"review_length": len(example["review"].split())}

drug_dataset = drug_dataset.map(compute_review_length)
# Inspect the first training example
drug_dataset["train"][0]

Map: 100%|██████████| 160398/160398 [00:12<00:00, 12837.71 examples/s]
Map: 100%|██████████| 53471/53471 [00:04<00:00, 12888.36 examples/s]


{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'Left Ventricular Dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [7]:
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
print(drug_dataset.num_rows)

Filter: 100%|██████████| 160398/160398 [00:00<00:00, 176250.21 examples/s]
Filter: 100%|██████████| 53471/53471 [00:00<00:00, 182096.45 examples/s]

{'train': 138514, 'test': 46108}





In [9]:
import html

text = "I&#039;m a transformer called BERT"
html.unescape(text)

drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})

Map: 100%|██████████| 138514/138514 [00:11<00:00, 11659.18 examples/s]
Map: 100%|██████████| 46108/46108 [00:03<00:00, 11718.11 examples/s]


In [10]:
from transformers import AutoTokenizer
new_drug_dataset = drug_dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True
)

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["review"], truncation=True)

Map: 100%|██████████| 138514/138514 [00:00<00:00, 442387.85 examples/s]
Map: 100%|██████████| 46108/46108 [00:00<00:00, 428156.40 examples/s]


In [11]:
def tokenize_and_split(examples):
    return tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )

result = tokenize_and_split(drug_dataset["train"][0])
[len(inp) for inp in result["input_ids"]]

[128, 49]

In [12]:
result

{'input_ids': [[101, 107, 1422, 1488, 1110, 9079, 1194, 1117, 2223, 1989, 1104, 1130, 19972, 11083, 119, 1284, 1245, 4264, 1165, 1119, 1310, 1142, 1314, 1989, 117, 1165, 1119, 1408, 1781, 1103, 2439, 13753, 1119, 1209, 1129, 1113, 119, 1370, 1160, 1552, 117, 1119, 1180, 6374, 1243, 1149, 1104, 1908, 117, 1108, 1304, 172, 14687, 1183, 117, 1105, 7362, 1111, 2212, 129, 2005, 1113, 170, 2797, 1313, 1121, 1278, 12020, 113, 1304, 5283, 1111, 1140, 119, 114, 146, 1270, 1117, 3995, 1113, 6356, 2106, 1105, 1131, 1163, 1106, 6166, 1122, 1149, 170, 1374, 1552, 119, 3969, 1293, 1119, 1225, 1120, 1278, 117, 1105, 1114, 2033, 1146, 1107, 1103, 2106, 119, 1109, 1314, 1160, 1552, 1138, 1151, 2463, 1714, 119, 1124, 1110, 150, 21986, 3048, 1167, 5340, 1895, 1190, 1518, 102], [101, 119, 1124, 1110, 1750, 6438, 113, 170, 1363, 1645, 114, 117, 1750, 172, 14687, 1183, 119, 1124, 1110, 11566, 1155, 1103, 1614, 1119, 1431, 119, 8007, 1117, 4658, 1110, 1618, 119, 1284, 1138, 1793, 1242, 1472, 23897, 1105, 117

In [13]:
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]


ArrowInvalid: Column 8 named input_ids expected length 1000 but got length 1463

The problem is that we’re trying to mix two different datasets of different sizes:
the drug_dataset columns will have a certain number of examples (the 1,000 in our error), 
but the tokenized_dataset we are building will have more (the 1,463 in the error message;
it is more than 1,000 because we are tokenizing long reviews into more than one example
by using return_overflowing_tokens=True). That doesn’t work for a Dataset, so we need to either remove the columns from the old dataset or make them the same size as they are in the new dataset. We can do the former with the remove_columns argument:

In [15]:
tokenized_dataset = drug_dataset.map(
    tokenize_and_split, batched=True, remove_columns=drug_dataset["train"].column_names
)

Map: 100%|██████████| 138514/138514 [00:10<00:00, 13399.40 examples/s]
Map: 100%|██████████| 46108/46108 [00:03<00:00, 13464.63 examples/s]


In [16]:
len(tokenized_dataset["train"]), len(drug_dataset["train"])

(206772, 138514)

In [17]:
#we can also deal with the mismatched length problem by making the old columns
#the same size as the new ones. we will need the overflow_to_sample_mapping
def tokenize_and_split(examples):
    result = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    # Extract mapping between new and old indices
    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result

tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)
tokenized_dataset

Map: 100%|██████████| 138514/138514 [00:11<00:00, 12131.41 examples/s]
Map: 100%|██████████| 46108/46108 [00:03<00:00, 12294.42 examples/s]


DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 206772
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 68876
    })
})

# From Dataset s to DataFrame s and back

In [18]:
drug_dataset.set_format("pandas")
drug_dataset["train"][:3]

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134
2,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89


In [35]:
train_df = drug_dataset["train"][:]

In [41]:
list(train_df["drugName"].unique())

['Guanfacine',
 'Lybrel',
 'Ortho Evra',
 'Buprenorphine / naloxone',
 'Cialis',
 'Aripiprazole',
 'Keppra',
 'Ethinyl estradiol / levonorgestrel',
 'Topiramate',
 'L-methylfolate',
 'Pentasa',
 'Dextromethorphan',
 'Nexplanon',
 'Liraglutide',
 'Trimethoprim',
 'Amitriptyline',
 'Lamotrigine',
 'Nilotinib',
 'Atripla',
 'Trazodone',
 'Etonogestrel',
 'Etanercept',
 'Tioconazole',
 'Azithromycin',
 'Eflornithine',
 'Daytrana',
 'Ativan',
 'Imitrex',
 'Sertraline',
 'Toradol',
 'Dulcolax',
 'Morphine',
 'MoviPrep',
 'Trilafon',
 'Fluconazole',
 'Contrave',
 'Clonazepam',
 'Venlafaxine',
 'Ledipasvir / sofosbuvir',
 'Tamsulosin',
 'Doxycycline',
 'Dulaglutide',
 'Intuniv',
 'Buprenorphine',
 'Pyridium',
 'Latuda',
 'Implanon',
 'Effexor XR',
 'Drospirenone / ethinyl estradiol',
 'NuvaRing',
 'Prepopik',
 'Tretinoin',
 'Ethinyl estradiol / norgestimate',
 'Elbasvir / grazoprevir',
 'Clomiphene',
 'Amitiza',
 'Sildenafil',
 'Lo Loestrin Fe',
 'Oxcarbazepine',
 'Wellbutrin',
 'Levonorgestre

In [51]:
df_drug = (train_df[["drugName", "rating"]]
           .groupby("drugName")
           .mean()
)
df_drug

Unnamed: 0_level_0,rating
drugName,Unnamed: 1_level_1
A + D Cracked Skin Relief,10.000000
A / B Otic,10.000000
Abacavir / dolutegravir / lamivudine,7.953488
Abacavir / lamivudine / zidovudine,9.000000
Abatacept,7.312500
...,...
Zyvox,9.200000
ZzzQuil,4.000000
depo-subQ provera 104,1.000000
ella,6.847826


In [22]:
frequencies = (
    train_df["drugName"]
    .value_counts()
    .to_frame()
    .reset_index()
    .rename(columns={"index": "condition", "condition": "frequency"})
)
frequencies.head()

Unnamed: 0,frequency,count
0,Birth Control,27655
1,Depression,8023
2,Acne,5209
3,Anxiety,4991
4,Pain,4744


In [25]:
averaging = (
    train_df["rating"]
    .value_counts()
    .to_frame()
    .reset_index()
    .rename(columns={"index": "rate", "condition": "average"})
)
averaging.head()

Unnamed: 0,rating,count
0,10.0,42809
1,9.0,24126
2,1.0,17842
3,8.0,16663
4,7.0,8384


In [52]:
# reset the output format of drug_dataset from "pandas" to "arrow":
drug_dataset.reset_format()

In [53]:
#Creating a validation set
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)
# Rename the default "test" split to "validation"
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
# Add the "test" set to our `DatasetDict`
drug_dataset_clean["test"] = drug_dataset["test"]
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

In [None]:
#save dataset
# drug_dataset_clean.save_to_disk("drug-reviews")