In [2]:
!pip install datasets

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [29]:
import html
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset, load_from_disk

In [2]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

--2023-11-29 14:33:06--  https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘drugsCom_raw.zip’

drugsCom_raw.zip        [             <=>    ]  41.00M  13.0MB/s    in 3.2s    

2023-11-29 14:33:10 (13.0 MB/s) - ‘drugsCom_raw.zip’ saved [42989872]

Archive:  drugsCom_raw.zip
  inflating: drugsComTest_raw.tsv    
  inflating: drugsComTrain_raw.tsv   


In [5]:
data_files = {"train": "drugsComTrain_raw.tsv", "test": "drugsComTest_raw.tsv"}
# \t is the tab character in Python
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

Downloading data files: 100%|██████████| 2/2 [00:00<00:00, 13231.24it/s]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 25.79it/s]
Generating train split: 161297 examples [00:01, 98242.99 examples/s] 
Generating test split: 53766 examples [00:00, 81520.23 examples/s] 


In [6]:
drug_sample = drug_dataset["train"].shuffle(seed=42).select(range(1000))
# Peek at the first few examples
drug_sample[:3]

{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than 

In [7]:
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [8]:
drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)

Filter: 100%|██████████| 161297/161297 [00:01<00:00, 133319.43 examples/s]
Filter: 100%|██████████| 53766/53766 [00:00<00:00, 124706.75 examples/s]


In [10]:
def lowercase_condition(example):
    return {"condition": example["condition"].lower()}

drug_dataset = drug_dataset.map(lowercase_condition)

Map: 100%|██████████| 160398/160398 [00:20<00:00, 7949.74 examples/s]
Map: 100%|██████████| 53471/53471 [00:06<00:00, 8018.00 examples/s]


In [11]:
# Check that lowercasing worked
drug_dataset["train"]["condition"][:3]

['left ventricular dysfunction', 'adhd', 'birth control']

In [12]:
def compute_review_length(example):
    return {"review_length": len(example["review"].split())}

drug_dataset = drug_dataset.map(compute_review_length)

Map: 100%|██████████| 160398/160398 [00:16<00:00, 9826.10 examples/s] 
Map: 100%|██████████| 53471/53471 [00:05<00:00, 9942.07 examples/s] 


In [13]:
# Inspect the first training example
drug_dataset["train"][0]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [14]:
drug_dataset["train"].sort("review_length")[:3]

{'patient_id': [111469, 13653, 53602],
 'drugName': ['Ledipasvir / sofosbuvir',
  'Amphetamine / dextroamphetamine',
  'Alesse'],
 'condition': ['hepatitis c', 'adhd', 'birth control'],
 'review': ['"Headache"', '"Great"', '"Awesome"'],
 'rating': [10.0, 10.0, 10.0],
 'date': ['February 3, 2015', 'October 20, 2009', 'November 23, 2015'],
 'usefulCount': [41, 3, 0],
 'review_length': [1, 1, 1]}

In [15]:
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
print(drug_dataset.num_rows)

Filter: 100%|██████████| 160398/160398 [00:01<00:00, 122161.27 examples/s]
Filter: 100%|██████████| 53471/53471 [00:00<00:00, 115916.74 examples/s]

{'train': 138514, 'test': 46108}





In [17]:
drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})

Map: 100%|██████████| 138514/138514 [00:20<00:00, 6922.58 examples/s]
Map: 100%|██████████| 46108/46108 [00:06<00:00, 6728.00 examples/s]


In [19]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

tokenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 187kB/s]
vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 1.18MB/s]
tokenizer.json: 100%|██████████| 436k/436k [00:00<00:00, 1.61MB/s]


In [20]:
def tokenize_function(examples):
    return tokenizer(examples["review"], truncation=True)

In [21]:
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 138514/138514 [00:28<00:00, 4898.42 examples/s]
Map: 100%|██████████| 46108/46108 [00:09<00:00, 4843.94 examples/s]

CPU times: user 1min 35s, sys: 485 ms, total: 1min 35s
Wall time: 37.8 s





In [22]:
def tokenize_and_split(examples):
    result = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    # Extract mapping between new and old indices
    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result

In [23]:
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)
tokenized_dataset

Map: 100%|██████████| 138514/138514 [00:33<00:00, 4098.78 examples/s]
Map: 100%|██████████| 46108/46108 [00:10<00:00, 4209.45 examples/s]


DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 206772
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 68876
    })
})

In [25]:
drug_dataset.set_format("pandas")

In [26]:
drug_dataset["train"][:3]

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,birth control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134
2,138000,Ortho Evra,birth control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89


In [27]:
train_df = drug_dataset["train"][:]

In [28]:
frequencies = (
    train_df["condition"]
    .value_counts()
    .to_frame()
    .reset_index()
    .rename(columns={"index": "condition", "condition": "frequency"})
)
frequencies.head()

Unnamed: 0,frequency,count
0,birth control,27655
1,depression,8023
2,acne,5209
3,anxiety,4991
4,pain,4744


In [30]:
freq_dataset = Dataset.from_pandas(frequencies)
freq_dataset

Dataset({
    features: ['frequency', 'count'],
    num_rows: 819
})

In [31]:
drug_dataset.reset_format()

In [32]:
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)
# Rename the default "test" split to "validation"
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
# Add the "test" set to our `DatasetDict`
drug_dataset_clean["test"] = drug_dataset["test"]
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

In [33]:
drug_dataset_clean.save_to_disk("drug-reviews")

Saving the dataset (1/1 shards): 100%|██████████| 110811/110811 [00:01<00:00, 78038.22 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 27703/27703 [00:00<00:00, 64267.89 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 46108/46108 [00:00<00:00, 182057.87 examples/s]


In [34]:
drug_dataset_reloaded = load_from_disk("drug-reviews")
drug_dataset_reloaded

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})