In [None]:
%pip install datasets

In [2]:
from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Open the training split of McGill-NLP/medal as a stream instead of downloading it in full
dataset = load_dataset(
    path='McGill-NLP/medal',
    split='train',
    streaming=True,
)

In [4]:
# Sanity check: report what kind of object the loader returned
# (the recorded run printed datasets.iterable_dataset.IterableDataset)
print(type(dataset))

# Pull a single record from a fresh iterator over the stream and show it
stream_iterator = iter(dataset)
example = next(stream_iterator)
print(example)

<class 'datasets.iterable_dataset.IterableDataset'>
{'abstract_id': 14145090, 'text': 'velvet antlers vas are commonly used in traditional chinese medicine and invigorant and contain many PET components for health promotion the velvet antler peptide svap is one of active components in vas based on structural study the svap interacts with tgfÎ² receptors and disrupts the tgfÎ² pathway we hypothesized that svap prevents cardiac fibrosis from pressure overload by blocking tgfÎ² signaling SDRs underwent TAC tac or a sham operation T3 one month rats received either svap mgkgday or vehicle for an additional one month tac surgery induced significant cardiac dysfunction FB activation and fibrosis these effects were improved by treatment with svap in the heart tissue tac remarkably increased the expression of tgfÎ² and connective tissue growth factor ctgf ROS species C2 and the phosphorylation C2 of smad and ERK kinases erk svap inhibited the increases in reactive oxygen species C2 ctgf express

In [5]:
# Accumulators for the null scan:
#   null_columns maps a column name to how many None values were seen in it
#   any_null flips to True once at least one None value is found
null_columns = {}
any_null = False

In [6]:
# Walk every streamed record and tally None-valued fields by column name
for example in dataset:
    for key, value in example.items():
        # Only None counts as null here; other values are skipped
        if value is not None:
            continue
        any_null = True
        null_columns[key] = null_columns.get(key, 0) + 1


In [5]:
# Report the overall flag first
print("Any null values in the dataset:", any_null)

# Then either confirm the dataset is clean or break the nulls down per column
if not any_null:
    print("No null values found in the dataset.")
else:
    print("Columns with null values and their counts:")
    for column, count in null_columns.items():
        print(f"{column}: {count} null values")


Any null values in the dataset: False
No null values found in the dataset.


In [11]:
# One pass over the stream: number the records and gather every key encountered
row_count = 0  # stays 0 if the stream yields nothing
column_set = set()

for row_count, example in enumerate(dataset, start=1):
    column_set.update(example.keys())

# The number of distinct keys seen is the column count
column_count = len(column_set)

# Report both totals
print(f"Total rows in the dataset: {row_count}")
print(f"Total columns in the dataset: {column_count}")


Total rows in the dataset: 3000000
Total columns in the dataset: 4


In [13]:
# Spot check: fetch the 3rd record of the stream (a fixed position, not a random one)
counter = 0  # 1-based position of the last record read; stays 0 for an empty stream
third_record = None

# Stop as soon as the 3rd record arrives so the rest of the stream is never read
for counter, example in enumerate(dataset, start=1):
    if counter == 3:
        third_record = example
        break

# Compare against None rather than relying on truthiness, so an empty (falsy)
# record is still reported instead of being mistaken for "not found"
if third_record is not None:
    print("3rd Record:")
    print(third_record)
else:
    print("The dataset has fewer than 3 records.")


3rd Record:
{'abstract_id': 8625554, 'text': 'ceftobiprole bpr is an investigational cephalosporin with activity against staphylococcus aureus including methicillinresistant s aureus mrsa strains the pharmacodynamic pd profile of bpr against s aureus strains with a variety of susceptibility phenotypes in an immunocompromised murine pneumonia model was characterized the bpr mics of the test isolates ranged from to mugml pharmacokinetic pk studies were conducted with infected neutropenic balbc mice and the bpr concentrations were measured in plasma epithelial lining fluid elf and lung tissue pd studies with these mice were undertaken with eight s aureus isolates two MSSA strains three hospitalacquired mrsa strains and three CA mrsa strains subcutaneous bpr doses of to mgkg of body weightday were administered and the NC in the number of log cfuml in lungs was evaluated after h of therapy the pd profile was characterized by using the free drug exposures f determined from the following para

In [12]:
# Read the column names off the keys of the first streamed record
first_example = next(iter(dataset))
column_names = [name for name in first_example.keys()]

# Show them in the order the record provides
print("Column names:", column_names)


Column names: ['abstract_id', 'text', 'location', 'label']


In [8]:
# Count None values per column over the whole stream, starting every column at zero
null_counts = dict.fromkeys(['abstract_id', 'text', 'location', 'label'], 0)

for example in dataset:
    for column in null_counts:
        # Only the values change during iteration; the key set stays fixed
        if example[column] is None:
            null_counts[column] = null_counts[column] + 1

print("Null values per column:", null_counts)


Null values per column: {'abstract_id': 0, 'text': 0, 'location': 0, 'label': 0}


In [7]:
# Preview the first few records
from itertools import islice

preview_count = 5  # Number of records to preview

# Take preview_count records from ONE pass over the stream. Calling
# next(iter(dataset)) once per record would build a fresh iterator each time
# and return the first record repeatedly.
preview_data = list(islice(dataset, preview_count))
for record in preview_data:
    print(record)


{'abstract_id': 14145090, 'text': 'velvet antlers vas are commonly used in traditional chinese medicine and invigorant and contain many PET components for health promotion the velvet antler peptide svap is one of active components in vas based on structural study the svap interacts with tgfÎ² receptors and disrupts the tgfÎ² pathway we hypothesized that svap prevents cardiac fibrosis from pressure overload by blocking tgfÎ² signaling SDRs underwent TAC tac or a sham operation T3 one month rats received either svap mgkgday or vehicle for an additional one month tac surgery induced significant cardiac dysfunction FB activation and fibrosis these effects were improved by treatment with svap in the heart tissue tac remarkably increased the expression of tgfÎ² and connective tissue growth factor ctgf ROS species C2 and the phosphorylation C2 of smad and ERK kinases erk svap inhibited the increases in reactive oxygen species C2 ctgf expression and the phosphorylation of smad and erk but not 

In [None]:
%pip install datasets transformers nltk


In [None]:
# Import libraries
import re
import nltk
from datasets import load_dataset

# Download the NLTK data used below: tokenizer models, the stopword list and WordNet.
# 'punkt_tab' is the tokenizer resource that recent NLTK releases load for
# word_tokenize; 'punkt' is still requested so older releases keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


In [11]:
# Shared helpers for the text-cleaning functions below
lemmatizer = WordNetLemmatizer()  # used to lemmatize each token

# English stopwords held in a set for membership tests during filtering
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)


In [18]:
# Text normalisation used by preprocess_example
def clean_text(text):
    """Return `text` lowercased, stripped to letters, stopword-filtered and lemmatized.

    Steps, in order:
      1. lowercase the input;
      2. delete every character that is not an ASCII letter or whitespace
         (digits and punctuation are removed, not replaced by a space);
      3. tokenize with `word_tokenize`;
      4. drop tokens found in `stop_words`;
      5. lemmatize the remaining tokens with `lemmatizer`.

    The surviving tokens are joined with single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    # Filter and lemmatize in one pass; a token is lemmatized only if it is kept
    kept_lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(letters_only)
        if token not in stop_words
    ]
    return ' '.join(kept_lemmas)

# Per-record preprocessing applied to each streamed example
def preprocess_example(example):
    """Clean the 'text' field and normalise the 'location' field of one record.

    The record is modified in place and the same object is returned.

    - 'text' is replaced by `clean_text(example['text'])`.
    - 'location' is stripped and lowercased when it is a non-blank string;
      otherwise it is set to 'unknown'.
    - 'label' and any other fields are left untouched.
    """
    example['text'] = clean_text(example['text'])

    location = example['location']
    # NOTE(review): any non-string location (e.g. a list or a number) is
    # overwritten with 'unknown' -- confirm that is intended for this dataset.
    if isinstance(location, str) and location.strip():
        example['location'] = location.strip().lower()
    else:
        example['location'] = 'unknown'

    return example



In [19]:
# Apply preprocess_example lazily to every record of the stream.
# The dataset's own .map is used instead of the builtin map(): builtin map
# returns a one-shot iterator that is partly used up by the first loop over it,
# while the mapped streaming dataset can be iterated again from the start.
preprocessed_dataset = dataset.map(preprocess_example)



In [20]:
# Show the first 5 preprocessed examples, numbered from 1, then stop reading
for position, example in enumerate(preprocessed_dataset, start=1):
    if position > 5:
        break
    print(f"Example {position}:\n", example, "\n")



Example 1:
 {'abstract_id': 14145090, 'text': 'velvet antler va commonly used traditional chinese medicine invigorant contain many pet component health promotion velvet antler peptide svap one active component va based structural study svap interacts tgf receptor disrupts tgf pathway hypothesized svap prevents cardiac fibrosis pressure overload blocking tgf signaling sdrs underwent tac tac sham operation one month rat received either svap mgkgday vehicle additional one month tac surgery induced significant cardiac dysfunction fb activation fibrosis effect improved treatment svap heart tissue tac remarkably increased expression tgf connective tissue growth factor ctgf ro specie c phosphorylation c smad erk kinase erk svap inhibited increase reactive oxygen specie c ctgf expression phosphorylation smad erk tgf expression cultured cardiac fibroblast angiotensin ii ang ii similar effect compared tac surgery increase smapositive cf collagen synthesis svap eliminated effect disrupting tgf ib

In [21]:
def clean_text(text):
    """Return `text` lowercased, lightly filtered, stopword-filtered and lemmatized.

    This redefines the `clean_text` from the earlier cell. Unlike that version,
    digits and hyphens are kept: only characters other than ASCII letters,
    digits, whitespace and '-' are deleted. The remaining text is tokenized
    with `word_tokenize`, tokens in `stop_words` are dropped, and the rest are
    lemmatized with `lemmatizer` and joined with single spaces.
    """
    filtered = re.sub(r'[^a-zA-Z0-9\s-]', '', text.lower())
    # Filter and lemmatize in one pass; a token is lemmatized only if it is kept
    kept_lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(filtered)
        if token not in stop_words
    ]
    return ' '.join(kept_lemmas)


In [23]:
# Gather the 'abstract_id' of every record in the stream (the full pass, not a subset)
sample_abstract_ids = [record['abstract_id'] for record in dataset]

# Compare the total number of ids with the number of distinct ids
total_ids = len(sample_abstract_ids)
unique_ids = len(set(sample_abstract_ids))

print(f"Total abstract_id entries: {total_ids}")
print(f"Unique abstract_id entries: {unique_ids}")
print(f"Are all abstract_id unique? {'Yes' if unique_ids == total_ids else 'No'}")


Total abstract_id entries: 3000000
Unique abstract_id entries: 2531051
Are all abstract_id unique? No


In [24]:
from collections import Counter

# Tally how many times each abstract_id occurs
id_counts = Counter(sample_abstract_ids)

# Keep only the ids that occur more than once, with their occurrence counts
duplicate_ids = {
    abstract_id: occurrences
    for abstract_id, occurrences in id_counts.items()
    if occurrences > 1
}

# Report how many distinct ids are duplicated, plus the first ten as examples
first_ten_duplicates = list(duplicate_ids.items())[:10]
print(f"Number of duplicate abstract_id entries: {len(duplicate_ids)}")
print(f"Some duplicate abstract_ids and their counts: {first_ten_duplicates}")


Number of duplicate abstract_id entries: 379515
Some duplicate abstract_ids and their counts: [(1900667, 3), (8625554, 2), (9441271, 2), (4015815, 2), (7599020, 2), (373515, 2), (4578351, 2), (5432442, 6), (3937052, 2), (2272969, 3)]


In [29]:
from collections import defaultdict

import pandas as pd

# Keep the first record seen for each abstract_id; later repeats are dropped
seen_ids = set()        # abstract_ids already encountered
unique_entries = []     # first record for each distinct abstract_id, in stream order

for example in dataset:
    abstract_id = example['abstract_id']
    if abstract_id in seen_ids:
        continue
    seen_ids.add(abstract_id)
    unique_entries.append(example)

# Build a DataFrame from the de-duplicated records for further analysis
df_unique = pd.DataFrame(unique_entries)

# Report how many records survived de-duplication
print(f"Number of unique entries: {len(df_unique)}")



Number of unique entries: 2531051
