1. Configuration

In [5]:
import os
import pandas as pd
import random
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# ---------------------------------------------------------------------------
# Notebook settings (edit here; later cells only read these names)
# ---------------------------------------------------------------------------

# Folder scanned for input CSV files (each file name, minus its extension,
# becomes the record label) and the CSV file written by the final cell.
DATASETS_FOLDER = 'datasets'
OUTPUT_FILE = 'processed_data.csv'

# Every abstract is truncated or padded to exactly CHUNK_SIZE words.
CHUNK_SIZE = 150
# Upper bound on the number of abstracts drawn from each input file.
SAMPLE_SIZE = 200
# Passed as random_state when sampling, so repeated runs pick the same rows.
RANDOM_SEED = 51

# NLTK's English stopword list, held as a set for membership tests.
STOP_WORDS = {*stopwords.words('english')}

# Terms that name the dataset labels; clean_text() checks tokens against this
# set and substitutes '[Cluster]' for hits.
LABEL_RELATED_WORDS = set((
    'electronic health records',
    'health records',
    'EHR',
    'ehr',
    'healthcare robotics',
    'medical imaging',
    'precision medicine',
    'telemedicine',
))

2. Process Data Files

In [None]:
# Accumulates one dict per processed abstract; read by the cleaning and
# saving cells further down.
results = []
# Shared lemmatizer instance, used later by clean_text().
lemmatizer = WordNetLemmatizer()


def _is_usable_abstract(value):
    """Return True for a string abstract that is not the 'no abstract' placeholder.

    NaN and any other non-string value yield False, so a single pass with this
    predicate replaces both the NA drop and the placeholder filter.
    """
    return isinstance(value, str) and value.strip().lower() != '[no abstract available]'


def _to_fixed_chunk(abstract, size):
    """Return the first `size` whitespace-separated words of `abstract` as one string.

    Shorter abstracts are right-padded with '[PAD]' tokens so that the result
    always contains exactly `size` space-separated tokens.
    """
    words = abstract.split()[:size]
    return ' '.join(words + ['[PAD]'] * (size - len(words)))


# Check if folder exists
if not os.path.exists(DATASETS_FOLDER):
    print(f"Error: Folder '{DATASETS_FOLDER}' not found.")
else:
    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the final output reproducible across machines and runs.
    files = sorted(f for f in os.listdir(DATASETS_FOLDER) if f.endswith('.csv'))
    print(f"Found {len(files)} CSV files in '{DATASETS_FOLDER}':\n")

    for filename in files:
        file_path = os.path.join(DATASETS_FOLDER, filename)
        # The file name without its extension is used as the record label.
        label = os.path.splitext(filename)[0]

        print(f"--- Processing: {filename} ---")

        try:
            # Read CSV
            df = pd.read_csv(file_path)

            # Check for required columns
            if 'Abstract' not in df.columns or 'Title' not in df.columns:
                print(f"  [Skipped] Required columns 'Abstract' or 'Title' missing.")
                continue

            # Drop rows whose Abstract is NA, non-string, or the
            # '[No abstract available]' placeholder. Doing this BEFORE sampling
            # guarantees that every sampled row produces a record, so a file
            # with enough usable abstracts always yields SAMPLE_SIZE records.
            # astype(bool) keeps the mask valid even when the column is empty.
            initial_count = len(df)
            usable_mask = df['Abstract'].map(_is_usable_abstract).astype(bool)
            df = df[usable_mask]

            filtered_count = len(df)
            if filtered_count < initial_count:
                print(f"  Filtered out {initial_count - filtered_count} records with missing or '[No abstract available]' content.")

            total_docs = len(df)

            # Randomly sample up to SAMPLE_SIZE documents (seeded for reproducibility)
            if total_docs > SAMPLE_SIZE:
                sampled_df = df.sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)
                print(f"  Sampling: Selected {SAMPLE_SIZE} out of {total_docs} documents.")
            else:
                sampled_df = df
                print(f"  Taking all: Used all {total_docs} documents.")

            # Build this file's records in a local list first, so an exception
            # part-way through never leaves a half-processed file in `results`.
            file_records = []
            for _, row in sampled_df.iterrows():
                file_records.append({
                    'Content': _to_fixed_chunk(row['Abstract'], CHUNK_SIZE),
                    'Paper Name': row['Title'],
                    'Label': label,
                    # .get() tolerates files that lack these optional columns.
                    'Document Type': row.get('Document Type', ''),
                    'Affiliations': row.get('Affiliations', ''),
                })

            results.extend(file_records)
            file_record_count = len(file_records)

            print(f"  > Generated {file_record_count} valid records from this file.\n")

        except Exception as e:
            print(f"  [Error] Failed to read {filename}: {e}\n")

print(f"Processing complete. Total records collected: {len(results)}")

Found 5 CSV files in 'datasets':

--- Processing: electronic health records.csv ---
  Sampling: Selected 200 out of 20000 documents.
  > Generated 200 valid records from this file.

--- Processing: healthcare robotics.csv ---
  Sampling: Selected 200 out of 7861 documents.
  > Generated 200 valid records from this file.

--- Processing: medical imaging.csv ---
  Sampling: Selected 200 out of 20000 documents.
  > Generated 200 valid records from this file.

--- Processing: precision medicine.csv ---
  Sampling: Selected 200 out of 20000 documents.
  > Generated 200 valid records from this file.

--- Processing: telemedicine.csv ---
  Sampling: Selected 200 out of 20000 documents.
  > Generated 200 valid records from this file.

Processing complete. Total records collected: 1000


3. Clean Data

In [7]:
# define a function to clean text
def clean_text(text):
    """Normalize one chunk of abstract text into a space-joined string of lemmas.

    Steps, in order:
      1. Lower-case the text and drop the '[PAD]' filler tokens that the
         chunking step appends to short abstracts (otherwise they would
         survive as the word 'pad').
      2. Remove URLs and e-mail addresses.
      3. Replace every character that is not a letter or whitespace with a space.
      4. Tokenize on whitespace and lemmatize each token.
      5. Replace every occurrence of a LABEL_RELATED_WORDS entry, including
         multi-word phrases and matched on lemmas so singular/plural variants
         are covered, with a single '[Cluster]' token. Longer phrases win over
         shorter ones that start at the same position.
      6. Drop one-letter tokens and stopwords; keep the lemma of everything else.

    Args:
        text: The raw text to clean. Any non-string value is treated as empty.

    Returns:
        The cleaned text as a single space-separated string ("" for
        non-string input).

    Relies on the module-level ``lemmatizer``, ``STOP_WORDS`` and
    ``LABEL_RELATED_WORDS`` objects.
    """
    if not isinstance(text, str):
        return ""

    # 1. Convert to lowercase and strip the padding placeholder
    text = text.lower()
    text = text.replace('[pad]', ' ')

    # 2. Remove URLs and emails
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\S+@\S+', '', text)

    # 3. Remove special characters and numbers (keep letters and spaces)
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # 4. Tokenize and lemmatize
    words = text.split()
    lemmas = [lemmatizer.lemmatize(word) for word in words]

    # Label entries as tuples of lemmas, longest first. A per-token membership
    # test can never match a multi-word entry such as
    # 'electronic health records' (or the upper-case 'EHR' after lower-casing),
    # so matching is done on lower-cased lemma sequences instead.
    label_phrases = sorted(
        {
            tuple(lemmatizer.lemmatize(token) for token in phrase.lower().split())
            for phrase in LABEL_RELATED_WORDS
            if phrase.strip()
        },
        key=len,
        reverse=True,
    )

    cleaned_words = []
    position = 0
    while position < len(words):
        # 5. Mask label phrases first, so they are masked even if one of their
        #    words happens to be a stopword.
        matched_length = next(
            (
                len(phrase)
                for phrase in label_phrases
                if tuple(lemmas[position:position + len(phrase)]) == phrase
            ),
            0,
        )
        if matched_length:
            cleaned_words.append('[Cluster]')
            position += matched_length
            continue

        word, lemma_word = words[position], lemmas[position]
        position += 1

        # 6. Skip single letters, then stopwords (checked on both the surface
        #    form and the lemma).
        if len(word) < 2:
            continue
        if word not in STOP_WORDS and lemma_word not in STOP_WORDS:
            cleaned_words.append(lemma_word)

    return " ".join(cleaned_words)

if not results:
    # Nothing was collected by the processing cell, so there is nothing to clean.
    print("[Warning] No data generated. Review the source files and logic.")
else:
    # Build the frame from the collected records and derive the cleaned text
    # column from 'Content' in a single expression.
    result_df = pd.DataFrame(results).assign(
        Cleaned_Content=lambda frame: frame["Content"].apply(clean_text)
    )

4. Save the File

In [8]:
if results:
    # --- Display Info ---
    row_count, column_count = result_df.shape
    print(f"Total Rows: {row_count}")
    print(f"Total Columns: {column_count}")

    # Number of records collected for each label.
    print("\n--- Class Distribution (Records per Label) ---")
    label_counts = result_df['Label'].value_counts()
    print(label_counts)

    # Prefer the rich notebook rendering; fall back to plain text when the
    # `display` helper is not defined (e.g. when run outside IPython).
    print("\n--- First 5 Records ---")
    try:
        display(result_df.head())
    except NameError:
        print(result_df.head())

    # Save to CSV (no index column; UTF-8 with BOM)
    result_df.to_csv(OUTPUT_FILE, index=False, encoding='utf-8-sig')
    print(f"\n[Success] Data saved to: {OUTPUT_FILE}")

Total Rows: 1000
Total Columns: 6

--- Class Distribution (Records per Label) ---
Label
electronic health records    200
healthcare robotics          200
medical imaging              200
precision medicine           200
telemedicine                 200
Name: count, dtype: int64

--- First 5 Records ---


Unnamed: 0,Content,Paper Name,Label,Document Type,Affiliations,Cleaned_Content
0,Objective: This study aimed to explore the cha...,Exploring the Challenges Student Pharmacists C...,electronic health records,Article,,objective study aimed explore challenge pharma...
1,Introduction: Breast cancer accounted for 21.9...,Ten-year survival in early-stage breast cancer...,electronic health records,Article,,introduction breast cancer accounted cancer de...
2,Introduction: There has been a steady decline ...,Association Between Opioid Dosage Tapering and...,electronic health records,Article,,introduction steady decline national opioid di...
3,Objective: Social determinants of health (SDoH...,Screening for Social Determinants of Health in...,electronic health records,Article,,objective social determinant health sdoh impac...
4,Background: The Medical Informatics Initiative...,Leveraging Interoperable Electronic Health Rec...,electronic health records,Article,,background medical informatics initiative mii ...



[Success] Data saved to: processed_data.csv
