In [1]:
pip install pandas


Active code page: 1252
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd


In [3]:
# Location of the raw CSV that the loading cell passes to pd.read_csv.
# NOTE(review): both paths below are absolute and machine-specific; they must be
# edited before the notebook can run on another machine.
RAW_PATH = r"C:\Users\satya\Downloads\biomedical-langgraph-rag\task\data\raw\pubmed_abstracts.csv"
# Location of the cleaned single-column CSV produced at the end of the notebook.
OUTPUT_PATH = r"C:\Users\satya\Downloads\biomedical-langgraph-rag\task\data\processed\cleaned_pubmed.csv"


In [4]:
# Load the raw CSV at RAW_PATH into a DataFrame.
df = pd.read_csv(RAW_PATH)

# Report the frame's dimensions, then leave `df.columns` as the cell's last
# expression so the notebook displays the column Index.
print("Dataset shape:", df.shape)
print("Columns in dataset:")
df.columns


Dataset shape: (13200, 17)
Columns in dataset:


Index(['Unnamed: 0', 'deep_learning', 'covid_19', 'human_connectome',
       'virtual_reality', 'brain_machine_interfaces', 'electroactive_polymers',
       'pedot_electrodes', 'neuroprosthetics', 'deep_learning_links',
       'covid_19_links', 'human_connectome_links', 'virtual_reality_links',
       'brain_machine_interfaces_links', 'electroactive_polymers_links',
       'pedot_electrodes_links', 'neuroprosthetics_links'],
      dtype='object')

In [6]:
# Split the schema by column name: names ending in "_links" are set aside,
# "Unnamed: 0" is excluded by name, and every remaining column counts as text.
link_columns = [name for name in df.columns if name.endswith("_links")]
text_columns = [
    name
    for name in df.columns
    if name != "Unnamed: 0" and name not in link_columns
]

# Same four output lines as printing label and list separately.
print("Text columns:", text_columns, sep="\n")
print("\nLink columns (will be dropped):", link_columns, sep="\n")


Text columns:
['deep_learning', 'covid_19', 'human_connectome', 'virtual_reality', 'brain_machine_interfaces', 'electroactive_polymers', 'pedot_electrodes', 'neuroprosthetics']

Link columns (will be dropped):
['deep_learning_links', 'covid_19_links', 'human_connectome_links', 'virtual_reality_links', 'brain_machine_interfaces_links', 'electroactive_polymers_links', 'pedot_electrodes_links', 'neuroprosthetics_links']


In [10]:
import ast

def normalize_text_value(value):
    """
    Convert one raw cell value into plain text.

    Handled inputs, either as native objects or as stringified Python literals:
      * missing value (NaN / None) -> ""
      * tuple                      -> its first element is normalized further
                                      (an empty tuple yields "")
      * list                       -> items joined with single spaces
      * string                     -> stripped; if it parses as a Python literal
                                      the tuple/list/str rules above apply to
                                      the parsed object, otherwise the stripped
                                      text is returned unchanged

    Parameters
    ----------
    value : object
        Raw cell content.

    Returns
    -------
    str
        The normalized text. Any other value is returned as ``str(value)``.
    """
    # Containers are unwrapped BEFORE the missing-value check: for a list,
    # pd.isna() returns an element-wise array, and testing that array in an
    # `if` raises for multi-element lists.
    if isinstance(value, tuple):
        value = value[0] if value else ""
    if isinstance(value, list):
        # Same treatment the stringified-list path gets below.
        return " ".join(str(item) for item in value)

    if not isinstance(value, str):
        return "" if pd.isna(value) else str(value)

    text = value.strip()

    try:
        parsed = ast.literal_eval(text)
    except Exception:
        # Deliberately best-effort: ordinary prose is not a Python literal,
        # so any parse failure means "use the text as it is".
        return text

    if isinstance(parsed, tuple):
        parsed = parsed[0] if parsed else ""
    if isinstance(parsed, list):
        return " ".join(str(item) for item in parsed)
    if isinstance(parsed, str):
        return parsed

    # Any other literal (number, dict, None, ...) keeps its original text form.
    return text


def combine_text(row):
    """
    Merge the text columns of one row into a single string.

    Each column listed in ``text_columns`` is normalized with
    ``normalize_text_value``; values that are empty or whitespace-only after
    normalization are skipped, and the rest are joined with single spaces in
    column order.
    """
    normalized = (normalize_text_value(row[name]) for name in text_columns)
    return " ".join(piece for piece in normalized if piece.strip())


# Build one merged text per row: with axis=1, combine_text is called once per
# row and its return value becomes that row's "combined_text".
df["combined_text"] = df.apply(combine_text, axis=1)

# Preview the first rows of the new column.
df["combined_text"].head()


0    Magnetic resonance spectroscopic imaging (MRSI...
1    Existing deep convolutional neural networks (C...
2    Deep learning techniques have been increasingl...
3    The original article unfortunately contained a...
4    The most common applications of artificial int...
Name: combined_text, dtype: object

In [11]:
# Every line-break or tab character is mapped to a single space. "\r" is
# included so CRLF or bare-CR text does not leave carriage returns in the output.
_LINEBREAKS_TO_SPACE = str.maketrans({"\n": " ", "\t": " ", "\r": " "})


def clean_text(text):
    """
    Flatten a text value onto a single line.

    Parameters
    ----------
    text : object
        Value to clean; non-strings are converted with ``str()``.

    Returns
    -------
    str
        "" for a missing value (NaN / None); otherwise the text with each
        newline, carriage return and tab replaced by one space and with
        leading/trailing whitespace removed. Runs of spaces are not collapsed.
    """
    if pd.isna(text):
        return ""
    return str(text).translate(_LINEBREAKS_TO_SPACE).strip()

# Element-wise cleanup of the merged text, stored in a new "clean_text" column.
df["clean_text"] = df["combined_text"].map(clean_text)


In [12]:
# A row is kept only if its cleaned text is longer than this many characters.
# NOTE(review): presumably meant to drop empty or near-empty rows; confirm
# that 50 is the intended cut-off.
MIN_TEXT_LENGTH = 50

# Build the boolean mask first so the row selection is explicit.
has_enough_text = df["clean_text"].str.len() > MIN_TEXT_LENGTH
df = df.loc[has_enough_text]
print("Rows after cleaning:", df.shape)


Rows after cleaning: (13153, 19)


In [13]:
# Keep only the cleaned text, exposed as a one-column frame named "text".
final_df = df["clean_text"].to_frame(name="text")

print("Final dataset shape:", final_df.shape)
final_df.head()


Final dataset shape: (13153, 1)


Unnamed: 0,text
0,Magnetic resonance spectroscopic imaging (MRSI...
1,Existing deep convolutional neural networks (C...
2,Deep learning techniques have been increasingl...
3,The original article unfortunately contained a...
4,The most common applications of artificial int...


In [14]:
# Write to the OUTPUT_PATH defined in the configuration cell instead of
# repeating the path literal, so the destination is set in one place only.
# index=False keeps the DataFrame index out of the CSV.
final_df.to_csv(OUTPUT_PATH, index=False)

print("✅ Cleaned PubMed dataset saved successfully")


✅ Cleaned PubMed dataset saved successfully
