In [1]:
import sys
from pathlib import Path
# Put the parent of the current working directory on the import path before
# the `src` import below runs.
# NOTE(review): presumably `src/` lives one level above the notebook's
# working directory — confirm against the repository layout.
sys.path.append(str(Path().resolve().parent))
from nltk.corpus import stopwords
import re
import time
from src.data_loader import get_combine_data

In [2]:
# Load the dataset through the project's loader (src.data_loader). The result
# is later passed to create_processed_data, which copies it and reads its
# "Text" column.
combined_data = get_combine_data()

In [3]:
# English stop words from NLTK, held in a set so the per-token membership
# test in clean_text is a hash lookup.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# available locally — confirm the environment setup.
STOP_WORDS = set(stopwords.words("english"))


def clean_text(text: str, stop_words: set[str]) -> str:
    """
    Normalise a document to lower-case alphanumeric tokens without stop words.

    Parameters
    ----------
    text:
        Raw text. Any value that is not a ``str`` (for example ``None`` or a
        NaN float) is handed back unchanged.
    stop_words:
        Lower-case tokens to discard.

    Returns
    -------
    The surviving tokens joined by single spaces, or ``text`` itself when it
    is not a string.
    """
    if isinstance(text, str):
        # Everything outside ASCII letters/digits (newlines excepted) becomes
        # a space; whitespace runs, newlines included, are then squeezed.
        alnum_only = re.sub(r"[^a-zA-Z0-9\n]", " ", text)
        squeezed = re.sub(r"\s+", " ", alnum_only)

        # Lower-casing happens after the ASCII filter, so only ASCII letters
        # are ever case-folded.
        kept = filter(
            lambda token: token not in stop_words,
            squeezed.lower().split(),
        )
        return " ".join(kept)

    return text

In [6]:
def create_processed_data(df):
    """
    Build a cleaned copy of ``df`` for EDA.

    - The copy's 'Text' column holds the output of ``clean_text`` applied to
      every row with the module-level ``STOP_WORDS``.
    - ``df`` itself is not modified, so the raw 'Text' stays available there.
    - Prints the elapsed wall-clock time of the preprocessing step.

    Returns the new dataframe.
    """
    t0 = time.perf_counter()

    result = df.copy()
    # Series.apply forwards `args` as extra positional arguments, so each
    # row is processed as clean_text(value, STOP_WORDS).
    result["Text"] = result["Text"].apply(clean_text, args=(STOP_WORDS,))

    elapsed = time.perf_counter() - t0
    print(f"Text preprocessing completed in {elapsed:.2f} seconds")

    return result

In [7]:
# Clean the text on a copy; combined_data keeps the raw 'Text' column.
processed_data = create_processed_data(combined_data)
# Bare last expression so the notebook renders the resulting frame.
processed_data

Text preprocessing completed in 10.67 seconds


Unnamed: 0,ID,Text,Gene,Variation,Class
0,0,cyclin dependent kinases cdks regulate variety...,FAM58A,Truncating Mutations,1
1,1,abstract background non small cell lung cancer...,CBL,W802*,2
2,2,abstract background non small cell lung cancer...,CBL,Q249E,2
3,3,recent evidence demonstrated acquired uniparen...,CBL,N454D,3
4,4,oncogenic mutations monomeric casitas b lineag...,CBL,L399V,4
...,...,...,...,...,...
3316,3316,introduction myelodysplastic syndromes mds het...,RUNX1,D171N,4
3317,3317,introduction myelodysplastic syndromes mds het...,RUNX1,A122*,1
3318,3318,runt related transcription factor 1 gene runx1...,RUNX1,Fusions,1
3319,3319,runx1 aml1 gene frequent target chromosomal tr...,RUNX1,R80C,4
