In [52]:
import pandas as pd
import re
import os

In [53]:
# Load the raw Kaggle export with the pure-Python parser. ISO-8859-1 maps every
# byte to a character, so decoding itself cannot fail; rows the parser cannot
# split are skipped instead of raising.
# NOTE(review): the preview below shows sequences such as "ï¬b", which look
# like UTF-8 text decoded as Latin-1 — confirm the file's true encoding.
df_kaggle = pd.read_csv(
    "./data/raw/alldata_1_for_kaggle.csv",
    engine="python",
    on_bad_lines="skip",
    encoding="ISO-8859-1",
)

In [54]:
# Original load from a local path on my macOS machine (kept for reference)

# df_kaggle = pd.read_csv(
#     "/Users/Rex/vscode/ai_lora_tuning/alldata_1_for_kaggle.csv",
#     encoding="ISO-8859-1",
#     on_bad_lines="skip",
#     engine="python"
# )

In [55]:
# Preview the first 50 rows of the raw frame.
df_kaggle.head(50)

Unnamed: 0.1,Unnamed: 0,0,a
0,0,Thyroid_Cancer,Thyroid surgery in children in a single insti...
1,1,Thyroid_Cancer,""" The adopted strategy was the same as that us..."
2,2,Thyroid_Cancer,coronary arterybypass grafting thrombosis ï¬b...
3,3,Thyroid_Cancer,Solitary plasmacytoma SP of the skull is an u...
4,4,Thyroid_Cancer,This study aimed to investigate serum matrix ...
5,5,Thyroid_Cancer,This study was performed to explore the effec...
6,6,Thyroid_Cancer,This study was performed assess the clinical ...
7,7,Thyroid_Cancer,Journal of International Medical Research  Th...
8,8,Thyroid_Cancer,Gastric cancer GC persists as a worldwide pub...
9,9,Thyroid_Cancer,Scars Burns HealingVolume  reuse guideli...


In [56]:
# (rows, columns) of the raw frame.
df_kaggle.shape

(7570, 3)

In [57]:
# Count rows per distinct value of column '0' (the cancer-type names seen in
# the preview) to check how balanced the classes are.
unique = df_kaggle['0'].value_counts()
print(unique)

0
Thyroid_Cancer    2810
Colon_Cancer      2580
Lung_Cancer       2180
Name: count, dtype: int64


In [60]:
def clean_biomed_csv(input_csv_path: str, output_csv_path: str = None) -> str:
    """
    Clean the text columns of a biomedical CSV and write the result to a new CSV.

    Steps:
      - Load the file, trying utf-8, then cp1252, then ISO-8859-1, keeping the
        first encoding that decodes without error. Malformed rows are skipped
        (``on_bad_lines="skip"``).
      - In every text column: remove all double quotes, replace any character
        that is not a word character, whitespace, or one of ``.,!?;:'()-`` with
        a space, collapse whitespace runs into one space, and strip the ends.
        Missing values are left untouched.
      - Save the cleaned frame without the index.

    Args:
        input_csv_path: Path of the CSV to clean.
        output_csv_path: Destination path. When omitted (or empty), the output
            is written next to the input as ``<name>_cleaned<ext>``. Missing
            parent directories are created.

    Returns:
        The path the cleaned CSV was written to.

    Raises:
        ValueError: If none of the candidate encodings can decode the file.
    """
    encodings_to_try = ["utf-8", "cp1252", "ISO-8859-1"]
    df = None
    for enc in encodings_to_try:
        try:
            df = pd.read_csv(input_csv_path, encoding=enc, on_bad_lines="skip", engine="python")
        except UnicodeDecodeError:
            print(f"Failed with encoding: {enc}, trying next...")
        else:
            print(f"Loaded CSV using encoding: {enc}")
            break

    if df is None:
        raise ValueError("Could not load the CSV with the tried encodings.")

    # Compile once instead of on every cell of every column.
    double_quotes = re.compile(r'"+')
    # Double quotes are stripped before this pattern runs, so they do not need
    # to be whitelisted here. \w is Unicode-aware for str patterns, so
    # non-ASCII letters (including mojibake such as "ï") are kept.
    unwanted_chars = re.compile(r"[^\w\s.,!?;:'()-]")
    whitespace_run = re.compile(r"\s+")

    def clean_text(text):
        if pd.isna(text):
            return text
        text = double_quotes.sub("", str(text))
        text = unwanted_chars.sub(" ", text)
        text = whitespace_run.sub(" ", text)
        return text.strip()

    # Select text columns by dtype predicate rather than
    # select_dtypes(include=["object"]): that call skips columns held in
    # pandas' dedicated string dtype, which would then silently go uncleaned.
    for col, dtype in df.dtypes.items():
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            df[col] = df[col].apply(clean_text)

    if not output_csv_path:
        base, ext = os.path.splitext(input_csv_path)
        output_csv_path = f"{base}_cleaned{ext}"

    # to_csv refuses to write into a directory that does not exist, so create
    # it first. A bare filename has no directory part and needs nothing.
    out_dir = os.path.dirname(output_csv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df.to_csv(output_csv_path, index=False)
    print(f"Cleaned CSV saved to: {output_csv_path}")

    return output_csv_path


# --------------------------
# Run the cleaner on the raw Kaggle file (same notebook)
# --------------------------
input_path = "./data/raw/alldata_1_for_kaggle.csv"
output_path = "./data/processed/alldata_1_for_kaggle_cleaned.csv"
cleaned_path = clean_biomed_csv(
    input_csv_path=input_path,
    output_csv_path=output_path,
)

# Read the cleaned file back from disk to sanity-check the result
df_cleaned = pd.read_csv(cleaned_path)
df_cleaned.head()


Failed with encoding: utf-8, trying next...
Failed with encoding: cp1252, trying next...
Loaded CSV using encoding: ISO-8859-1
Cleaned CSV saved to: ./data/raw/alldata_1_for_kaggle_cleaned.csv


Unnamed: 0.1,Unnamed: 0,0,a
0,0,Thyroid_Cancer,Thyroid surgery in children in a single instit...
1,1,Thyroid_Cancer,The adopted strategy was the same as that used...
2,2,Thyroid_Cancer,coronary arterybypass grafting thrombosis ï br...
3,3,Thyroid_Cancer,Solitary plasmacytoma SP of the skull is an un...
4,4,Thyroid_Cancer,This study aimed to investigate serum matrix m...


In [61]:
# Column names as read back from the cleaned CSV.
df_cleaned.columns

Index(['Unnamed: 0', '0', 'a'], dtype='object')

In [62]:
# Drop the 'Unnamed: 0' column (presumably a saved row index from the original
# export — it mirrors the row number in the preview). errors="ignore" keeps
# this cell safe to re-run: without it, a second run raises KeyError because
# the column is already gone.
df_cleaned = df_cleaned.drop(columns=['Unnamed: 0'], errors="ignore")
# Give the remaining columns descriptive names; rename() already ignores keys
# that are not present, so this line is re-runnable as well.
df_cleaned = df_cleaned.rename(columns={'0': 'label', 'a': 'text'})

In [63]:
# Preview the first 50 rows after dropping and renaming columns.
df_cleaned.head(50)

Unnamed: 0,label,text
0,Thyroid_Cancer,Thyroid surgery in children in a single instit...
1,Thyroid_Cancer,The adopted strategy was the same as that used...
2,Thyroid_Cancer,coronary arterybypass grafting thrombosis ï br...
3,Thyroid_Cancer,Solitary plasmacytoma SP of the skull is an un...
4,Thyroid_Cancer,This study aimed to investigate serum matrix m...
5,Thyroid_Cancer,This study was performed to explore the effect...
6,Thyroid_Cancer,This study was performed assess the clinical o...
7,Thyroid_Cancer,Journal of International Medical Research The ...
8,Thyroid_Cancer,Gastric cancer GC persists as a worldwide publ...
9,Thyroid_Cancer,Scars Burns HealingVolume reuse guidelinessage...


In [64]:
# Per-label row counts after cleaning; compare with the counts taken from the
# raw frame to confirm no rows were lost.
unique = df_cleaned['label'].value_counts()
print(unique)

label
Thyroid_Cancer    2810
Colon_Cancer      2580
Lung_Cancer       2180
Name: count, dtype: int64


In [65]:
# Check the final column set before saving.
df_cleaned.columns

Index(['label', 'text'], dtype='object')

In [66]:
# Save DataFrame to CSV. Create the target directory first: to_csv raises if
# the directory does not exist (e.g. on a fresh checkout).
os.makedirs("./data/processed", exist_ok=True)
df_cleaned.to_csv("./data/processed/df_cleaned.csv", index=False)

# 10/10/25
# This step ran quickly.
# Next step: prepare_data.py