PES2UG22CS542_SHRIHARSHA MOGRA_TEXT PREPROCESSING

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder

# Make sure required NLTK resources are available.
#   - 'stopwords': English stop-word list used when filtering tokens.
#   - 'wordnet'  : lexical database backing WordNetLemmatizer.
#   - 'omw-1.4'  : Open Multilingual WordNet data; some NLTK releases raise a
#                  LookupError from WordNetLemmatizer.lemmatize() when it is
#                  not installed, so fetch it alongside 'wordnet'.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# =========================
# 1. Load Dataset
# =========================
# Location of the symptom/disorder CSV (a mounted Google Drive path).
file_path = "/content/drive/MyDrive/textual_data/TEXTUAL_DATA/processed_disorder_symptoms.csv"
df = pd.read_csv(file_path)

# Lowercase every header so the later lookups of "symptom", "frequency" and
# "disorder" do not depend on how the CSV capitalises its column names.
df.columns = [column_name.lower() for column_name in df.columns]

# =========================
# 2. Text Cleaning Function
# =========================
# Stop words are kept in a set so the per-token membership test is O(1).
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    """Normalize a piece of free text into space-separated lemmatized tokens.

    Processing steps:
      1. Lowercase the text.
      2. Replace every character that is not ``a-z`` or whitespace with a
         space.
      3. Split on whitespace, drop tokens found in ``stop_words`` and
         lemmatize the remaining tokens with ``lemmatizer``.

    Args:
        text: The raw text to clean. Non-string values (for example ``None``
            or a float ``NaN``) are treated as empty text.

    Returns:
        The cleaned text as a single string with tokens joined by one space;
        an empty string when nothing survives cleaning.
    """
    if not isinstance(text, str):
        return ""

    # Substitute a space (not an empty string) for digits/punctuation so that
    # words joined by "/", "-" etc. stay separate tokens instead of being
    # fused together ("aplasia/hypoplasia" must not become one word).
    text = re.sub(r'[^a-z\s]', ' ', text.lower())

    # str.split() with no argument already collapses runs of whitespace and
    # ignores leading/trailing whitespace, so no extra normalisation is needed.
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

# Clean the symptom text. Missing values are skipped (na_action="ignore") and
# therefore stay NaN; converting the whole column with astype(str) first would
# turn them into the literal text "nan", which the dropna in step 4 could no
# longer detect. Non-missing values are still passed through str() before
# cleaning.
df["symptom"] = df["symptom"].map(
    lambda value: clean_text(str(value)), na_action="ignore"
)

# =========================
# 3. Frequency Mapping
# =========================
# Ordinal encoding of the textual frequency bands (higher = more frequent).
# Any frequency label that is not a key of this dict is mapped to NaN and the
# row is then removed by the dropna in step 4.
frequency_mapping = {
    "Very frequent (99-80%)": 4,
    "Frequent (79-30%)": 3,
    "Occasional (29-5%)": 2,
    "Very rare (<4-1%)": 1
}
df["frequency"] = df["frequency"].map(frequency_mapping)

# =========================
# 4. Drop Missing Values
# =========================
# Keep only rows that have a symptom, a disorder and a recognised frequency.
df = df.dropna(subset=["symptom", "disorder", "frequency"])

# =========================
# 5. Encode Target (Disorders)
# =========================
# Turn each disorder name into an integer class id; the fitted encoder is kept
# in `label_encoder`.
label_encoder = LabelEncoder()
encoded_disorders = label_encoder.fit_transform(df["disorder"])
df["label"] = encoded_disorders

# =========================
# ✅ Final Preprocessed Data
# =========================
# Columns available after this script:
#   Features
#     - df["symptom"]    → cleaned symptom text
#     - df["frequency"]  → numeric frequency
#   Target
#     - df["label"]      → encoded disorder labels
#     - df["disorder"]   → original disorder names

# Short summary of what is left after preprocessing.
sample_count = len(df)
disorder_count = df['disorder'].nunique()

print("✅ Preprocessing Complete")
print(f"Samples: {sample_count}, Unique Disorders: {disorder_count}")
print(df.head())
