### Step 1: Load the medical reports dataset for preprocessing


In [1]:
import pandas as pd

# Read the raw reports table from disk. The path is relative, so it is
# resolved against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='../data/medical_reports.csv')

# Preview the first five rows as a quick sanity check of the load.
df.head(5)


Unnamed: 0,report_text,diabetes,hypertension,pneumonia,asthma,arthritis
0,Experiencing joint pain and inflammation.,1,1,0,1,1
1,Reports numbness in limbs and blurred vision.,1,0,0,1,0
2,Patient reports shortness of breath and coughing.,0,1,0,0,1
3,Reports wheezing and difficulty breathing.,1,0,0,1,1
4,Complains of frequent urination and thirst.,0,1,0,1,0


### Step 2: Clean the report text by lowercasing it and replacing newlines with spaces


In [2]:
# Normalise the free-text column: lowercase everything, then turn each
# newline into a single space so every report is one line of text.
# The cleaned text overwrites the original 'report_text' column.
df['report_text'] = (
    df['report_text']
    .str.lower()
    .str.replace('\n', ' ', regex=True)
)

# Show the first cleaned report to confirm the transformation.
df['report_text'].iloc[0]


'experiencing joint pain and inflammation.'

### Step 3: Extract the multilabel targets for model training


In [3]:
# Select the target columns by name instead of by position: every column
# other than the free-text 'report_text' input is used as a label column.
# A positional slice (df.columns[1:]) silently picks the wrong columns if
# the CSV column order changes or an extra leading column appears, whereas
# Index.drop raises a KeyError if 'report_text' is missing.
labels = df.columns.drop('report_text')

# Multilabel target matrix: one row per report, one column per label.
y = df[labels]

# Preview the first rows of the targets.
y.head()


Unnamed: 0,diabetes,hypertension,pneumonia,asthma,arthritis
0,1,1,0,1,1
1,1,0,0,1,0
2,0,1,0,0,1
3,1,0,0,1,1
4,0,1,0,1,0


### Step 4: Convert text into numeric features using TF-IDF


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF vectoriser; max_features=1000 is an upper bound on the
# vocabulary size, so a smaller corpus vocabulary yields fewer columns.
vectorizer = TfidfVectorizer(max_features=1000)

# Learn the vocabulary/IDF weights from the cleaned reports and transform
# them into the document-term matrix in a single pass.
X = vectorizer.fit_transform(raw_documents=df['report_text'])

# (number of documents, number of TF-IDF features)
X.shape


(200, 42)

### Step 5: Inspect the first 20 TF-IDF feature names


In [5]:
# Fetch the array of feature names from the fitted vectoriser, then
# display only its first 20 entries to keep the output short.
feature_names = vectorizer.get_feature_names_out()
feature_names[:20]


array(['and', 'blood', 'blurred', 'breath', 'breathing', 'chest',
       'complains', 'coughing', 'difficulty', 'experiencing', 'fatigue',
       'fever', 'frequent', 'has', 'headaches', 'high', 'in', 'include',
       'inflammation', 'joint'], dtype=object)

### Step 6: Confirm the shapes of X and y before modeling
