In [12]:
!pip install datasets pandas scikit-learn gradio transformers




In [13]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("dmacres/mimiciii-hospitalcourse-meta")

In [3]:
#code below

In [11]:
# Import necessary libraries
from datasets import load_dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, f1_score
import re
import gradio as gr

# Function to extract doctor types from text
def extract_doctor_types(text):
    # Define a list of doctor types to search for
    doctor_types = [
        "neurosurgeon", "cardiologist", "hematologist", "oncologist",
        "radiologist", "endocrinologist", "gastroenterologist", "urologist",
        "orthopedic", "dermatologist", "psychiatrist", "pulmonologist",
        "neurologist", "ophthalmologist", "ENT", "rheumatologist"
    ]
    found_doctors = [doc for doc in doctor_types if re.search(doc, text, re.IGNORECASE)]
    return found_doctors

# Load the dataset
dataset = load_dataset("dmacres/mimiciii-hospitalcourse-meta")
# Convert the dataset to a pandas DataFrame
df = pd.DataFrame(dataset['train'])

# Extract doctor types from the target text
df['doctor_types'] = df['target_text'].apply(extract_doctor_types)

# Filter out rows without any doctor types identified
df = df[df['doctor_types'].apply(len) > 0]

# Define features (X) and labels (y)
X = df['extractive_notes_summ']
y = df['doctor_types']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Binarize the target labels for multi-label classification
mlb = MultiLabelBinarizer()
y_train_bin = mlb.fit_transform(y_train)
y_test_bin = mlb.transform(y_test)

# Train a OneVsRest Logistic Regression model
model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train_vec, y_train_bin)

# Make predictions on the test set
y_pred_bin = model.predict(X_test_vec)

# Evaluate the model
accuracy = accuracy_score(y_test_bin, y_pred_bin)
f1 = f1_score(y_test_bin, y_pred_bin, average='micro')
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")

# Function for Gradio to make predictions on user input
def predict_doctor_types(text):
    text_vec = vectorizer.transform([text])
    pred_bin = model.predict(text_vec)
    pred_labels = mlb.inverse_transform(pred_bin)
    return pred_labels[0] if pred_labels else []

# Create a Gradio interface
iface = gr.Interface(
    fn=predict_doctor_types,
    inputs=gr.Textbox(lines=5, placeholder="Enter medical note here..."),
    outputs=gr.Textbox(label="Predicted Doctor Types"),
    title="Doctor Type Prediction from Medical Notes",
    description="Enter medical notes to predict the types of doctors involved."
)

# Launch the Gradio interface
iface.launch()


Accuracy: 87.90%
F1 Score: 93.44%
Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://2a190e4db1d74852c8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [6]:
from collections import Counter

# Check the distribution of doctor types in the dataset
doctor_type_counts = Counter([doc for sublist in df['doctor_types'] for doc in sublist])
print(doctor_type_counts)


Counter({'ENT': 23539, 'cardiologist': 890, 'orthopedic': 791, 'oncologist': 392, 'urologist': 254, 'neurologist': 185, 'psychiatrist': 180, 'pulmonologist': 151, 'gastroenterologist': 104, 'radiologist': 85, 'hematologist': 75, 'endocrinologist': 74, 'neurosurgeon': 37, 'ophthalmologist': 28, 'rheumatologist': 24, 'dermatologist': 12})


In [16]:
# Import necessary libraries
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
import re
import string
import gradio as gr
from collections import Counter
from imblearn.over_sampling import SMOTE
from scipy.sparse import vstack
import numpy as np

# Function to extract doctor types from text
def extract_doctor_types(text):
    doctor_types = [
        "neurosurgeon", "cardiologist", "hematologist", "oncologist",
        "radiologist", "endocrinologist", "gastroenterologist", "urologist",
        "orthopedic", "dermatologist", "psychiatrist", "pulmonologist",
        "neurologist", "ophthalmologist", "ENT", "rheumatologist"
    ]
    found_doctors = [doc for doc in doctor_types if re.search(doc, text, re.IGNORECASE)]
    return found_doctors

# Function to preprocess text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r'\d+', '', text)
    return text

# Load the dataset
dataset = load_dataset("dmacres/mimiciii-hospitalcourse-meta")
df = pd.DataFrame(dataset['train'])

# Extract doctor types from the target text
df['doctor_types'] = df['target_text'].apply(extract_doctor_types)

# Filter out rows without any doctor types identified
df = df[df['doctor_types'].apply(len) > 0]

# Preprocess the text data
df['extractive_notes_summ'] = df['extractive_notes_summ'].apply(preprocess_text)

# Define features (X) and labels (y)
X = df['extractive_notes_summ']
y = df['doctor_types']

# Check for class imbalance
doctor_type_counts = Counter([doc for sublist in df['doctor_types'] for doc in sublist])
print(doctor_type_counts)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Binarize the target labels for multi-label classification
mlb = MultiLabelBinarizer()
y_train_bin = mlb.fit_transform(y_train)
y_test_bin = mlb.transform(y_test)

# Apply SMOTE to each label individually and stack results
smote = SMOTE()

# Initialize empty arrays for resampled data
X_resampled = None
y_resampled = None

for i in range(y_train_bin.shape[1]):
    X_res, y_res = smote.fit_resample(X_train_vec, y_train_bin[:, i].reshape(-1, 1))
    if X_resampled is None:
        X_resampled = X_res
        y_resampled = y_res
    else:
        X_resampled = vstack((X_resampled, X_res))
        y_resampled = np.hstack((y_resampled, y_res))

# Train a OneVsRest Logistic Regression model
model = OneVsRestClassifier(LogisticRegression(max_iter=1000, class_weight='balanced'))
model.fit(X_resampled, y_resampled)

# Function for Gradio to make predictions on user input
def predict_doctor_types(text):
    text = preprocess_text(text)
    text_vec = vectorizer.transform([text])
    pred_bin = model.predict(text_vec)
    pred_labels = mlb.inverse_transform(pred_bin)
    return pred_labels[0] if pred_labels else []

# Create a Gradio interface
iface = gr.Interface(
    fn=predict_doctor_types,
    inputs=gr.Textbox(lines=5, placeholder="Enter medical note here..."),
    outputs=gr.Textbox(label="Predicted Doctor Types"),
    title="Doctor Type Prediction from Medical Notes",
    description="Enter medical notes to predict the types of doctors involved."
)

# Launch the Gradio interface
iface.launch()


Counter({'ENT': 23539, 'cardiologist': 890, 'orthopedic': 791, 'oncologist': 392, 'urologist': 254, 'neurologist': 185, 'psychiatrist': 180, 'pulmonologist': 151, 'gastroenterologist': 104, 'radiologist': 85, 'hematologist': 75, 'endocrinologist': 74, 'neurosurgeon': 37, 'ophthalmologist': 28, 'rheumatologist': 24, 'dermatologist': 12})
Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://9b4405c7d132d5effa.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [6]:
pip install smote

Collecting smote
  Downloading smote-0.1-py2.py3-none-any.whl (3.3 kB)
Installing collected packages: smote
Successfully installed smote-0.1


In [24]:
# Assuming your model setup and data loading are correct up to this point
# Predict probabilities (if using OneVsRestClassifier with LogisticRegression)
y_pred_prob = model.predict_proba(X_test_vec)

# Check the shape and content of y_pred_prob
print(f"Shape of y_pred_prob: {y_pred_prob.shape}")
print(f"Example of y_pred_prob: {y_pred_prob[:5]}")

# Assuming y_pred_prob has shape (n_samples, n_classes)
# Print the unique values in y_pred_prob to check the range of probabilities
print(f"Unique values in y_pred_prob: {np.unique(y_pred_prob)}")

# Troubleshoot why y_pred_prob has only 2 columns instead of 16
# Ensure that your model setup and data preprocessing correctly handle all 16 classes

# If y_pred_prob has shape (4712, 2), debug further:
# - Check model configuration (OneVsRestClassifier with LogisticRegression)
# - Verify data preprocessing (encoding of target labels)

# Once y_pred_prob is correctly predicting probabilities for 16 classes,
# proceed with thresholding and evaluation as previously discussed.


Shape of y_pred_prob: (4712, 2)
Example of y_pred_prob: [[0.89412235 0.10587765]
 [0.93387798 0.06612202]
 [0.50207605 0.49792395]
 [0.61418695 0.38581305]
 [0.92929515 0.07070485]]
Unique values in y_pred_prob: [2.34455340e-04 3.53626396e-04 5.70811099e-04 ... 9.99429189e-01
 9.99646374e-01 9.99765545e-01]


\

In [25]:
#code 3

In [26]:
# Import necessary libraries
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
import re
import string
import gradio as gr
from collections import Counter
from imblearn.over_sampling import SMOTE
from scipy.sparse import vstack

# Function to extract doctor types from text
def extract_doctor_types(text):
    doctor_types = [
        "neurosurgeon", "cardiologist", "hematologist", "oncologist",
        "radiologist", "endocrinologist", "gastroenterologist", "urologist",
        "orthopedic", "dermatologist", "psychiatrist", "pulmonologist",
        "neurologist", "ophthalmologist", "ENT", "rheumatologist"
    ]
    found_doctors = [doc for doc in doctor_types if re.search(doc, text, re.IGNORECASE)]
    return found_doctors

# Function to preprocess text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r'\d+', '', text)
    return text

# Load the dataset
dataset = load_dataset("dmacres/mimiciii-hospitalcourse-meta")
df = pd.DataFrame(dataset['train'])

# Extract doctor types from the target text
df['doctor_types'] = df['target_text'].apply(extract_doctor_types)

# Filter out rows without any doctor types identified
df = df[df['doctor_types'].apply(len) > 0]

# Preprocess the text data
df['extractive_notes_summ'] = df['extractive_notes_summ'].apply(preprocess_text)

# Define features (X) and labels (y)
X = df['extractive_notes_summ']
y = df['doctor_types']

# Check for class imbalance
doctor_type_counts = Counter([doc for sublist in df['doctor_types'] for doc in sublist])
print(doctor_type_counts)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Binarize the target labels for multi-label classification
mlb = MultiLabelBinarizer()
y_train_bin = mlb.fit_transform(y_train)
y_test_bin = mlb.transform(y_test)

# Apply SMOTE to handle class imbalance (optional)
# SMOTE is typically applied to the entire multi-label set, not each label individually
# smote = SMOTE()
# X_train_res, y_train_res = smote.fit_resample(X_train_vec, y_train_bin)

# Train a OneVsRest Logistic Regression model
model = OneVsRestClassifier(LogisticRegression(max_iter=1000, class_weight='balanced'))
model.fit(X_train_vec, y_train_bin)

# Function for Gradio to make predictions on user input
def predict_doctor_types(text):
    text = preprocess_text(text)
    text_vec = vectorizer.transform([text])
    pred_bin = model.predict(text_vec)
    pred_labels = mlb.inverse_transform(pred_bin)
    return pred_labels[0] if pred_labels else []

# Create a Gradio interface
iface = gr.Interface(
    fn=predict_doctor_types,
    inputs=gr.Textbox(lines=5, placeholder="Enter medical note here..."),
    outputs=gr.Textbox(label="Predicted Doctor Types"),
    title="Doctor Type Prediction from Medical Notes",
    description="Enter medical notes to predict the types of doctors involved."
)

# Launch the Gradio interface
iface.launch()


Counter({'ENT': 23539, 'cardiologist': 890, 'orthopedic': 791, 'oncologist': 392, 'urologist': 254, 'neurologist': 185, 'psychiatrist': 180, 'pulmonologist': 151, 'gastroenterologist': 104, 'radiologist': 85, 'hematologist': 75, 'endocrinologist': 74, 'neurosurgeon': 37, 'ophthalmologist': 28, 'rheumatologist': 24, 'dermatologist': 12})
Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://adceb9bd474a1a6f8f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


