In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import subprocess

import string
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier

In [None]:
# Load mtsamples.csv; the path is relative to the notebook's working directory.
df = pd.read_csv("mtsamples.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# NOTE(review): identical to the previous cell's preview; one of the two can be dropped.
df.head()

In [None]:
# Keep only the rows that actually carry a transcription.
df = df.dropna(subset=["transcription"])

In [None]:
# Column dtypes and non-null counts after rows without a transcription were dropped.
df.info()

In [None]:
# One group of rows per medical specialty.
categories = df.groupby("medical_specialty")

In [None]:
# List every specialty with its number of transcriptions, numbered from 1.
# enumerate() keeps the counter in step with the loop without manual bookkeeping.
for i, (category_name, category) in enumerate(categories, start=1):
    print(f"Category {i}: {category_name}: {len(category)}")

In [None]:
# Keep only the specialties that have more than 10 transcriptions.
df_filtered = categories.filter(lambda group: len(group) > 10)

In [None]:
# Order the bars from the most to the least frequent specialty.
ordered_specialties = df_filtered['medical_specialty'].value_counts().index

fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(
    data=df_filtered,
    y="medical_specialty",
    order=ordered_specialties,
    ax=ax,
)
ax.set_title("Medical Specialty Counts (Sorted)")
ax.set_xlabel("Count")
ax.set_ylabel("Medical Specialty")
plt.show()

In [None]:
# Number of missing values remaining in each column.
df.isna().sum()

In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Only lemmas and lexical flags are read from the parsed documents, so the
# named-entity recogniser is wasted work on every transcription; disable it.
# The parser is kept because it can influence English POS tags and hence lemmas.
nlp = spacy.load('en_core_web_sm', disable=['ner'])

def clean_text(text):
    """Normalise a transcription into a string of space-separated lemmas.

    Digit runs are removed and whitespace is collapsed before the text is run
    through the module-level spaCy pipeline ``nlp``. A token is dropped when
    its lower-cased text is in ``STOP_WORDS`` or when it is not purely
    alphabetic; the lemmas of the remaining tokens are joined with spaces.
    """
    without_digits = re.sub(r'\d+', '', text)
    normalised = re.sub(r'\s+', ' ', without_digits).strip()

    lemmas = []
    for token in nlp(normalised):
        if token.text.lower() in STOP_WORDS:
            continue
        if not token.is_alpha:
            continue
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

# Store the cleaned text in a new column; clean_text runs once per transcription.
df['cleaned_transcription'] = df['transcription'].apply(clean_text)


In [None]:
# Preview again, now including the cleaned_transcription column.
df.head()

In [None]:
import matplotlib.pyplot as plt

# Bar chart of how many transcriptions each specialty has.
ax = df['medical_specialty'].value_counts().plot.bar()
ax.set_title('Distribution of Medical Specialties')
ax.set_xlabel('Medical Specialty')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Text to vectorise: the cleaned transcriptions.
text_data = df['cleaned_transcription']

# TF-IDF representation capped at 1000 features.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf = tfidf_vectorizer.fit_transform(text_data)

# The TF-IDF matrix is converted to a dense array before being passed to t-SNE.
tfidf_dense = tfidf.toarray()
# Integer code per specialty, taken from the pandas category codes.
labels = df['medical_specialty'].astype('category').cat.codes

# 2-D t-SNE embedding with a fixed random_state.
tsne = TSNE(n_components=2, random_state=42, init='random',perplexity=30)
tsne_result = tsne.fit_transform(tfidf_dense)

In [None]:
# Put the 2-D embedding in a frame next to each row's specialty.
tsne_df = pd.DataFrame(tsne_result, columns=['Dimension 1', 'Dimension 2'])
# df no longer has a 0..n-1 index once rows without a transcription were
# dropped, while tsne_df does. Assigning the Series directly would align on
# index labels and attach wrong or missing specialties to the points, so
# assign by position instead.
tsne_df['Medical Specialty'] = df['medical_specialty'].values

plt.figure(figsize=(12, 10))
sns.scatterplot(
    data=tsne_df,
    x='Dimension 1',
    y='Dimension 2',
    hue='Medical Specialty',
    palette='tab20',
    alpha=0.65,
    s=40
)

plt.title('t-SNE Visualization of Medical Transcriptions', fontsize=14)
plt.xlabel('t-SNE Dimension 1', fontsize=12)
plt.ylabel('t-SNE Dimension 2', fontsize=12)
# Place the legend outside the axes so it does not cover the points.
plt.legend(title='Medical Specialties', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
plt.tight_layout()
plt.show()


In [None]:
# Restrict the embedding to the five most frequent specialties.
top_categories = df['medical_specialty'].value_counts().head(5).index
filtered_data = df[df['medical_specialty'].isin(top_categories)]

# Re-fit the shared TF-IDF vectoriser on this subset and densify for t-SNE.
filtered_tfidf = tfidf_vectorizer.fit_transform(filtered_data['cleaned_transcription'])
filtered_tfidf_dense = filtered_tfidf.toarray()

tsne = TSNE(n_components=2, random_state=42, init='random', perplexity=30)
filtered_tsne_result = tsne.fit_transform(filtered_tfidf_dense)

# Positional assignment (.values) keeps each label next to its own point.
filtered_tsne_df = pd.DataFrame(filtered_tsne_result, columns=['Dimension 1', 'Dimension 2'])
filtered_tsne_df['Medical Specialty'] = filtered_data['medical_specialty'].values

fig, ax = plt.subplots(figsize=(12, 10))
sns.scatterplot(
    data=filtered_tsne_df,
    x='Dimension 1',
    y='Dimension 2',
    hue='Medical Specialty',
    palette='Set2',
    alpha=0.7,
    s=50,
    ax=ax,
)

ax.set_title('t-SNE Visualization of Top 5 Medical Specialties', fontsize=14)
ax.set_xlabel('t-SNE Dimension 1', fontsize=12)
ax.set_ylabel('t-SNE Dimension 2', fontsize=12)
ax.legend(title='Medical Specialties', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
fig.tight_layout()
plt.show()


In [None]:
# Restrict the classification task to the five most frequent specialties.
top_categories = df['medical_specialty'].value_counts().head(5).index
df = df[df['medical_specialty'].isin(top_categories)]

# Encode the specialties once so the integer codes and the display names
# always come from the same categorical.
specialty = df['medical_specialty'].astype('category')
labels = specialty.cat.codes
class_names = specialty.cat.categories
class_codes = np.arange(len(class_names))

# Split the raw text first. Fitting TF-IDF and PCA on all rows would let the
# test set influence the features and inflate the reported scores.
# Stratifying keeps each specialty's share the same in both splits.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_transcription'],
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Vocabulary and IDF weights are learned from the training texts only.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
tfidf_matrix = tfidf_vectorizer.fit_transform(text_train)

# PCA is likewise fitted on the training split and only applied to the test split.
pca = PCA(n_components=100, random_state=42)
X_train = pca.fit_transform(tfidf_matrix.toarray())
X_test = pca.transform(tfidf_vectorizer.transform(text_test).toarray())

lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train, y_train)

y_pred = lr_model.predict(X_test)

# Passing labels= pins the matrix rows/columns to class_names even if a class
# happens to be absent from the predictions.
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_codes)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=class_names)
disp.plot(cmap='Blues', xticks_rotation='vertical')
plt.title("Confusion Matrix - Logistic Regression with PCA")
plt.show()

print("Classification Report:")
print(classification_report(y_test, y_pred, labels=class_codes, target_names=class_names))


In [None]:
# Install transformers if not already installed
!pip install -q transformers

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import pandas as pd
import re
from tqdm.notebook import tqdm

# Load models and tokenizers
# LED is tried first by summarize_text; BART is its fallback.
led_model_name = "allenai/led-base-16384"
bart_model_name = "facebook/bart-large-cnn"

# NOTE(review): both models are moved to "cuda" unconditionally, so this cell
# requires a CUDA device to run.
led_tokenizer = AutoTokenizer.from_pretrained(led_model_name)
led_model = AutoModelForSeq2SeqLM.from_pretrained(led_model_name).to("cuda")

bart_tokenizer = AutoTokenizer.from_pretrained(bart_model_name)
bart_model = AutoModelForSeq2SeqLM.from_pretrained(bart_model_name).to("cuda")

def clean_text(text):
    """Replace ASCII control characters with spaces and collapse whitespace.

    NOTE(review): this redefines the ``clean_text`` declared earlier in the
    notebook (the spaCy-based version); from this cell on the name refers to
    this lighter function.
    """
    without_controls = re.sub(r"[\x00-\x1F\x7F]", " ", text)
    return re.sub(r"\s+", " ", without_controls).strip()

# Start from a fresh copy of the CSV, since df was filtered in earlier cells,
# and keep only rows that have a transcription.
df = pd.read_csv("mtsamples.csv").dropna(subset=["transcription"])
# Work on an independent copy of the first 200 rows.
df_sample = df.iloc[:200].copy()

# Summarization with fallback

def _generate_summary(model, tokenizer, text, max_input_length, max_summary_tokens,
                      use_global_attention=False):
    """Tokenize ``text``, run beam-search generation and decode the first sequence."""
    # Send the inputs to wherever the model's weights live instead of assuming
    # a device name, so the function follows the model if it is moved.
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=max_input_length
    ).to(model.device)

    extra_kwargs = {}
    if use_global_attention:
        # Global attention on the first token only; all other positions are 0.
        global_attention_mask = torch.zeros_like(inputs["input_ids"])
        global_attention_mask[:, 0] = 1
        extra_kwargs["global_attention_mask"] = global_attention_mask

    summary_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_summary_tokens,
        no_repeat_ngram_size=3,
        num_beams=4,
        early_stopping=True,
        **extra_kwargs,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


def summarize_text(text, max_input_length=4096, led_max_tokens=512, bart_max_tokens=256):
    """Summarize ``text`` with the LED model, falling back to BART on failure.

    Args:
        text: raw text; it is passed through ``clean_text`` first.
        max_input_length: token limit at which the LED input is truncated.
        led_max_tokens: ``max_length`` passed to LED generation.
        bart_max_tokens: ``max_length`` passed to BART generation.

    Returns:
        The decoded summary string. An empty string is returned for text that
        is empty after cleaning. If both models raise, a
        ``"[Summary failed: ...]"`` placeholder carrying the BART error is
        returned instead of raising.
    """
    text = clean_text(text)
    if not text:
        # Nothing to summarize; avoid generating from special tokens alone.
        return ""

    try:
        return _generate_summary(
            led_model, led_tokenizer, text, max_input_length, led_max_tokens,
            use_global_attention=True,
        )
    except Exception as e:
        print(f"LED summarization failed: {e}")
        try:
            # The BART input is truncated to 1024 tokens.
            return _generate_summary(bart_model, bart_tokenizer, text, 1024, bart_max_tokens)
        except Exception as bart_error:
            return f"[Summary failed: {bart_error}]"

# Apply summarization with progress
# tqdm.pandas() must be called before progress_apply can be used on a Series.
tqdm.pandas()
df_sample["summary"] = df_sample["transcription"].progress_apply(summarize_text)

# Preview summarized results
df_sample[["medical_specialty", "summary"]].head()


In [None]:
!pip install -q rouge-score

In [None]:
from rouge_score import rouge_scorer
import numpy as np

# Shared ROUGE-1 / ROUGE-2 / ROUGE-L scorer with stemming enabled; used by evaluate_summaries.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def evaluate_summaries(df, original_col='transcription', summary_col='summary', n_samples=50):
    """Average ROUGE F-measures of summaries against their source text.

    A random sample of rows (fixed seed) is scored with the module-level
    ``scorer``; both texts are normalised with ``clean_text`` first.

    Args:
        df: DataFrame holding the source and summary columns.
        original_col: column used as the ROUGE reference text.
        summary_col: column used as the ROUGE candidate text.
        n_samples: number of rows to sample; capped at ``len(df)``.

    Returns:
        dict mapping 'rouge1', 'rouge2' and 'rougeL' to the mean F-measure
        over the rows that could be scored (NaN when none could).
    """
    # Asking for more rows than exist makes DataFrame.sample raise, so cap it.
    sampled_df = df.sample(n=min(n_samples, len(df)), random_state=42)
    scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    skipped = 0

    for _, row in sampled_df.iterrows():
        # Column lookups stay outside the try so a misspelled column name fails loudly.
        raw_reference = row[original_col]
        raw_candidate = row[summary_col]
        try:
            result = scorer.score(clean_text(raw_reference), clean_text(raw_candidate))
        except Exception as error:
            # Best effort: skip rows that cannot be scored (e.g. non-string
            # values), but report them instead of hiding them.
            skipped += 1
            print(f"Skipping row that could not be scored: {error}")
            continue
        for key in scores:
            scores[key].append(result[key].fmeasure)

    if skipped:
        print(f"{skipped} of {len(sampled_df)} sampled rows were skipped.")

    # Calculate average scores; an empty list would make np.mean emit a warning.
    avg_scores = {
        key: (np.mean(values) if values else np.nan)
        for key, values in scores.items()
    }
    return avg_scores

# Score 50 sampled summaries and report each averaged metric to four decimals.
summary_scores = evaluate_summaries(df_sample, n_samples=50)
print("ROUGE Evaluation Metrics (Average F1 Scores):")
for metric_name in summary_scores:
    print(f"{metric_name.upper()}: {summary_scores[metric_name]:.4f}")

In [None]:
from IPython.display import display, HTML

def compare_transcripts(df, num_samples=2, random_state=None):
    """Display randomly chosen transcriptions next to their summaries as HTML cards.

    Args:
        df: DataFrame with 'medical_specialty', 'description', 'transcription'
            and 'summary' columns.
        num_samples: number of rows to show; capped at ``len(df)``.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            same rows can be shown again; ``None`` keeps the choice random.
    """
    # Imported locally under its own name so the stdlib ``html`` module cannot
    # be confused with a variable holding markup.
    from html import escape

    cards = []
    chosen = df.sample(n=min(num_samples, len(df)), random_state=random_state)
    for _, row in chosen.iterrows():
        # The text comes straight from the dataset; escape it so characters
        # such as '<' and '&' are shown literally instead of breaking the markup.
        specialty = escape(str(row['medical_specialty']))
        description = escape(str(row['description']))
        transcription = escape(str(row['transcription']))
        summary = escape(str(row['summary']))
        cards.append(f"""
        <div style="border:1px solid #444; border-radius:10px; padding:15px; margin:10px 0; background-color:#111; color:white;">
            <h4 style="margin-bottom:5px;">Medical Specialty: <span style="color:#2b7a78;">{specialty}</span></h4>
            <p><strong>Description:</strong> {description}</p>
            <div style="display:flex; gap:20px;">
                <div style="flex:1;">
                    <h5 style="margin-bottom:5px;">🔹 Original Transcription</h5>
                    <div style="background:#222; color:white; padding:10px; border-radius:5px; max-height:300px; overflow:auto; border:1px solid #555;">
                        <pre style="white-space:pre-wrap; color:white;">{transcription}</pre>
                    </div>
                </div>
                <div style="flex:1;">
                    <h5 style="margin-bottom:5px;">🔸 Summarized Version</h5>
                    <div style="background:#222; color:white; padding:10px; border-radius:5px; max-height:300px; overflow:auto; border:1px solid #555;">
                        <pre style="white-space:pre-wrap; color:white;">{summary}</pre>
                    </div>
                </div>
            </div>
        </div>
        """)
    display(HTML("".join(cards)))

# Show two randomly sampled rows side by side (original vs. summary).
compare_transcripts(df_sample, num_samples=2)