In [None]:
# Mount Google Drive at /content/drive (Colab-only helper module); the dataset
# paths used by later cells live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

import seaborn as sns
import os
import base64
from io import BytesIO

# Diagnostic category name -> integer class id.
# Ids are assigned by position in the tuple below (0..11), so the order matters.
label_mapping = {
    category: class_id
    for class_id, category in enumerate((
        "Congenital & Genetic",
        "Trauma & Physical Injuries",
        "Vascular & Circulatory",
        "Infections",
        "Neoplasm - Benign & Sarcoma",
        "Neoplasm - Carcinoma",
        "Neoplasm - Other Malignant",
        "Inflammatory & Autoimmune",
        "Metabolic & Endocrine",
        "Cysts & Degenerative Conditions",
        "Obstruction & Structural Abnormalities",
        "Miscellaneous Conditions",
    ))
}

# Reverse lookup: integer class id -> diagnostic category name.
label_names = dict(zip(label_mapping.values(), label_mapping.keys()))

In [None]:
from sklearn.preprocessing import label_binarize

class MedicalTextClassifier:
    """TF-IDF + logistic-regression text classifier for medical case records.

    The records are read from a JSON file, the free-text fields of each case
    are concatenated into one document, vectorised with TF-IDF and fed to a
    ``LogisticRegression`` model. ``train`` fits the model on an 80/20 split
    and ``evaluate`` prints metrics and renders a confusion matrix and
    one-vs-rest ROC curves.
    """

    # Normalised (stripped, lower-cased) label that is dropped at load time.
    _EXCLUDED_LABEL = "miscellaneous conditions"

    def __init__(self, json_file):
        """Store the dataset path and build the (unfitted) vectoriser and model.

        Args:
            json_file: Path to the JSON file holding the case records.
        """
        self.json_file = json_file
        self.vectorizer = TfidfVectorizer(
            max_features=1000,
            ngram_range=(1, 1),
            stop_words='english',
            min_df=5,
            max_df=0.80
        )
        # LogisticRegression is the active model. A RandomForestClassifier
        # (n_estimators=100, max_depth=15, min_samples_split=5, random_state=5)
        # was tried previously; its commented-out remains left a stray ")"
        # here that made the whole class a SyntaxError.
        self.model = LogisticRegression(C=1, max_iter=1500, random_state=42)

    def load_and_preprocess_data(self):
        """Read the JSON records and turn them into documents and integer labels.

        Records whose label is "Miscellaneous Conditions" (compared after
        stripping whitespace and lower-casing) are skipped. For every other
        record the Title, History, Differential Diagnosis, Location and
        Findings fields are joined with single spaces; missing fields
        contribute an empty string.

        Returns:
            tuple: ``(texts, labels)`` where ``texts`` is a list of str and
            ``labels`` is a NumPy array of class ids taken from
            ``label_mapping``.

        Raises:
            KeyError: If a record lacks 'Class/Label' or 'Case', or carries a
                label that is not present in ``label_mapping``.
        """
        with open(self.json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Look labels up with the same normalisation that is used for the
        # exclusion test, so " Infections " or "infections" no longer raise
        # KeyError after having passed the filter.
        normalised_lookup = {name.strip().lower(): idx for name, idx in label_mapping.items()}

        texts = []
        labels = []

        for record in data:
            label_key = record['Class/Label'].strip().lower()
            if label_key == self._EXCLUDED_LABEL:
                continue
            case = record['Case']
            text = f"{case.get('Title','')} {case.get('History','')} {case.get('Differential Diagnosis','')} {record.get('Location','')} {case.get('Findings','')}"
            texts.append(text)
            labels.append(normalised_lookup[label_key])

        return texts, np.array(labels)

    def prepare_data(self):
        """Split the corpus 80/20 and vectorise it.

        The vectoriser is fitted on the training documents only and then
        applied to the test documents, so no test vocabulary leaks into
        training.

        Returns:
            tuple: ``(X_train_tfidf, X_test_tfidf, y_train, y_test)``.
        """
        texts, labels = self.load_and_preprocess_data()

        X_train, X_test, y_train, y_test = train_test_split(
            texts, labels, test_size=0.2, random_state=42
        )

        X_train_tfidf = self.vectorizer.fit_transform(X_train)
        X_test_tfidf = self.vectorizer.transform(X_test)

        return X_train_tfidf, X_test_tfidf, y_train, y_test

    def train(self):
        """Fit the model and predict on both splits.

        Returns:
            tuple: ``(X_train_tfidf, X_test_tfidf, y_train, y_test,
            y_train_pred, y_test_pred, y_test_pred_proba)``.
        """
        X_train_tfidf, X_test_tfidf, y_train, y_test = self.prepare_data()

        self.model.fit(X_train_tfidf, y_train)

        y_train_pred = self.model.predict(X_train_tfidf)
        y_test_pred = self.model.predict(X_test_tfidf)
        y_test_pred_proba = self.model.predict_proba(X_test_tfidf)

        return X_train_tfidf, X_test_tfidf, y_train, y_test, y_train_pred, y_test_pred, y_test_pred_proba

    def evaluate(self, y_train, y_test, y_train_pred, y_test_pred, y_test_pred_proba):
        """Print metrics and render the confusion matrix and ROC curves.

        All per-class output is restricted to the classes the model was
        actually trained on (``self.model.classes_``), because one label is
        filtered out at load time and others may be absent from a dataset.
        Column ``j`` of ``y_test_pred_proba`` is matched to the ``j``-th of
        those classes rather than to label id ``j``.

        Figures are written to ``output/confusion_matrix.png`` and
        ``output/roc_curves.png`` and also echoed as base64 ``<img>`` tags.

        Args:
            y_train: True labels of the training split.
            y_test: True labels of the test split.
            y_train_pred: Predicted labels for the training split.
            y_test_pred: Predicted labels for the test split.
            y_test_pred_proba: Class probabilities for the test split, one
                column per trained class.
        """
        y_test = np.asarray(y_test)

        # Fall back to the labels seen in training if the model is unfitted.
        trained_classes = getattr(self.model, 'classes_', None)
        if trained_classes is None:
            trained_classes = np.unique(y_train)
        classes = [int(c) for c in trained_classes]
        class_names = [label_names[c] for c in classes]

        self._print_metric_block("\nTraining Metrics:", y_train, y_train_pred)
        self._print_metric_block("\nTest Metrics:", y_test, y_test_pred)

        print("\nDetailed Classification Report:")
        print(classification_report(y_test, y_test_pred, labels=classes, target_names=class_names))

        # One-vs-rest ROC per trained class, computed once and reused for both
        # the macro average and the plot. A class that has no positive (or no
        # negative) test sample has no defined ROC curve and is skipped, so it
        # cannot turn the macro average into NaN.
        roc_by_class = {}
        for column, cls in enumerate(classes):
            is_cls = (y_test == cls)
            if is_cls.all() or not is_cls.any():
                continue
            fpr, tpr, _ = roc_curve(is_cls, y_test_pred_proba[:, column])
            roc_by_class[cls] = (fpr, tpr, auc(fpr, tpr))

        if roc_by_class:
            macro_auc = np.mean([area for _, _, area in roc_by_class.values()])
        else:
            macro_auc = float('nan')
        print(f"\nMacro-Averaged AUC-ROC: {macro_auc:.4f}")

        # Pin the label order so matrix rows/columns line up with the tick names.
        cm = confusion_matrix(y_test, y_test_pred, labels=classes)

        os.makedirs('output', exist_ok=True)

        plt.figure(figsize=(12, 10))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names,
                    yticklabels=class_names)
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.xticks(rotation=45, ha='right')
        plt.yticks(rotation=0)
        plt.tight_layout()

        confusion_matrix_path = os.path.join('output', 'confusion_matrix.png')
        plt.savefig(confusion_matrix_path, dpi=300, bbox_inches='tight')
        print("\nConfusion Matrix:")
        self._display_image(plt)
        plt.close()

        plt.figure(figsize=(10, 8))
        for cls, (fpr, tpr, area) in roc_by_class.items():
            plt.plot(fpr, tpr, label=f'{label_names[cls]} (AUC = {area:.2f})')

        plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curves for All Classes')
        plt.legend(loc="lower right", bbox_to_anchor=(1.15, 0))
        plt.tight_layout()

        roc_curves_path = os.path.join('output', 'roc_curves.png')
        plt.savefig(roc_curves_path, dpi=300, bbox_inches='tight')
        print("\nROC Curves:")
        self._display_image(plt)
        plt.close()

    @staticmethod
    def _print_metric_block(heading, y_true, y_pred):
        """Print accuracy and weighted precision/recall/F1 under ``heading``."""
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, average='weighted')
        recall = recall_score(y_true, y_pred, average='weighted')
        f1 = f1_score(y_true, y_pred, average='weighted')

        print(heading)
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1:.4f}")

    def _display_image(self, plt):
        """Print the current figure as a base64-encoded PNG ``<img>`` tag.

        Args:
            plt: Object exposing ``savefig(buffer, format=..., dpi=...,
                bbox_inches=...)``; callers in this class pass
                ``matplotlib.pyplot``.
        """
        # The context manager releases the in-memory PNG once it is encoded.
        with BytesIO() as buf:
            plt.savefig(buf, format='png', dpi=300, bbox_inches='tight')
            buf.seek(0)
            img_str = base64.b64encode(buf.read()).decode('utf-8')

        print(f'<img src="data:image/png;base64,{img_str}" width="800">')

In [None]:
# Dataset produced by the text-augmentation stage.
json_file = "/content/drive/MyDrive/GP2/3_After_Text_Augmentation/Text_Augmented.json"

def main(data_path=None):
    """Train and evaluate a MedicalTextClassifier on one dataset.

    Args:
        data_path: Path to the JSON dataset. Defaults to the module-level
            ``json_file`` (read at call time), so ``main()`` behaves as before
            while other datasets can be evaluated without redefining it.
    """
    classifier = MedicalTextClassifier(json_file if data_path is None else data_path)
    # train() also returns the two TF-IDF matrices first; evaluate() does not use them.
    _, _, y_train, y_test, y_train_pred, y_test_pred, y_test_pred_proba = classifier.train()
    classifier.evaluate(y_train, y_test, y_train_pred, y_test_pred, y_test_pred_proba)

if __name__ == "__main__":
    main()


Training Metrics:
Accuracy: 0.9937
Precision: 0.9937
Recall: 0.9937
F1-Score: 0.9937

Test Metrics:
Accuracy: 0.9522
Precision: 0.9502
Recall: 0.9522
F1-Score: 0.9498

Detailed Classification Report:
                                        precision    recall  f1-score   support

                  Congenital & Genetic       0.79      0.63      0.70        35
            Trauma & Physical Injuries       0.95      1.00      0.97        38
                Vascular & Circulatory       0.98      1.00      0.99        41
                            Infections       1.00      0.90      0.95        30
           Neoplasm - Benign & Sarcoma       0.94      1.00      0.97        46
                  Neoplasm - Carcinoma       0.97      1.00      0.99        39
            Neoplasm - Other Malignant       0.96      0.90      0.93        50
             Inflammatory & Autoimmune       0.91      1.00      0.95        40
                 Metabolic & Endocrine       1.00      1.00      1.00        3

In [None]:
# Dataset produced by the first image-augmentation stage.
json_file = "/content/drive/MyDrive/GP2/5_After_1-Image_Augmentation/Image_1_Augmented.json"

def main(data_path=None):
    """Train and evaluate a MedicalTextClassifier on one dataset.

    Args:
        data_path: Path to the JSON dataset. Defaults to the module-level
            ``json_file`` (read at call time), so ``main()`` behaves as before
            while other datasets can be evaluated without redefining it.
    """
    classifier = MedicalTextClassifier(json_file if data_path is None else data_path)
    # train() also returns the two TF-IDF matrices first; evaluate() does not use them.
    _, _, y_train, y_test, y_train_pred, y_test_pred, y_test_pred_proba = classifier.train()
    classifier.evaluate(y_train, y_test, y_train_pred, y_test_pred, y_test_pred_proba)

if __name__ == "__main__":
    main()


Training Metrics:
Accuracy: 0.9744
Precision: 0.9751
Recall: 0.9744
F1-Score: 0.9734

Test Metrics:
Accuracy: 0.9546
Precision: 0.9558
Recall: 0.9546
F1-Score: 0.9508

Detailed Classification Report:
                                        precision    recall  f1-score   support

                  Congenital & Genetic       0.98      0.90      0.94        70
            Trauma & Physical Injuries       0.93      1.00      0.96        88
                Vascular & Circulatory       0.96      1.00      0.98        82
                            Infections       0.97      1.00      0.98        60
           Neoplasm - Benign & Sarcoma       0.98      0.98      0.98       101
                  Neoplasm - Carcinoma       0.91      1.00      0.95        84
            Neoplasm - Other Malignant       0.90      0.92      0.91        85
             Inflammatory & Autoimmune       0.97      1.00      0.99       102
                 Metabolic & Endocrine       1.00      1.00      1.00        6

In [None]:
# Dataset produced by the second image-augmentation stage.
json_file = "/content/drive/MyDrive/GP2/6_After_2-Image_Augmentation/Image_2_Augmented.json"

def main(data_path=None):
    """Train and evaluate a MedicalTextClassifier on one dataset.

    Args:
        data_path: Path to the JSON dataset. Defaults to the module-level
            ``json_file`` (read at call time), so ``main()`` behaves as before
            while other datasets can be evaluated without redefining it.
    """
    classifier = MedicalTextClassifier(json_file if data_path is None else data_path)
    # train() also returns the two TF-IDF matrices first; evaluate() does not use them.
    _, _, y_train, y_test, y_train_pred, y_test_pred, y_test_pred_proba = classifier.train()
    classifier.evaluate(y_train, y_test, y_train_pred, y_test_pred, y_test_pred_proba)

if __name__ == "__main__":
    main()


Training Metrics:
Accuracy: 0.9822
Precision: 0.9825
Recall: 0.9822
F1-Score: 0.9815

Test Metrics:
Accuracy: 0.9592
Precision: 0.9603
Recall: 0.9592
F1-Score: 0.9550

Detailed Classification Report:
                                        precision    recall  f1-score   support

                  Congenital & Genetic       0.92      0.96      0.94        97
            Trauma & Physical Injuries       0.96      1.00      0.98       153
                Vascular & Circulatory       0.95      1.00      0.97       115
                            Infections       0.94      1.00      0.97        95
           Neoplasm - Benign & Sarcoma       0.96      1.00      0.98       140
                  Neoplasm - Carcinoma       0.96      1.00      0.98       111
            Neoplasm - Other Malignant       0.92      0.95      0.93       120
             Inflammatory & Autoimmune       0.99      1.00      0.99       140
                 Metabolic & Endocrine       1.00      1.00      1.00       13