# Knee X-ray Analysis Using ResNet for Osteoarthritis Severity Assessment

#  miRNA processing 

In [None]:
!pip install -q imbalanced-learn
!pip install -q openpyxl


In [None]:



# Step 1: Import Libraries (imbalanced-learn and openpyxl are installed by the pip cell above)
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Step 2: Load Dataset
# List the dataset directory so the workbook name used below can be checked against it.
print("Files in dataset directory:", os.listdir("/kaggle/input/mirna-dataset"))

# Update the file name based on the dataset
file_path = "/kaggle/input/mirna-dataset/mRNA.xlsx"  # Replace with the actual filename
data = pd.read_excel(file_path)

# Step 3: Preprocess Dataset
# Transpose so the sheet's original columns become rows and every value of the
# 'miRNA' column becomes a feature column; the old column labels end up in 'index'.
data_transposed = data.set_index('miRNA').T
data_transposed.reset_index(inplace=True)

# Add mock 'Progression_Status' (binary: 0 = non-progressor, 1 = progressor) for demonstration.
# The labels are random placeholders, so every metric computed from them is illustrative only.
# A seeded generator keeps them reproducible, in line with random_state=42 used elsewhere.
mock_label_rng = np.random.default_rng(42)
data_transposed['Progression_Status'] = mock_label_rng.choice([0, 1], size=len(data_transposed))

# Select the miRNA features used by the model
selected_features = ['hsa-miR-556-3p', 'hsa-miR-3157-5p', 'hsa-miR-200a-5p', 'hsa-miR-141-3p']
target = 'Progression_Status'

# Keep only the selected features plus the target, dropping rows with missing values
filtered_data = data_transposed[selected_features + [target]].dropna()

X = filtered_data[selected_features]
y = filtered_data[target]

# Step 4: Hold out a test set BEFORE any resampling or tuning.
# Splitting after SMOTE / after fitting would let the tuned model see the
# "test" rows (and synthetic neighbours of them), inflating every metric.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Step 4b: Handle Class Imbalance with SMOTE (training portion only)
# SMOTE needs more minority samples than neighbours, so shrink k for tiny
# training sets; this equals the default of 5 whenever the minority class
# has at least 6 samples.
minority_count = int(y_train_raw.value_counts().min())
smote = SMOTE(random_state=42, k_neighbors=max(1, min(5, minority_count - 1)))
X_resampled, y_resampled = smote.fit_resample(X_train_raw, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Step 5: Hyperparameter Tuning with GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': [None, 'balanced', 'balanced_subsample']
}

rf_model = RandomForestClassifier(random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): SMOTE still runs before the CV folds are formed, so the
# cross-validated score used for model selection is optimistic; wrapping SMOTE
# and the forest in an imblearn Pipeline would remove that as well.
grid_search = GridSearchCV(rf_model, param_grid, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Step 6: Evaluate the Best Model on the untouched, un-resampled hold-out rows
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print("Best Parameters:", grid_search.best_params_)
print(f"Improved Accuracy: {accuracy:.2f}")
print(f"Improved AUC: {auc:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

# Step 7: Visualize Feature Importance
# One horizontal bar per selected miRNA, sized by the importance the tuned forest reports.
feature_importance_path = "/kaggle/working/feature_importance.png"
feature_importances = best_model.feature_importances_

plt.barh(selected_features, feature_importances)
plt.title('Improved Random Forest Feature Importance')
plt.xlabel('Feature Importance')

# Write the figure to disk first, then display it
plt.savefig(feature_importance_path)
plt.show()
print(f"Feature importance plot saved at: {feature_importance_path}")

# Step 8: Save miRNA Model Predictions
csv_path = "/kaggle/working/miRNA_predictions.csv"

# Score the original (un-resampled) feature rows; column 1 is the probability of class 1
miRNA_probs = best_model.predict_proba(X)[:, 1]
miRNA_pred_df = pd.Series(miRNA_probs, name='miRNA_pred').to_frame()

# Single-column CSV, one row per sample, without the index
miRNA_pred_df.to_csv(csv_path, index=False)

print(f"miRNA predictions saved successfully at {csv_path}!")


# X-Ray detection model

**By: Bernard Adhitya Kurniawan**

This is a quick demonstration of how to use a Convolutional Neural Network (CNN), specifically a pre-trained ResNet model, to analyze knee X-ray images and assess the severity of osteoarthritis using the Osteoarthritis Initiative (OAI) dataset.

## Step 1: Import Libraries
First, we need to import the necessary libraries for data handling, model building, training, and evaluation.

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models, utils

from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from tqdm import tqdm

## Step 2: Load and Prepare the Dataset
We will create the function `load_dataset_as_dataframe()` to load the OAI dataset from the two subdirectories prepared for us (`train/` and `test/`) and to organize the image paths and labels.

- We traverse through each class folder (representing osteoarthritis severity grades; `['0', '1', '2', '3', '4']`) and collect image paths and corresponding labels.
- The classes list contains all the class names, and class_to_idx maps these class names to numeric labels.
- We create a DataFrame, `data`, for easier data manipulation.

In [None]:
def load_dataset_as_dataframe(subdir, data_root='/kaggle/input/knee-osteoarthritis-dataset-with-severity'):
    """Index one split of the image dataset as a DataFrame.

    Every sub-directory of ``<data_root>/<subdir>`` is treated as one class.
    Class names are sorted and numbered from 0, and every regular file inside
    a class directory becomes one row.

    Args:
        subdir: Name of the split directory to read (e.g. ``'train'``).
        data_root: Directory that contains the split directories. Defaults to
            the Kaggle input path used by this notebook.

    Returns:
        A DataFrame with columns ``image_path`` (str) and ``label`` (int),
        ordered by class and then by file name.

    Raises:
        FileNotFoundError: If the split directory does not exist.
    """
    # Define the path to the dataset
    data_dir = f'{data_root}/{subdir}'
    print(f'Load dataset from `{subdir}` subdirectory')

    # Class directories, sorted so the name -> label mapping is stable
    classes = sorted(
        d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))
    )
    print('Classes:', classes)

    # Map class names to labels
    class_to_idx = {class_name: idx for idx, class_name in enumerate(classes)}

    image_paths = []
    labels = []
    for class_name, label in class_to_idx.items():
        class_dir = os.path.join(data_dir, class_name)
        # os.listdir order is arbitrary; sort so row order (and anything saved
        # positionally from it later) is reproducible across runs.
        for img_name in sorted(os.listdir(class_dir)):
            img_path = os.path.join(class_dir, img_name)
            if os.path.isfile(img_path):
                image_paths.append(img_path)
                labels.append(label)

    # Create a DataFrame
    data = pd.DataFrame({
        'image_path': image_paths,
        'label': labels
    })

    # Show the distribution of labels, derived from the classes actually found
    # rather than assuming exactly five of them.
    label_counts = data['label'].value_counts()
    dataset_distribution_dict = {
        label: int(label_counts.get(label, 0)) for label in class_to_idx.values()
    }
    print(dataset_distribution_dict)
    print()

    return data

## Step 3: Split the Dataset
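Using the `load_dataset_as_dataframe()` function above, we load the training and test sets as DataFrames. The dataset already ships with separate `train/` and `test/` folders, so no additional split is performed here.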

In [None]:
# Class names are the severity grades '0'..'4' (also used later as report/plot labels)
classes = [str(grade) for grade in range(5)]

# Index the two pre-made splits as DataFrames of (image_path, label)
train_df = load_dataset_as_dataframe(subdir='train')
test_df = load_dataset_as_dataframe(subdir='test')

## Step 4: Define Image Transformations
We define transformations for data augmentation and normalization.

* Images are resized to 224x224 pixels to match the input size expected by ResNet.
* Data augmentation is applied to the training set with random horizontal flips.
* Images are normalized using ImageNet mean and standard deviation values.

In [None]:
# Define image transformations
# Steps common to both pipelines: fixed 224x224 size, tensor conversion,
# and normalization with the ImageNet channel statistics.
_resize = transforms.Resize((224, 224))
_to_tensor = transforms.ToTensor()
_normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],  # ImageNet standards
    std=[0.229, 0.224, 0.225],
)

# Training adds a random horizontal flip as augmentation
train_transform = transforms.Compose(
    [_resize, transforms.RandomHorizontalFlip(), _to_tensor, _normalize]
)

# Evaluation uses the deterministic steps only
test_transform = transforms.Compose([_resize, _to_tensor, _normalize])

## Step 5: Create Custom Dataset Class
We create a custom dataset class to handle image loading and preprocessing.

* The `KneeDataset` class inherits from torch.utils.data.Dataset.
* The `__getitem__` method loads and returns an image and its label.

In [None]:
# Create custom dataset class
class KneeDataset(Dataset):
    """Dataset over a DataFrame with ``image_path`` and ``label`` columns.

    Each item is an ``(image, label)`` pair: the image is opened from
    ``image_path``, converted to RGB, and passed through ``transform`` when
    one is given.
    """

    def __init__(self, df, transform=None):
        # Drop the incoming index so positions 0..len-1 can be used as keys
        self.transform = transform
        self.df = df.reset_index(drop=True)

    def __len__(self):
        """Number of rows (images) in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the image at row ``idx`` and return it with its label."""
        label = self.df.loc[idx, 'label']
        img_path = self.df.loc[idx, 'image_path']
        image = Image.open(img_path).convert('RGB')
        if not self.transform:
            return image, label
        return self.transform(image), label

## Step 6: Create DataLoaders
We create DataLoader objects for batching and shuffling the data.

* `batch_size=32` specifies the number of samples per batch.
* `shuffle=True` randomizes the order of data every epoch in the training set.

In [None]:
# Wrap each split's DataFrame in a dataset with the matching transform pipeline
train_dataset = KneeDataset(train_df, transform=train_transform)
test_dataset = KneeDataset(test_df, transform=test_transform)

# Both loaders share batch size and worker count; only the training loader shuffles
_loader_kwargs = {'batch_size': 32, 'num_workers': 2}
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)

## Step 7: Visualize Sample Images
We can visualize a batch of training images to verify the data loading and transformations.

* We define an `imshow` function to display images after unnormalizing them.
* We use `utils.make_grid` to create a grid of images.

In [None]:
# Function to display images
def imshow(img):
    """Draw a normalized image tensor on the current matplotlib axes.

    The first axis of ``img`` is moved to the end, the ImageNet normalization
    is undone (``std * x + mean``), values are clipped to [0, 1], and the axes
    are hidden. The tensor must support ``.numpy()`` and have 3 entries along
    its first axis to match the per-channel statistics.
    """
    channel_mean = np.array([0.485, 0.456, 0.406])
    channel_std = np.array([0.229, 0.224, 0.225])

    # Move the first axis last so the per-channel stats broadcast over it
    channels_last = np.transpose(img.numpy(), (1, 2, 0))
    restored = np.clip(channel_std * channels_last + channel_mean, 0, 1)

    plt.imshow(restored)
    plt.axis('off')

# Pull the first batch the training loader yields
_batch_iter = iter(train_loader)
images, labels = next(_batch_iter)

# Tile the batch into a single image grid
out = utils.make_grid(images)

# Render the grid on a 10x10 inch figure
plt.figure(figsize=(10, 10))
imshow(out)
plt.show()

## Step 8: Load and Modify the Pre-trained ResNet Model
We load a pre-trained ResNet18 model and modify the final layer to match our number of classes.

* We replace the final fully connected layer (`model.fc`) to output the correct number of classes for our dataset.
* The ImageNet-trained weights are loaded from a local checkpoint file with `load_state_dict` (instead of `pretrained=True`, which would download them).

In [None]:
import torch
import torchvision.models as models

# Build the ResNet18 architecture, then fill it from the locally stored checkpoint
model = models.resnet18()

# Load manually downloaded weights onto the GPU when one is available, else the CPU
weights_path = "/kaggle/input/resnet/pytorch/default/1/resnet18-f37072fd.pth"  # Adjust path accordingly
_load_target = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
_checkpoint = torch.load(weights_path, map_location=_load_target)
model.load_state_dict(_checkpoint)

# Evaluation mode for now; the training loop switches back with model.train()
model.eval()

# Swap the final fully connected layer for one with an output per class
num_classes = len(classes)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, num_classes)

## Step 9: Define Loss Function and Optimizer
We set up the loss function and optimizer for training.

* `CrossEntropyLoss` is suitable for multi-class classification.
* `Adam` optimizer is used with a learning rate of 0.001.

In [None]:
# Pick the GPU when one is visible, otherwise fall back to the CPU
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
model = model.to(device)

# Cross-entropy for multi-class classification; Adam over all model parameters
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

## Step 10: Train the Model
We train the model for 25 epochs.

* We iterate over the training data, compute the loss, perform backpropagation, and update the model weights.
* `model.train()` sets the model to training mode.

In [None]:
# Number of epochs
num_epochs = 25

# Training loop with progress bars
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    # Plain Python counters: avoids the deprecated `.data` attribute and keeps
    # the accumulators off the device.
    total_correct = 0
    total_samples = 0

    # Wrap the train_loader with tqdm for a progress bar
    pbar = tqdm(train_loader, desc=f"Epoch [{epoch+1}/{num_epochs}]")
    for images, labels in pbar:
        images = images.to(device)
        labels = labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        # The loss is a batch mean, so weight it by batch size before summing
        batch_size = labels.size(0)
        batch_loss = loss.item()
        running_loss += batch_loss * batch_size

        # Calculate accuracy within the batch
        preds = outputs.argmax(dim=1)
        total_correct += (preds == labels).sum().item()
        total_samples += batch_size

        # Update progress bar description with the running accuracy
        pbar.set_postfix({
            'loss': f'{batch_loss:.4f}',
            'acc': f'{(total_correct / total_samples * 100):.2f}%'
        })

    # Average over the samples actually seen; the guard keeps an empty loader
    # from raising instead of reporting zeros.
    seen = max(total_samples, 1)
    epoch_loss = running_loss / seen
    epoch_acc = total_correct / seen
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}')

## Step 11: Evaluate the Model
We test the model on the test set and evaluate its performance.

* `model.eval()` sets the model to evaluation mode.
* We disable gradient computation with `torch.no_grad()`.
* We collect all predictions and true labels to compute accuracy and generate reports.
* The confusion matrix and classification report provide detailed insights into model performance.

In [None]:
# Evaluate the model
model.eval()
all_preds = []
all_labels = []

# Inference only: no gradients are needed
with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)

        # Predicted class = index of the largest logit in each row
        outputs = model(images)
        preds = torch.max(outputs, 1).indices

        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

# Overall accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f'Accuracy on test set: {accuracy:.4f}')

# Per-class precision / recall / F1
print('Classification Report:')
print(classification_report(all_labels, all_preds, target_names=classes))

# Raw confusion matrix
cm = confusion_matrix(all_labels, all_preds)
print('Confusion Matrix:')
print(cm)

# Heatmap of the same matrix, with class names on both axes
plt.figure(figsize=(8,6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=classes,
    yticklabels=classes,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# **Model Saving**  

In [None]:
# File (relative to the working directory) that receives the trained weights
save_path = "best_model_knee.pth"

# Save the parameters only (state_dict), not the pickled model object
_trained_state = model.state_dict()
torch.save(_trained_state, save_path)

print(f"Model saved successfully at {save_path}")


**Save X-ray Predictions for the Fusion Dataset**

In [None]:
import torch
import pandas as pd

# Restore the weights written by the saving cell and switch to inference mode
_saved_state = torch.load("best_model_knee.pth", map_location=device)
model.load_state_dict(_saved_state)
model.eval()

# Store predictions
xray_preds = []
xray_filenames = []

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        # Softmax over the class dimension turns logits into probabilities
        probs = torch.softmax(model(images), dim=1)
        # NOTE(review): column 1 is the probability of class index 1 out of the
        # five severity classes, not a binary "progressor" score - confirm that
        # this is the quantity the fusion step is meant to consume.
        xray_preds.extend(probs[:, 1].cpu().numpy())

# One row per test image, in test_loader order
xray_pred_df = pd.DataFrame({'Xray_pred': xray_preds})

# Save predictions to CSV
xray_pred_df.to_csv("Xray_predictions.csv", index=False)

print("X-ray predictions saved successfully!")


# fusion model

Step 1: Build the fusion dataset

In [None]:
import pandas as pd
import numpy as np

# Define file paths
miRNA_pred_path = "/kaggle/working/miRNA_predictions.csv"
xray_pred_path = "/kaggle/working/Xray_predictions.csv"
fusion_output_path = "/kaggle/working/fusion_dataset.csv"

# Load both prediction files
miRNA_pred_df = pd.read_csv(miRNA_pred_path)
xray_pred_df = pd.read_csv(xray_pred_path)

# Truncate both to the shorter length so they can be placed side by side.
# NOTE(review): rows are paired purely by position; nothing here checks that
# row i of each file belongs to the same subject.
min_len = min(len(miRNA_pred_df), len(xray_pred_df))
miRNA_pred_df = miRNA_pred_df.iloc[:min_len]
xray_pred_df = xray_pred_df.iloc[:min_len]

# Combine into one DataFrame
fusion_data = pd.concat([miRNA_pred_df, xray_pred_df], axis=1)

# Add labels (assuming available)
# Replace the mock labels with actual labels if they exist. Until then they are
# random placeholders; a seeded generator keeps them, and everything computed
# from them, reproducible between runs.
mock_label_rng = np.random.default_rng(42)
fusion_data['label'] = mock_label_rng.choice([0, 1], size=len(fusion_data))  # Mock labels

# Save the combined dataset
fusion_data.to_csv(fusion_output_path, index=False)

print(f"Fusion dataset created successfully at {fusion_output_path}!")


Step 2: Train and evaluate the fusion model

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Read back the fusion dataset written by the previous step
fusion_data_path = "/kaggle/working/fusion_dataset.csv"
fusion_data = pd.read_csv(fusion_data_path)

# Two inputs (one prediction per modality) and the label column as target
_fusion_features = ['miRNA_pred', 'Xray_pred']
X_fusion = fusion_data[_fusion_features]
y_fusion = fusion_data['label']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_fusion,
    y_fusion,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training part
fusion_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard predictions and class-1 probabilities for the held-out part
y_pred = fusion_model.predict(X_test)
y_prob = fusion_model.predict_proba(X_test)[:, 1]

# Report the held-out metrics
_fusion_acc = accuracy_score(y_test, y_pred)
_fusion_auc = roc_auc_score(y_test, y_prob)
print("Fusion Model Accuracy:", _fusion_acc)
print("Fusion Model AUC:", _fusion_auc)
print("Classification Report:\n", classification_report(y_test, y_pred))


# ANALYTICS

# COMPARE

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_fscore_support, classification_report
import pandas as pd

# Define evaluation function
def evaluate_model(y_true, y_pred, y_prob=None, model_name="Model"):
    """Print and return binary-classification metrics for one model.

    Args:
        y_true: Ground-truth binary labels.
        y_pred: Predicted binary labels, same length as ``y_true``.
        y_prob: Optional scores/probabilities for the positive class; when
            given, ROC-AUC is computed from them.
        model_name: Label used in the printed header.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, auc)``; ``auc`` is ``None``
        when ``y_prob`` is not supplied.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    auc = roc_auc_score(y_true, y_prob) if y_prob is not None else None

    print(f"\n{model_name} Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    # Compare against None explicitly: an AUC of exactly 0.0 is a valid result
    # and must still be reported.
    if auc is not None:
        print(f"ROC-AUC: {auc:.4f}")

    print("\nClassification Report:\n", classification_report(y_true, y_pred))

    return accuracy, precision, recall, f1, auc

# Paths to predictions stored in Kaggle's working directory
miRNA_pred_path = "/kaggle/working/miRNA_predictions.csv"
xray_pred_path = "/kaggle/working/Xray_predictions.csv"
fusion_data_path = "/kaggle/working/fusion_dataset.csv"

# Load predictions
miRNA_pred_df = pd.read_csv(miRNA_pred_path)
xray_pred_df = pd.read_csv(xray_pred_path)
fusion_data = pd.read_csv(fusion_data_path)

# The fusion dataset was truncated to the shorter of the two prediction files,
# so BOTH per-modality prediction series must be cut to its length; otherwise
# the longer file raises a length-mismatch error in the metric functions.
n_rows = len(fusion_data)

# Extract labels and predictions
y_test_miRNA = fusion_data['label']
y_prob_miRNA = miRNA_pred_df['miRNA_pred'].iloc[:n_rows]  # Probabilities
y_pred_miRNA = y_prob_miRNA.round().astype(int)  # Convert probabilities to binary predictions

y_test_xray = fusion_data['label']
y_prob_xray = xray_pred_df['Xray_pred'].iloc[:n_rows]
y_pred_xray = y_prob_xray.round().astype(int)

y_test_fusion = fusion_data['label']
fusion_model_preds = fusion_data[['miRNA_pred', 'Xray_pred']]  # Features for the fusion model

# Evaluate miRNA Model
miRNA_accuracy, miRNA_precision, miRNA_recall, miRNA_f1, miRNA_auc = evaluate_model(
    y_test_miRNA, y_pred_miRNA, y_prob_miRNA, "miRNA Model"
)

# Evaluate X-ray Model
xray_accuracy, xray_precision, xray_recall, xray_f1, xray_auc = evaluate_model(
    y_test_xray, y_pred_xray, y_prob_xray, "X-ray Model"
)

# Evaluate Fusion Model
# NOTE(review): this scores the fusion model on every row of the fusion
# dataset, including the 80% it was fitted on, so these numbers are optimistic
# compared with the held-out figures printed when the model was trained.
fusion_accuracy, fusion_precision, fusion_recall, fusion_f1, fusion_auc = evaluate_model(
    y_test_fusion,
    fusion_model.predict(fusion_model_preds),
    fusion_model.predict_proba(fusion_model_preds)[:, 1],
    "Fusion Model",
)


In [None]:
import pandas as pd

# One row per model, one column per metric, in the order miRNA / X-ray / Fusion
_model_names = ["miRNA", "X-ray", "Fusion"]
_metric_columns = {
    "Accuracy": [miRNA_accuracy, xray_accuracy, fusion_accuracy],
    "Precision": [miRNA_precision, xray_precision, fusion_precision],
    "Recall": [miRNA_recall, xray_recall, fusion_recall],
    "F1-score": [miRNA_f1, xray_f1, fusion_f1],
    "ROC-AUC": [miRNA_auc, xray_auc, fusion_auc],
}
results_df = pd.DataFrame({"Model": _model_names, **_metric_columns})

# Persist the table to the Kaggle working directory
results_path = "/kaggle/working/model_performance.csv"
results_df.to_csv(results_path, index=False)

print("\nPerformance Comparison Table:")
print(results_df)
print(f"\nPerformance results saved to {results_path}!")


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.metrics import confusion_matrix

# Define the function to plot and save confusion matrices
def plot_confusion_matrix(y_true, y_pred, model_name):
    """Plot, save and show a 2x2 confusion matrix for a binary classifier.

    Args:
        y_true: Ground-truth labels (0 = non-progressor, 1 = progressor).
        y_pred: Predicted labels, same length as ``y_true``.
        model_name: Used in the plot title and, with spaces replaced by
            underscores, in the output file name.
    """
    # Pin the label set so the matrix is always 2x2 and lines up with the two
    # tick labels, even when only one class occurs in y_true and y_pred.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    class_names = ["Non-Progressor", "Progressor"]

    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(f"Confusion Matrix - {model_name}")

    # Save the plot
    output_path = f"/kaggle/working/conf_matrix_{model_name.replace(' ', '_')}.png"
    plt.savefig(output_path)
    plt.show()

    print(f"Confusion matrix saved at: {output_path}")
# Hard class predictions of the fusion model over the full fusion feature table
y_pred_fusion = fusion_model.predict(fusion_model_preds).round().astype(int)

# One confusion matrix per model: (true labels, predicted labels, display name)
for _truth, _predicted, _display_name in (
    (y_test_miRNA, y_pred_miRNA, "miRNA Model"),
    (y_test_xray, y_pred_xray, "X-ray Model"),
    (y_test_fusion, y_pred_fusion, "Fusion Model"),
):
    plot_confusion_matrix(_truth, _predicted, _display_name)




In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier


# Function to plot and save feature importance
def plot_feature_importance(model, feature_names, title="Feature Importance", filename="feature_importance.png"):
    """Draw a horizontal bar chart of ``model.feature_importances_``.

    Bars are ordered from most to least important (top to bottom). The figure
    is written to ``/kaggle/working/<filename>`` and then shown.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Names indexed in the same order as the importances.
        title: Plot title.
        filename: Output file name inside ``/kaggle/working``.
    """
    importances = model.feature_importances_

    # Descending order of importance
    order = np.argsort(importances)[::-1]
    positions = range(len(order))
    ordered_names = [feature_names[i] for i in order]

    plt.figure(figsize=(8, 5))
    plt.barh(positions, importances[order], align="center")
    plt.yticks(positions, ordered_names)
    # Flip the y axis so the first (largest) bar is drawn at the top
    plt.gca().invert_yaxis()
    plt.xlabel("Relative Importance")
    plt.title(title)

    # Save the plot
    output_path = f"/kaggle/working/{filename}"
    plt.savefig(output_path)
    plt.show()

    print(f"Feature importance plot saved at: {output_path}")

# Feature Importance for miRNA Model (requires best_model and selected_features
# from the miRNA cell). Plotted once: a second identical call would only
# redraw the same figure and overwrite the same file.
plot_feature_importance(best_model, selected_features, "Feature Importance - miRNA Model", "feature_importance_miRNA.png")

# Feature Importance for Fusion Model (requires fusion_model from the fusion step)
plot_feature_importance(fusion_model, ["miRNA_pred", "Xray_pred"], "Feature Importance - Fusion Model", "feature_importance_fusion.png")

