<a href="https://colab.research.google.com/github/Shivavarma11/Cancer-classification-thorough-biomarker-identification/blob/main/cancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# @title Default title text
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import mutual_info_classif
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy.spatial.distance import mahalanobis
import random

# Step 1: Simulate Gene Expression Dataset
# Create a synthetic dataset with 2000 genes and 62 samples (as in the Colon dataset)
np.random.seed(42)
data, labels = make_classification(n_samples=62, n_features=2000, n_informative=10,
                                   n_classes=2, random_state=42)

print("Initial dataset shape:", data.shape)

# Step 2: Feature Selection Using Mutual Information (mRMRe)
def select_informative_genes(data, labels, top_k=10):
    mi_scores = mutual_info_classif(data, labels)
    top_gene_indices = mi_scores.argsort()[-top_k:]
    return data[:, top_gene_indices], top_gene_indices

selected_data, selected_gene_indices = select_informative_genes(data, labels, top_k=10)
print("Selected informative genes indices:", selected_gene_indices)
print("Reduced dataset shape:", selected_data.shape)

# Step 3: Genetic Algorithm for Optimal Gene Selection
def initialize_population(size, num_features):
    return np.random.randint(2, size=(size, num_features))


def fitness_function(population, data, labels):
    scores = []
    for individual in population:
        selected_features = [i for i, bit in enumerate(individual) if bit == 1]
        if not selected_features:
            scores.append(0)  # Penalize if no features are selected
            continue
        reduced_data = data[:, selected_features]
        accuracy = evaluate_svm_accuracy(reduced_data, labels)
        scores.append(accuracy)
    return np.array(scores)


def evaluate_svm_accuracy(data, labels):
    if data.shape[1] == 0:
        return 0
    model = SVC(kernel='rbf')
    model.fit(data, labels)
    predictions = model.predict(data)
    return accuracy_score(labels, predictions)


def genetic_algorithm(data, labels, num_generations=50, population_size=20, crossover_prob=0.8, mutation_prob=0.1):
    num_features = data.shape[1]
    population = initialize_population(population_size, num_features)

    for generation in range(num_generations):
        fitness_scores = fitness_function(population, data, labels)

        # Selection based on fitness
        selected_indices = np.argsort(fitness_scores)[-int(population_size / 2):]
        mating_pool = population[selected_indices]

        # Crossover
        next_generation = []
        while len(next_generation) < population_size:
            parent1, parent2 = random.choices(mating_pool, k=2)
            if random.random() < crossover_prob:
                crossover_point = random.randint(1, num_features - 1)
                child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
                child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
                next_generation.extend([child1, child2])
            else:
                next_generation.extend([parent1, parent2])

        # Mutation
        for individual in next_generation:
            if random.random() < mutation_prob:
                mutation_point = random.randint(0, num_features - 1)
                individual[mutation_point] = 1 - individual[mutation_point]

        population = np.array(next_generation[:population_size])

        # Display generation info
        best_fitness = np.max(fitness_scores)
        print(f"Generation {generation + 1}: Best fitness score = {best_fitness:.4f}")

    # Return the best individual
    best_individual_idx = np.argmax(fitness_scores)
    return population[best_individual_idx]

# Run Genetic Algorithm
best_solution = genetic_algorithm(selected_data, labels)
selected_features = [i for i, bit in enumerate(best_solution) if bit == 1]
print("Best gene indices after GA:", selected_features)
print("Final selected gene expression shape:", selected_data[:, selected_features].shape)

# Step 4: Classification with SVM
final_data = selected_data[:, selected_features]
final_model = SVC(kernel='rbf')
final_model.fit(final_data, labels)
predictions = final_model.predict(final_data)

accuracy = accuracy_score(labels, predictions)
conf_matrix = confusion_matrix(labels, predictions)

print("Final Classification Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)


Initial dataset shape: (62, 2000)
Selected informative genes indices: [ 357 1555 1129 1575  105   68 1505 1989  664 1449]
Reduced dataset shape: (62, 10)
Generation 1: Best fitness score = 0.9355
Generation 2: Best fitness score = 0.9355
Generation 3: Best fitness score = 0.9516
Generation 4: Best fitness score = 0.9355
Generation 5: Best fitness score = 0.9355
Generation 6: Best fitness score = 0.9516
Generation 7: Best fitness score = 0.9355
Generation 8: Best fitness score = 0.9516
Generation 9: Best fitness score = 0.9516
Generation 10: Best fitness score = 0.9516
Generation 11: Best fitness score = 0.9516
Generation 12: Best fitness score = 0.9516
Generation 13: Best fitness score = 0.9516
Generation 14: Best fitness score = 0.9516
Generation 15: Best fitness score = 0.9516
Generation 16: Best fitness score = 0.9516
Generation 17: Best fitness score = 0.9516
Generation 18: Best fitness score = 0.9516
Generation 19: Best fitness score = 0.9516
Generation 20: Best fitness score = 0.

In [None]:
# Classify and display results for each sample
predictions = final_model.predict(final_data)

for i, prediction in enumerate(predictions):
    result = "Cancer" if prediction == 1 else "Healthy"
    print(f"Sample {i + 1}: {result}")

Sample 1: Healthy
Sample 2: Cancer
Sample 3: Cancer
Sample 4: Healthy
Sample 5: Healthy
Sample 6: Cancer
Sample 7: Healthy
Sample 8: Cancer
Sample 9: Healthy
Sample 10: Healthy
Sample 11: Healthy
Sample 12: Cancer
Sample 13: Cancer
Sample 14: Cancer
Sample 15: Cancer
Sample 16: Healthy
Sample 17: Healthy
Sample 18: Cancer
Sample 19: Healthy
Sample 20: Healthy
Sample 21: Healthy
Sample 22: Cancer
Sample 23: Cancer
Sample 24: Cancer
Sample 25: Healthy
Sample 26: Healthy
Sample 27: Cancer
Sample 28: Cancer
Sample 29: Cancer
Sample 30: Healthy
Sample 31: Cancer
Sample 32: Healthy
Sample 33: Healthy
Sample 34: Cancer
Sample 35: Cancer
Sample 36: Cancer
Sample 37: Cancer
Sample 38: Healthy
Sample 39: Healthy
Sample 40: Healthy
Sample 41: Healthy
Sample 42: Healthy
Sample 43: Cancer
Sample 44: Healthy
Sample 45: Healthy
Sample 46: Healthy
Sample 47: Cancer
Sample 48: Healthy
Sample 49: Healthy
Sample 50: Cancer
Sample 51: Cancer
Sample 52: Healthy
Sample 53: Cancer
Sample 54: Healthy
Sample 5

In [None]:
# SVM classifier for cancer stage prediction
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(final_data, labels, test_size=0.2, random_state=42)

# Train the model for multiclass classification
stage_classifier = SVC(kernel='rbf')
stage_classifier.fit(X_train, y_train)

# Make predictions
stage_predictions = stage_classifier.predict(X_test)

# Show detailed report
print("Cancer Stage Classification Report:")
unique_classes = np.unique(y_test)
target_names = [f"Stage {stage}" for stage in unique_classes]
print(classification_report(y_test, stage_predictions, target_names=target_names))



Cancer Stage Classification Report:
              precision    recall  f1-score   support

     Stage 0       0.78      1.00      0.88         7
     Stage 1       1.00      0.67      0.80         6

    accuracy                           0.85        13
   macro avg       0.89      0.83      0.84        13
weighted avg       0.88      0.85      0.84        13



In [None]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import random

# Step 1: Simulate Gene Expression Dataset
np.random.seed(42)
data, labels = make_classification(n_samples=100, n_features=2000, n_informative=10,
                                   n_classes=2, random_state=42)
print("Initial dataset shape:", data.shape)

# Step 1: Binary Classification (Healthy vs Cancer)
def binary_classification(data, labels):
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

    # Train SVM for binary classification
    binary_model = SVC(kernel='rbf')
    binary_model.fit(X_train, y_train)
    binary_predictions = binary_model.predict(X_test)

    # Display results
    accuracy = accuracy_score(y_test, binary_predictions)
    print("Binary Classification (Healthy vs Cancer) Accuracy:", accuracy)
    print("Confusion Matrix:\n", confusion_matrix(y_test, binary_predictions))

    return binary_model, X_test, binary_predictions, y_test

binary_model, X_test, binary_predictions, y_test = binary_classification(data, labels)

# Step 2: Simulate Cancer Stages Dataset
# Assume stages are labeled as 0 (Stage 0), 1 (Stage I), 2 (Stage II), 3 (Stage III), 4 (Stage IV)
np.random.seed(42)
stage_labels = np.random.randint(0, 5, size=data.shape[0])

# Use only cancer samples for stage classification
cancer_indices = np.where(labels == 1)[0]
stage_data = data[cancer_indices]
stage_labels = stage_labels[cancer_indices]

# Multiclass Cancer Stage Prediction
def cancer_stage_classification(stage_data, stage_labels):
    X_train, X_test, y_train, y_test = train_test_split(stage_data, stage_labels, test_size=0.2, random_state=42)

    # Train SVM for multiclass classification
    stage_classifier = SVC(kernel='rbf')
    stage_classifier.fit(X_train, y_train)

    # Make predictions
    stage_predictions = stage_classifier.predict(X_test)

    # Dynamic target names based on available classes
    unique_classes = sorted(np.unique(y_test))
    target_names = [f"Stage {stage}" for stage in unique_classes]
    class_to_name_mapping = dict(zip(unique_classes, target_names))

    # Display classification report with zero_division parameter to handle precision warnings
    print("Cancer Stage Classification Report:")
    print(classification_report(y_test, stage_predictions, target_names=target_names, zero_division=1))

    return stage_classifier, class_to_name_mapping

# Run stage classification only for detected cancer cases
stage_classifier, class_to_name_mapping = cancer_stage_classification(stage_data, stage_labels)

# Final Prediction Integration
print("\nFinal Predictions:")
for i in range(len(X_test)):
    if binary_predictions[i] == 0:
        print(f"Sample {i + 1}: Healthy")
    else:
        stage_prediction = stage_classifier.predict(X_test[i].reshape(1, -1))
        stage_name = class_to_name_mapping.get(stage_prediction[0], "Unknown Stage")
        print(f"Sample {i + 1}: Cancer - Predicted Stage: {stage_name}")


Initial dataset shape: (100, 2000)
Binary Classification (Healthy vs Cancer) Accuracy: 0.6
Confusion Matrix:
 [[8 2]
 [6 4]]
Cancer Stage Classification Report:
              precision    recall  f1-score   support

     Stage 0       1.00      0.00      0.00         3
     Stage 1       1.00      0.00      0.00         3
     Stage 3       0.20      1.00      0.33         2
     Stage 4       1.00      0.00      0.00         2

    accuracy                           0.20        10
   macro avg       0.80      0.25      0.08        10
weighted avg       0.84      0.20      0.07        10


Final Predictions:
Sample 1: Cancer - Predicted Stage: Stage 3
Sample 2: Healthy
Sample 3: Healthy
Sample 4: Healthy
Sample 5: Healthy
Sample 6: Healthy
Sample 7: Healthy
Sample 8: Cancer - Predicted Stage: Stage 4
Sample 9: Cancer - Predicted Stage: Stage 3
Sample 10: Cancer - Predicted Stage: Stage 3
Sample 11: Healthy
Sample 12: Healthy
Sample 13: Healthy
Sample 14: Healthy
Sample 15: Healthy
Samp