<a href="https://colab.research.google.com/github/Shradha-vid/ClinVarHealthcareDataPrediction/blob/main/PCA_ClinVar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python
# coding: utf-8

# In[4]:


import pandas as pd
import numpy as np
from tensorflow.keras import layers, models, regularizers
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Path to the Excel file
file_path = r"/content/genetic_dataset_clinvar.xlsx"

# Load the dataset
print("Loading dataset...")
data = pd.read_excel(file_path)

# The layout assumed below (features = all but last column, target = last
# column) needs at least two columns; fail early with a clear message.
if data.shape[1] < 2:
    raise ValueError(
        f"Expected at least one feature column plus a target column, "
        f"got {data.shape[1]} column(s) in {file_path!r}."
    )

# Assuming the last column is the target (labels), and others are features.
# .copy() makes `features` an independent frame so the per-column assignment
# below does not write into a slice of `data` (which can emit pandas'
# SettingWithCopyWarning and leaves it ambiguous which frame is modified).
features = data.iloc[:, :-1].copy()
labels = data.iloc[:, -1]

# Label-encode every feature column. Each column is cast to str first, so
# numeric columns are treated as categories too and missing values become
# the literal category "nan".
for col in features.columns:
    encoder = LabelEncoder()
    features[col] = encoder.fit_transform(features[col].astype(str))

# Encode labels (target) using Label Encoding and convert to one-hot
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(labels.astype(str))

# A target with a single distinct value yields a one-column one-hot matrix;
# the softmax output layer sized from it would then have one unit, which
# always outputs 1.0, so accuracy is trivially 100% and nothing is learned.
num_classes = len(label_encoder.classes_)
if num_classes < 2:
    raise ValueError(
        f"Target column {labels.name!r} contains {num_classes} distinct class(es); "
        "at least 2 are required for classification. "
        "Check that the last column of the sheet is really the label."
    )
labels_one_hot = to_categorical(integer_encoded, num_classes=num_classes)

# Split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(
    features, labels_one_hot, test_size=0.2, random_state=42
)

# Automatically detect input shape and number of output classes
input_shape = x_train.shape[1]
output_classes = y_train.shape[1]

# L1 Regularization Model
# Two hidden ReLU layers (512 and 256 units), each with an L1 kernel penalty
# of 0.001, followed by a softmax layer with one unit per output class.
print("\nL1 Regularization:")
model_l1 = models.Sequential()
model_l1.add(layers.Input(shape=(input_shape,)))
model_l1.add(
    layers.Dense(512, activation='relu', kernel_regularizer=regularizers.l1(0.001))
)
model_l1.add(
    layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l1(0.001))
)
model_l1.add(layers.Dense(output_classes, activation='softmax'))

model_l1.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model_l1.summary()

# Fit on the training split; the last 20% of it is held out for validation.
history_l1 = model_l1.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=20,
    validation_split=0.2,
)

# Report loss/accuracy on the held-out test split first, then on the training split.
test_loss_l1, test_acc_l1 = model_l1.evaluate(x=x_test, y=y_test, verbose=0)
print(f"L1 - Test Accuracy: {test_acc_l1:.4f}, Test Loss: {test_loss_l1:.4f}")

train_loss_l1, train_acc_l1 = model_l1.evaluate(x=x_train, y=y_train, verbose=0)
print(f"L1 - Train Accuracy: {train_acc_l1:.4f}, Train Loss: {train_loss_l1:.4f}")

# L2 Regularization Model
# Two hidden ReLU layers (512 and 256 units), each with an L2 kernel penalty
# of 0.001, followed by a softmax layer with one unit per output class.
print("\nL2 Regularization:")
model_l2 = models.Sequential()
model_l2.add(layers.Input(shape=(input_shape,)))
model_l2.add(
    layers.Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.001))
)
model_l2.add(
    layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.001))
)
model_l2.add(layers.Dense(output_classes, activation='softmax'))

model_l2.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model_l2.summary()

# Fit on the training split; the last 20% of it is held out for validation.
history_l2 = model_l2.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=20,
    validation_split=0.2,
)

# Report loss/accuracy on the held-out test split first, then on the training split.
test_loss_l2, test_acc_l2 = model_l2.evaluate(x=x_test, y=y_test, verbose=0)
print(f"L2 - Test Accuracy: {test_acc_l2:.4f}, Test Loss: {test_loss_l2:.4f}")

train_loss_l2, train_acc_l2 = model_l2.evaluate(x=x_train, y=y_train, verbose=0)
print(f"L2 - Train Accuracy: {train_acc_l2:.4f}, Train Loss: {train_loss_l2:.4f}")






Loading dataset...

L1 Regularization:


Epoch 1/20


  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m402/404[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 1.6571

  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 1.0000 - loss: 1.6486 - val_accuracy: 1.0000 - val_loss: 0.0180
Epoch 2/20
[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 1.0000 - loss: 0.0180 - val_accuracy: 1.0000 - val_loss: 0.0179
Epoch 3/20
[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 1.0000 - loss: 0.0180 - val_accuracy: 1.0000 - val_loss: 0.0180
Epoch 4/20
[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 1.0000 - loss: 0.0180 - val_accuracy: 1.0000 - val_loss: 0.0178
Epoch 5/20
[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 1.0000 - loss: 0.0180 - val_accuracy: 1.0000 - val_loss: 0.0180
Epoch 6/20
[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 1.0000 - loss: 0.0180 - val

Epoch 1/20


  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m398/404[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0849

  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 1.0000 - loss: 0.0839 - val_accuracy: 1.0000 - val_loss: 5.8270e-15
Epoch 2/20
[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 1.0000 - loss: 8.3862e-16 - val_accuracy: 1.0000 - val_loss: 3.5369e-33
Epoch 3/20
[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 1.0000 - loss: 3.5760e-34 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 4/20
[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 5/20
[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 6/20
[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - a