<a href="https://colab.research.google.com/github/Tansiv/MULTI_OPTICS_PROJECT/blob/main/phase_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Install missing dependencies (if any)
!pip install shap lime transformers

# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import shap
import lime
import torch
from transformers import AutoModel, AutoTokenizer

# Ignore warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

print("✅ Libraries Loaded Successfully!")


Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=ddf20d2446b0197a192af3c1e9c576dc46ad933ac4364ded138c13b2acafdbf7
  Stored in directory: /root/.cache/pip/wheels/85/fa/a3/9c2d44c9f3cd77cf4e533b58900b2bf4487f2a17e8ec212a3d
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1
✅ Libraries Loaded Successfully!


In [10]:
# Load the cleaned multi-omics datasets

# Mount Google Drive so the CSV files are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Dataset locations (adjust base_path to match your own Google Drive layout)
base_path = "/content/drive/MyDrive/Personal Dataset folder/Multi-Omics Data/datasets"
genomic_file, proteomic_file, transcriptomic_file = (
    base_path + file_suffix
    for file_suffix in (
        "/Genomic_Cleaned.csv",
        "/Proteomic_Cleaned.csv",
        "/Transcriptomic_Cleaned.csv",
    )
)

# Read each CSV into its own DataFrame (genomic, proteomic, transcriptomic order)
df_genomic, df_proteomic, df_transcriptomic = map(
    pd.read_csv, (genomic_file, proteomic_file, transcriptomic_file)
)

# Report success and the (rows, columns) shape of every frame, one per line
print(
    "✅ Datasets Loaded Successfully!",
    f"Genomic Shape: {df_genomic.shape}",
    f"Proteomic Shape: {df_proteomic.shape}",
    f"Transcriptomic Shape: {df_transcriptomic.shape}",
    sep="\n",
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Datasets Loaded Successfully!
Genomic Shape: (12038, 18)
Proteomic Shape: (5531, 16915)
Transcriptomic Shape: (3875, 33)


In [11]:
# Data Preprocessing (Handle Missing Values, Scaling)

# Function to preprocess data
def preprocess_data(df, min_non_null_frac=0.5):
    """Impute, label-encode and standardise every column of ``df``.

    Steps, in order:
      1. Drop columns whose share of non-missing values is below
         ``min_non_null_frac``.
      2. Fill missing values: the first mode for non-numeric columns, the
         median for numeric (and boolean) columns.
      3. Replace each non-numeric column with integer codes from a fresh
         ``LabelEncoder``.
      4. Standardise all remaining columns with ``StandardScaler``.

    The input frame is NOT modified; a new DataFrame is returned, so calling
    this function twice on the same input yields the same result.

    Note: every surviving column is encoded and scaled, including any
    identifier or label columns present in ``df``, and the scaler is fitted
    on all rows passed in. Split the data (or separate the target) before
    calling if that is not what you want.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table to preprocess.
    min_non_null_frac : float, default 0.5
        Minimum fraction of non-missing values a column needs to be kept.

    Returns
    -------
    pandas.DataFrame
        Frame built from the scaler output, carrying the index of ``df`` and
        the labels of the columns that survived step 1.
    """
    # dropna expects an integer count; rounding up keeps the same cut-off as
    # comparing the non-null count against the fractional product.
    min_non_null = int(np.ceil(len(df) * min_non_null_frac))
    # Without inplace=True this returns a new frame and leaves `df` untouched.
    kept = df.dropna(thresh=min_non_null, axis=1)

    # Anything that is not numeric/boolean is treated as categorical.
    categorical_cols = kept.select_dtypes(exclude=['number', 'bool']).columns

    # Collect one fill value per column, then fill in a single call. This
    # avoids chained `df[col].fillna(..., inplace=True)`, which pandas has
    # deprecated and which does not update the parent under Copy-on-Write.
    fill_values = kept.median(numeric_only=True).to_dict()
    for col in categorical_cols:
        modes = kept[col].mode()
        if not modes.empty:  # an all-missing column has no mode to fill with
            fill_values[col] = modes.iloc[0]
    filled = kept.fillna(value=fill_values)

    # Encode categorical variables as integer codes
    for col in categorical_cols:
        filled[col] = LabelEncoder().fit_transform(filled[col])

    # Scale every column to zero mean / unit variance
    scaled = StandardScaler().fit_transform(filled)
    return pd.DataFrame(scaled, index=filled.index, columns=filled.columns)

# Run the shared preprocessing routine on each omics table, one after another
df_genomic = df_genomic.pipe(preprocess_data)
df_proteomic = df_proteomic.pipe(preprocess_data)
df_transcriptomic = df_transcriptomic.pipe(preprocess_data)

print("✅ Data Preprocessed Successfully!")


✅ Data Preprocessed Successfully!


In [12]:
# Combine Multi-Omics Data

# NOTE(review): this is a column-wise concatenation aligned on the row index,
# not a join on a shared sample/gene key. The three inputs have different row
# counts, so the shorter frames are padded with NaN and rows from unrelated
# tables end up side by side. Replace with a keyed merge once a common
# identifier across the datasets has been confirmed.
df_merged = pd.concat([df_genomic, df_proteomic, df_transcriptomic], axis=1)

# Shuffle the rows; the fixed seed keeps the order reproducible across runs
df_merged = df_merged.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"✅ Final Merged Dataset Shape: {df_merged.shape}")


✅ Final Merged Dataset Shape: (12038, 16966)


In [15]:
# Inspect the column labels available in the merged frame
merged_columns = df_merged.columns
print("Columns in df_merged:", merged_columns)



Columns in df_merged: Index(['Name', 'Gene(s)', 'Protein change', 'Condition(s)', 'Accession',
       'GRCh37Chromosome', 'GRCh37Location', 'GRCh38Chromosome',
       'GRCh38Location', 'VariationID',
       ...
       'NR_33WT_28_1', 'NR_33WT_28_2', 'NR_34WT_25_1', 'NR_34WT_25_2',
       'NR_34WT_29_1', 'NR_34WT_29_2', 'NR_35WT_26_1', 'NR_35WT_26_2',
       'NR_35WT_30_1', 'NR_35WT_30_2'],
      dtype='object', length=16966)


In [16]:
# List the columns of each dataset to look for a usable target column
for _heading, _frame in (
    ("Columns in Genomic:", df_genomic),
    ("Columns in Proteomic:", df_proteomic),
    ("Columns in Transcriptomic:", df_transcriptomic),
):
    print(_heading, _frame.columns)


Columns in Genomic: Index(['Name', 'Gene(s)', 'Protein change', 'Condition(s)', 'Accession',
       'GRCh37Chromosome', 'GRCh37Location', 'GRCh38Chromosome',
       'GRCh38Location', 'VariationID', 'AlleleID(s)', 'dbSNP ID',
       'Canonical SPDI', 'Variant type', 'Molecular consequence',
       'Germline classification', 'Germline date last evaluated',
       'Germline review status'],
      dtype='object')
Columns in Proteomic: Index(['Unnamed: 0', 'TFE3 (7030)', 'MMADHC (27249)', 'PABPC1L (80336)',
       'RELL2 (285613)', 'GPT (2875)', 'GUSB (2990)', 'DDB2 (1643)',
       'RPS4X (6191)', 'HINFP (25988)',
       ...
       'C14orf93 (60686)', 'NR3C2 (4306)', 'CFAP276 (127003)',
       'OR4C16 (219428)', 'OR5B17 (219965)', 'SLCO1B7 (Unknown)',
       'OR4M1 (441670)', 'ZNF781 (Unknown)', 'RUNX1 (100506403)',
       'BLTP1 (84162)'],
      dtype='object', length=16915)
Columns in Transcriptomic: Index(['gene', 'NR_14DMD_31_1', 'NR_14DMD_31_2', 'NR_14DMD_35_1',
       'NR_14DMD_35_2',

In [19]:
# Show value frequencies of the two candidate label columns
for _label_column in ('Condition(s)', 'Germline classification'):
    print(df_genomic[_label_column].value_counts())


Condition(s)
 1.212911    452
 0.563522    448
-0.380314    407
 0.207124    288
-1.095297    194
            ... 
 1.722364      1
-0.297227      1
 1.720906      1
 0.188174      1
 0.237735      1
Name: count, Length: 5121, dtype: int64
Germline classification
-1.241384    4586
 0.877513    3752
 0.612650    3556
 3.261272      53
 0.347788      29
-0.446798      14
-0.976522      14
 2.201823       4
 1.936961       4
 1.142375       4
 3.526134       4
-0.711660       4
 1.407237       3
 2.466685       3
 3.790996       2
 0.082926       2
-0.181936       1
 1.672099       1
 2.996409       1
 2.731547       1
Name: count, dtype: int64


In [20]:
# Distinct values present in the classification column, in order of appearance
print(pd.unique(df_genomic['Germline classification']))


[ 0.87751253 -1.24138441  0.61265041  0.08292618  1.40723677  1.936961
  3.79099583  2.20182312 -0.18193594  0.3477883  -0.9765223   1.67209889
  2.99640948  1.14237465  3.2612716   2.46668524  3.52613371  2.73154736
 -0.44679806 -0.71166018]


In [21]:
# Frequency of each distinct value in the classification column
germline_counts = df_genomic['Germline classification'].value_counts()
print(germline_counts)


Germline classification
-1.241384    4586
 0.877513    3752
 0.612650    3556
 3.261272      53
 0.347788      29
-0.446798      14
-0.976522      14
 2.201823       4
 1.936961       4
 1.142375       4
 3.526134       4
-0.711660       4
 1.407237       3
 2.466685       3
 3.790996       2
 0.082926       2
-0.181936       1
 1.672099       1
 2.996409       1
 2.731547       1
Name: count, dtype: int64


In [22]:
# Binary classification (simple)
#
# Collapse the classification values into two categories:
#   0 (negative) = non-disease / benign cases
#   1 (positive) = disease-related cases
# A row is labelled 1 when its value is strictly greater than 0, otherwise 0.
#
# NOTE(review): by this point the column has been label-encoded and
# standardised by preprocess_data, so "> 0" means "above the mean encoded
# value" — confirm that this actually separates benign from disease-related
# classifications.
df_genomic['Germline_label'] = (df_genomic['Germline classification'] > 0).astype('int64')




Model training

In [23]:
from sklearn.model_selection import train_test_split

# Target vector (y): the binary label derived from the classification column
y = df_genomic['Germline_label']
# Feature matrix (X): every genomic column except the raw and derived targets
X = df_genomic.drop(columns=['Germline classification', 'Germline_label'], errors='ignore')

# Hold out 20% of the rows for testing, stratified on y, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Confirm the split and report the resulting shapes, one per line
print(
    "✅ Train-Test Split Done!",
    f"Training Data Shape: {X_train.shape}",
    f"Testing Data Shape: {X_test.shape}",
    sep="\n",
)


✅ Train-Test Split Done!
Training Data Shape: (9630, 17)
Testing Data Shape: (2408, 17)
