In [None]:
import pandas as pd
import numpy as np
from Bio import SeqIO # pip install BioPython
import re
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed, Dropout, Conv1D
from tensorflow.keras.optimizers import Adam
from tcn import TCN  # pip install keras-tcn
from sklearn.model_selection import train_test_split
import gdown # pip install gdown

In [15]:
# Input files for the pipeline. The path variables are defined unconditionally
# because main() reads fasta_output / brca1_output / brca2_output; leaving them
# commented out makes the notebook fail after "Restart Kernel -> Run All".
os.makedirs("data", exist_ok=True)

# 1. DNA sequences (COSMIC genes FASTA)
fasta_file_id = "11I3wA5XpIBwIKSWJAqBuLR490G-OxbwY"
fasta_output = "data/Cosmic_Genes_v102_GRCh38.fasta"

# 2. Mutation table the author labelled "BRCA1" (CellLinesProject genome screens TSV)
brca1_file_id = "1_fGHi4ifXKBiMOZyUc3kPqk3QqQGd8xu"
brca1_output = "data/CellLinesProject_GenomeScreensMutant_Tsv_v102_GRCh38"

# 3. Mutation table the author labelled "BRCA2" (COSMIC genome screens TSV)
brca2_file_id = "1YKFPQbGMC9kiOG4lTNjteNu1fdrWqVDy"
brca2_output = "data/Cosmic_GenomeScreensMutant_Tsv_v102_GRCh38"

# Download only what is missing, so re-running the cell does not fetch
# ~93 MB again once the files are on disk.
for _file_id, _output in (
    (fasta_file_id, fasta_output),
    (brca1_file_id, brca1_output),
    (brca2_file_id, brca2_output),
):
    if not os.path.exists(_output):
        gdown.download(f"https://drive.google.com/uc?id={_file_id}", _output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=11I3wA5XpIBwIKSWJAqBuLR490G-OxbwY
To: C:\data\Cosmic_Genes_v102_GRCh38.fasta
100%|█████████████████████████████████████████████████████████████████████████████| 92.8M/92.8M [00:24<00:00, 3.73MB/s]
Downloading...
From: https://drive.google.com/uc?id=1_fGHi4ifXKBiMOZyUc3kPqk3QqQGd8xu
To: C:\data\CellLinesProject_GenomeScreensMutant_Tsv_v102_GRCh38
100%|██████████████████████████████████████████████████████████████████████████████| 29.7k/29.7k [00:00<00:00, 867kB/s]
Downloading...
From: https://drive.google.com/uc?id=1YKFPQbGMC9kiOG4lTNjteNu1fdrWqVDy
To: C:\data\Cosmic_GenomeScreensMutant_Tsv_v102_GRCh38
100%|█████████████████████████████████████████████████████████████████████████████| 65.4k/65.4k [00:00<00:00, 1.07MB/s]


'data/Cosmic_GenomeScreensMutant_Tsv_v102_GRCh38'

In [16]:
def parse_fasta_header(header):
    """Parse a FASTA description line into transcript and locus fields.

    Expected layout (whitespace separated)::

        <name> <transcript_id> <chrom>:<start>-<end>(<strand>)

    e.g. ``"OR4F5 ENST00000641515.2 1:65419-71585(+)"``.

    Args:
        header: The description line of a FASTA record (without the ``>``).

    Returns:
        Tuple ``(transcript_id, chrom, start, end, strand)`` where ``start``
        and ``end`` are ints and ``strand`` is ``'+'`` or ``'-'``.

    Raises:
        ValueError: If the header has fewer than three fields or the third
            field does not match ``chrom:start-end(strand)``.
    """
    fields = header.split()
    if len(fields) < 3:
        raise ValueError(
            f"Malformed FASTA header (expected at least 3 fields): {header!r}"
        )

    transcript_id = fields[1]  # e.g. ENST00000641515.2
    # e.g. '1:65419-71585(+)'; the strand is the character before ')'
    locus = re.fullmatch(r'([^:]+):(\d+)-(\d+)\(([+-])\)', fields[2])
    if locus is None:
        raise ValueError(
            f"Malformed locus {fields[2]!r} in FASTA header: {header!r}"
        )

    chrom, start, end, strand = locus.groups()
    return transcript_id, chrom, int(start), int(end), strand

In [17]:
def load_fasta_sequences_with_pos(fasta_path):
    """Read a FASTA file into per-transcript sequence and position lookups.

    Each record's description is parsed with ``parse_fasta_header``.

    Args:
        fasta_path: Path (or handle) accepted by ``SeqIO.parse``.

    Returns:
        Tuple ``(seq_dict, pos_dict)``:
        ``seq_dict`` maps transcript id -> sequence as ``str``;
        ``pos_dict`` maps transcript id -> ``(start, end, strand)``.
        The chromosome parsed from the header is not stored. If a transcript
        id occurs more than once, the last record wins.
    """
    sequences_by_transcript = {}
    positions_by_transcript = {}

    for fasta_record in SeqIO.parse(fasta_path, "fasta"):
        header_fields = parse_fasta_header(fasta_record.description)
        tid, _chrom, locus_start, locus_end, locus_strand = header_fields

        sequences_by_transcript[tid] = str(fasta_record.seq)
        positions_by_transcript[tid] = (locus_start, locus_end, locus_strand)

    return sequences_by_transcript, positions_by_transcript

In [18]:
def load_mutation_data(cellline_path, cosmic_path):
    """Load two tab-separated mutation tables and stack them into one frame.

    Args:
        cellline_path: Path/handle of the first TSV file.
        cosmic_path: Path/handle of the second TSV file.

    Returns:
        A DataFrame with the rows of both files (first file first), re-indexed
        from 0 before filtering, with every row dropped that has a missing
        ``TRANSCRIPT_ACCESSION`` or ``GENOME_START``. The index is not reset
        after the drop.
    """
    required_columns = ["TRANSCRIPT_ACCESSION", "GENOME_START"]

    tables = [pd.read_csv(source, sep='\t') for source in (cellline_path, cosmic_path)]
    combined = pd.concat(tables, ignore_index=True)

    return combined.dropna(subset=required_columns)

In [19]:
def label_mutations_on_seq(mutations_df, seq_pos_dict, sequences):
    """Build per-base binary mutation labels for every transcript with mutations.

    For each transcript id in ``mutations_df['TRANSCRIPT_ACCESSION']`` that is
    present in both ``sequences`` and ``seq_pos_dict``, a 0/1 vector of the
    sequence length is created and set to 1 at the offset of every mutation:

    * forward strand: ``GENOME_START - start``
    * ``'-'`` strand: ``end - GENOME_START``

    Offsets outside ``[0, len(sequence))`` and missing (NaN) positions are
    ignored.

    NOTE(review): the offset is computed as if the sequence were one contiguous
    genomic span from ``start`` to ``end``. If the FASTA holds spliced
    transcript/cDNA sequences, this mapping is presumably wrong past the first
    exon -- confirm against the data source.

    Args:
        mutations_df: DataFrame with ``TRANSCRIPT_ACCESSION`` and
            ``GENOME_START`` columns.
        seq_pos_dict: Mapping transcript id -> ``(start, end, strand)``.
        sequences: Mapping transcript id -> sequence string.

    Returns:
        Tuple ``(X, y)``: list of sequences and list of integer label arrays,
        in order of first appearance of each transcript in ``mutations_df``.
    """
    X = []
    y = []

    # One pass over the frame instead of re-filtering it for every transcript.
    # sort=False keeps first-appearance order; NaN ids are dropped by groupby.
    grouped = mutations_df.groupby('TRANSCRIPT_ACCESSION', sort=False)
    for transcript_id, muts in grouped:
        if transcript_id not in sequences or transcript_id not in seq_pos_dict:
            continue

        seq = sequences[transcript_id]
        seq_len = len(seq)
        start_genome, end_genome, strand = seq_pos_dict[transcript_id]

        # GENOME_START is float whenever the column ever held a NaN; float
        # values cannot index an array, so keep finite values and cast to int.
        genome_pos = muts['GENOME_START'].to_numpy(dtype=float)
        genome_pos = genome_pos[np.isfinite(genome_pos)].astype(np.int64)

        if strand == '-':
            offsets = end_genome - genome_pos
        else:
            offsets = genome_pos - start_genome

        labels = np.zeros(seq_len, dtype=int)
        in_range = (offsets >= 0) & (offsets < seq_len)
        labels[offsets[in_range]] = 1

        X.append(seq)
        y.append(labels)

    return X, y

In [20]:
def sequence_to_integer(seq):
    """Encode a nucleotide string as an array of integer codes.

    Codes: A=0, C=1, G=2, T=3, N=4. Matching is case-insensitive, and any
    other character is encoded as 4 (the code for 'N').

    Args:
        seq: Iterable of single-character strings (typically a ``str``).

    Returns:
        ``np.ndarray`` with one code per character of ``seq``.
    """
    base_codes = dict(zip("ACGTN", range(5)))
    unknown_code = base_codes['N']

    codes = []
    for nucleotide in seq:
        codes.append(base_codes.get(nucleotide.upper(), unknown_code))
    return np.array(codes)

In [21]:
def encode_and_pad_sequences(X_seqs, y_labels, max_len=None, pad_token=4):
    """Integer-encode sequences and pad sequences and labels to a common length.

    Sequences are encoded with ``sequence_to_integer`` and post-padded with
    ``pad_token``. The default is 4, the code ``sequence_to_integer`` uses for
    'N', so padding is not mistaken for adenine (code 0). Labels are
    post-padded with 0.

    Args:
        X_seqs: Iterable of nucleotide strings.
        y_labels: Iterable of per-base label sequences, aligned with ``X_seqs``.
        max_len: Target length. If ``None``, the longest encoded sequence
            length is used.
        pad_token: Integer written into padded sequence positions. Pass 0 to
            reproduce the previous behaviour.

    Returns:
        Tuple ``(X_pad, y_pad, max_len)`` of two arrays and the length used.

    Raises:
        ValueError: If ``max_len`` is ``None`` and ``X_seqs`` is empty.
    """
    X_int = [sequence_to_integer(seq) for seq in X_seqs]

    if max_len is None:
        if not X_int:
            raise ValueError("Cannot infer max_len: no sequences were provided.")
        max_len = max(len(s) for s in X_int)

    X_pad = pad_sequences(X_int, maxlen=max_len, padding='post', value=pad_token)
    y_pad = pad_sequences(y_labels, maxlen=max_len, padding='post', value=0)

    return np.array(X_pad), np.array(y_pad), max_len

In [22]:
def build_tcn_model(vocab_size, embedding_dim, max_len, num_classes=2):
    """Build and compile the TCN per-position classifier.

    Layer stack: Embedding -> TCN (64 filters, kernel 3, dilations 1/2/4/8,
    causal padding, skip connections, full sequence output) -> Dropout(0.3)
    -> time-distributed softmax over ``num_classes``.

    Args:
        vocab_size: Number of distinct input tokens for the embedding.
        embedding_dim: Size of each embedding vector.
        max_len: Accepted for signature parity with the other builders; it is
            not used when constructing the model.
        num_classes: Number of output classes per position.

    Returns:
        A compiled ``Sequential`` model (sparse categorical cross-entropy,
        Adam with learning rate 0.001, accuracy metric).
    """
    net = Sequential()
    net.add(Embedding(vocab_size, embedding_dim))
    net.add(
        TCN(
            nb_filters=64,
            kernel_size=3,
            dilations=[1, 2, 4, 8],
            padding='causal',
            use_skip_connections=True,
            return_sequences=True,
        )
    )
    net.add(Dropout(0.3))
    net.add(TimeDistributed(Dense(num_classes, activation='softmax')))

    net.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(0.001),
        metrics=['accuracy'],
    )
    return net

In [23]:
def build_1dcnn_model(vocab_size, embedding_dim, max_len, num_classes=2):
    """Build and compile the 1D-CNN per-position classifier.

    Layer stack: Embedding -> Conv1D(64, kernel 5, relu, same) -> Dropout(0.3)
    -> Conv1D(128, kernel 5, relu, same) -> Dropout(0.3)
    -> Conv1D(num_classes, kernel 1, softmax, same).

    Args:
        vocab_size: Number of distinct input tokens for the embedding.
        embedding_dim: Size of each embedding vector.
        max_len: Accepted for signature parity with the other builders; it is
            not used when constructing the model.
        num_classes: Number of output classes per position.

    Returns:
        A compiled ``Sequential`` model (sparse categorical cross-entropy,
        Adam with learning rate 0.001, accuracy metric).
    """
    net = Sequential()
    net.add(Embedding(vocab_size, embedding_dim))

    # Two feature-extraction stages, each followed by dropout
    net.add(Conv1D(64, kernel_size=5, activation='relu', padding='same'))
    net.add(Dropout(0.3))
    net.add(Conv1D(128, kernel_size=5, activation='relu', padding='same'))
    net.add(Dropout(0.3))

    # Kernel-size-1 convolution acts as the per-position classification head
    net.add(Conv1D(num_classes, kernel_size=1, activation='softmax', padding='same'))

    net.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(0.001),
        metrics=['accuracy'],
    )
    return net

In [24]:
def build_bilstm_model(vocab_size, embedding_dim, max_len, num_classes=2):
    """Build and compile the bidirectional-LSTM per-position classifier.

    Layer stack: Embedding -> Bidirectional(LSTM(64, full sequence output))
    -> Dropout(0.3) -> Dense softmax over ``num_classes``.

    Args:
        vocab_size: Number of distinct input tokens for the embedding.
        embedding_dim: Size of each embedding vector.
        max_len: Accepted for signature parity with the other builders; it is
            not used when constructing the model.
        num_classes: Number of output classes per position.

    Returns:
        A compiled ``Sequential`` model (sparse categorical cross-entropy,
        Adam with learning rate 0.001, accuracy metric).
    """
    net = Sequential()
    net.add(Embedding(vocab_size, embedding_dim))
    net.add(Bidirectional(LSTM(64, return_sequences=True)))
    net.add(Dropout(0.3))
    net.add(Dense(num_classes, activation='softmax'))

    net.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(0.001),
        metrics=['accuracy'],
    )
    return net

In [27]:
def main(fasta_path=None, brca1_path=None, brca2_path=None):
    """Run the full pipeline: load data, label, encode, train and evaluate.

    Steps: load FASTA sequences and the two mutation tables, label mutated
    positions on each sequence, integer-encode and pad, split 50/50 into
    train/test, then build, fit (5 epochs, batch size 32, 20% validation
    split) and evaluate the TCN, 1D-CNN and BiLSTM models.

    Args:
        fasta_path: FASTA file; defaults to
            ``data/Cosmic_Genes_v102_GRCh38.fasta``.
        brca1_path: First mutation TSV; defaults to
            ``data/CellLinesProject_GenomeScreensMutant_Tsv_v102_GRCh38``.
        brca2_path: Second mutation TSV; defaults to
            ``data/Cosmic_GenomeScreensMutant_Tsv_v102_GRCh38``.

    The defaults are spelled out here so the function does not depend on
    notebook globals that only exist if the download cell was executed.
    """
    if fasta_path is None:
        fasta_path = "data/Cosmic_Genes_v102_GRCh38.fasta"
    if brca1_path is None:
        brca1_path = "data/CellLinesProject_GenomeScreensMutant_Tsv_v102_GRCh38"
    if brca2_path is None:
        brca2_path = "data/Cosmic_GenomeScreensMutant_Tsv_v102_GRCh38"

    print("Loading fasta sequences...")
    sequences, seq_pos_dict = load_fasta_sequences_with_pos(fasta_path)
    print(f"Loaded {len(sequences)} sequences.")

    print("Loading mutation data...")
    mutations_df = load_mutation_data(brca1_path, brca2_path)
    print(f"Loaded {len(mutations_df)} mutation records.")

    print("Labeling mutations on sequences...")
    X_seqs, y_labels = label_mutations_on_seq(mutations_df, seq_pos_dict, sequences)
    print(f"Prepared {len(X_seqs)} sequences with labels.")

    print("Encoding and padding sequences...")
    X_pad, y_pad, max_len = encode_and_pad_sequences(X_seqs, y_labels)
    print(f"Dataset shape X: {X_pad.shape}, y: {y_pad.shape}")

    print("Splitting dataset into train/test (50/50)...")
    X_train, X_test, y_train, y_test = train_test_split(X_pad, y_pad, test_size=0.5, random_state=42)

    vocab_size = 5  # A, C, G, T, N
    embedding_dim = 8
    num_classes = 2

    # Same build -> summary -> fit sequence for every architecture, in the
    # original order; models are kept so they can be evaluated together below.
    model_builders = [
        ("TCN", build_tcn_model),
        ("1D-CNN", build_1dcnn_model),
        ("BiLSTM", build_bilstm_model),
    ]
    trained_models = []
    for model_name, build_model in model_builders:
        print(f"Building {model_name} model...")
        model = build_model(vocab_size, embedding_dim, max_len, num_classes)
        model.summary()
        model.fit(X_train, y_train, validation_split=0.2, epochs=5, batch_size=32)
        trained_models.append((model_name, model))

    print("Evaluating models on test set...")
    for model_name, model in trained_models:
        print(f"{model_name}:", model.evaluate(X_test, y_test))

In [28]:
# Entry point: run the whole pipeline when this cell/module is executed
# directly rather than imported.
if __name__ == "__main__":
    main()

Loading fasta sequences...
Loaded 56474 sequences.
Loading mutation data...
Loaded 300 mutation records.
Labeling mutations on sequences...
Prepared 12 sequences with labels.
Encoding and padding sequences...
Dataset shape X: (12, 10257), y: (12, 10257)
Splitting dataset into train/test (50/50)...
Building TCN model...
Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, None, 8)           40        
                                                                 
 tcn_2 (TCN)                 (None, None, 64)          88640     
                                                                 
 dropout_8 (Dropout)         (None, None, 64)          0         
                                                                 
 time_distributed_2 (TimeDi  (None, None, 2)           130       
 stributed)                                                    