In [None]:
!pip install Bio
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.layers import Conv2D,MaxPooling2D,Dropout,Flatten,Dense
from tensorflow.keras.metrics import binary_crossentropy,categorical_crossentropy
from tensorflow.keras.optimizers.experimental import Adam,Adadelta
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from Bio import SeqIO
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, roc_curve, auc, precision_score, recall_score, confusion_matrix
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp
from sklearn.metrics import multilabel_confusion_matrix
import joblib

In [4]:
# Print the kernel's current working directory so relative paths can be checked.
import os
print(os.getcwd())

/kaggle/working


In [None]:
# IPython line magic: list the contents of /kaggle/working.
%ls /kaggle/working

In [None]:

def extract_kmers(sequence, k):
    """Return every overlapping window of length ``k`` in ``sequence``.

    Windows are taken with stride 1, left to right, so the result has
    ``len(sequence) - k + 1`` entries (an empty list when ``k`` exceeds
    ``len(sequence)``).

    Args:
        sequence: Sliceable sequence (a string in this notebook).
        k: Window length.

    Returns:
        List of slices ``sequence[i:i + k]`` in order of their start index.
    """
    last_start = len(sequence) - k
    return [sequence[start:start + k] for start in range(last_start + 1)]

In [None]:
def create_kmer_matrix(fasta_data, all_kmers_set, k):
    """Count, for every record, how often each k-mer of a vocabulary occurs.

    Args:
        fasta_data: Sized iterable of records exposing a ``seq`` attribute
            (in this notebook, the list produced from ``SeqIO.parse``).
            Each sequence is upper-cased before counting.
        all_kmers_set: Collection of k-mers. One output column is produced
            per element, in this collection's iteration order.
        k: Window length used to slice each sequence (stride 1, overlapping).

    Returns:
        Integer ndarray of shape ``(len(fasta_data), len(all_kmers_set))``
        where entry ``[i, j]`` is the number of occurrences of the j-th k-mer
        in the i-th sequence.
    """
    from collections import Counter  # local import: only this function needs it

    # Freeze the column order once so every row is filled consistently.
    columns = list(all_kmers_set)
    kmer_matrix = np.zeros((len(fasta_data), len(columns)), dtype=int)

    for row, record in enumerate(fasta_data):
        sequence = str(record.seq).upper()
        # Single pass over the sequence; the previous list.count() per
        # vocabulary entry rescanned the whole k-mer list every time.
        counts = Counter(
            sequence[pos:pos + k] for pos in range(len(sequence) - k + 1)
        )
        for col, kmer in enumerate(columns):
            # Counter returns 0 for k-mers absent from this sequence.
            kmer_matrix[row, col] = counts[kmer]
    return kmer_matrix

In [None]:
def normalize(mRNA_kmer_matrix, lncRNA_kmer_matrix):
    """Scale both count matrices by their shared maximum and append labels.

    Both matrices are divided by the single largest value found in either of
    them, so the two classes stay on the same scale. A label column is then
    appended as the last column: 1 for every mRNA row, 0 for every lncRNA row.

    Args:
        mRNA_kmer_matrix: 2-D ndarray of k-mer counts for the mRNA records.
        lncRNA_kmer_matrix: 2-D ndarray of k-mer counts for the lncRNA records.

    Returns:
        Tuple ``(df_mrna, df_lncRNA)`` of DataFrames holding the scaled
        features followed by the label column.
    """
    # Ignore empty matrices when looking for the peak: ndarray.max() raises
    # on zero-size input.
    peaks = [
        matrix.max()
        for matrix in (mRNA_kmer_matrix, lncRNA_kmer_matrix)
        if matrix.size
    ]
    scale = max(peaks) if peaks else 1
    if scale == 0:
        # All counts are zero; dividing by 0 would fill the features with NaN.
        scale = 1

    mrna_scaled = mRNA_kmer_matrix / scale
    lncrna_scaled = lncRNA_kmer_matrix / scale

    mrna_labels = np.ones((mrna_scaled.shape[0], 1))
    lncrna_labels = np.zeros((lncrna_scaled.shape[0], 1))

    df_mrna = pd.DataFrame(np.hstack((mrna_scaled, mrna_labels)))
    df_lncRNA = pd.DataFrame(np.hstack((lncrna_scaled, lncrna_labels)))
    return df_mrna, df_lncRNA

In [None]:
def preprocess(mRNA_loc, lncRNA_loc, k_values=(1, 2, 3, 4, 5)):
    """Turn two FASTA files into labelled, normalised k-mer count tables.

    For every ``k`` in ``k_values`` the function:
      1. collects the set of k-mers seen in any (upper-cased) sequence of
         either file,
      2. counts those k-mers per record with ``create_kmer_matrix``,
      3. scales and labels the two matrices with ``normalize``
         (mRNA rows get label 1, lncRNA rows label 0, in the last column),
      4. stacks the mRNA rows on top of the lncRNA rows.

    Args:
        mRNA_loc: Path or handle of the mRNA FASTA file, passed to
            ``SeqIO.parse``.
        lncRNA_loc: Path or handle of the lncRNA FASTA file, passed to
            ``SeqIO.parse``.
        k_values: k-mer lengths to featurise. The default reproduces the
            original fixed set 1..5.

    Returns:
        Tuple with one DataFrame per entry of ``k_values``, in the same
        order (``df1, df2, df3, df4, df5`` for the default).
    """
    mRNA_data = list(SeqIO.parse(mRNA_loc, 'fasta'))
    lncRNA_data = list(SeqIO.parse(lncRNA_loc, 'fasta'))
    combined_sequences = [
        str(record.seq).upper() for record in mRNA_data + lncRNA_data
    ]

    frames = []
    for k in k_values:
        # Vocabulary is shared by both classes so their columns line up.
        vocabulary = set()
        for sequence in combined_sequences:
            vocabulary.update(extract_kmers(sequence, k))

        mRNA_kmer_matrix = create_kmer_matrix(mRNA_data, vocabulary, k)
        lncRNA_kmer_matrix = create_kmer_matrix(lncRNA_data, vocabulary, k)
        df_mrna, df_lncRNA = normalize(mRNA_kmer_matrix, lncRNA_kmer_matrix)

        frames.append(pd.concat([df_mrna, df_lncRNA], ignore_index=True))

    return tuple(frames)

In [None]:
def split_cnn(df):
    """Split a labelled feature table into CNN-ready train/validation arrays.

    The last column of ``df`` is taken as the label and all other columns as
    features. Rows are split 80/20 with a fixed ``random_state`` of 42, each
    feature vector is reshaped into a square single-channel image, and the
    labels are one-hot encoded.

    Args:
        df: DataFrame whose last column holds integer-valued class labels
            and whose remaining column count is a perfect square.

    Returns:
        Tuple ``(X_train, X_val, y_train_cat, y_val)`` where the ``X`` arrays
        have shape ``(n, dim, dim, 1)`` and the ``y`` arrays are one-hot.

    Raises:
        ValueError: If the number of feature columns is not a perfect square.
    """
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values

    n_features = X.shape[1]
    dim = int(round(n_features ** 0.5))
    if dim * dim != n_features:
        # A truncated square root would make reshape() either fail with an
        # opaque message or silently regroup values across samples.
        raise ValueError(
            f"split_cnn needs a perfect-square number of features, got {n_features}"
        )

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train = X_train.reshape(-1, dim, dim, 1)
    X_val = X_val.reshape(-1, dim, dim, 1)

    # Fix the one-hot width from the full label vector so train and validation
    # agree even if one split happens to miss the highest class.
    num_classes = int(y.max()) + 1
    y_train_cat = to_categorical(y_train, num_classes=num_classes)
    y_val = to_categorical(y_val, num_classes=num_classes)
    return X_train, X_val, y_train_cat, y_val

In [None]:
def split_ml(df):
    """Split a labelled feature table into train/validation sets for classic ML.

    The last column of ``df`` is used as the label (passed through a
    ``LabelEncoder``); every other column is a feature. Rows are split 80/20
    with ``random_state=42``.

    Args:
        df: DataFrame with features in all but the last column.

    Returns:
        Tuple ``(X_train, X_val, y_train, y_val)``.
    """
    features = df.iloc[:, :-1].values
    raw_labels = df.iloc[:, -1].values
    encoded_labels = LabelEncoder().fit_transform(raw_labels)

    train_x, val_x, train_y, val_y = train_test_split(
        features,
        encoded_labels,
        test_size=0.2,
        random_state=42,
    )
    return train_x, val_x, train_y, val_y