<a href="https://colab.research.google.com/github/Soham-Chaudhuri/ViralAI/blob/main/Viral_Gene_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from collections import Counter
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def gc_content(sequence):
    """Return the percentage (0-100) of bases in ``sequence`` that are G or C.

    Matching is case-sensitive: only upper-case ``'G'`` and ``'C'`` are counted.
    An empty sequence yields ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    total_bases = len(sequence)
    if total_bases == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    gc_count = sequence.count('G') + sequence.count('C')
    return gc_count / total_bases * 100

def at_content(sequence):
    """Return the percentage (0-100) of bases in ``sequence`` that are A or T.

    Matching is case-sensitive: only upper-case ``'A'`` and ``'T'`` are counted.
    An empty sequence yields ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    total_bases = len(sequence)
    if total_bases == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    at_count = sequence.count('A') + sequence.count('T')
    return at_count / total_bases * 100

def kmer_frequencies(sequence, k=3):
    """Count every overlapping substring of length ``k`` in ``sequence``.

    Returns a ``Counter`` mapping each k-mer to its number of occurrences;
    it is empty when ``sequence`` is shorter than ``k``.
    """
    window_count = len(sequence) - k + 1
    counts = Counter()
    for start in range(window_count):
        counts[sequence[start:start + k]] += 1
    return counts

def molecular_weight(sequence):
    """Sum the per-nucleotide weights of ``sequence``.

    Only upper-case ``A``, ``T``, ``C`` and ``G`` contribute; every other
    character is ignored.
    """
    nucleotide_weights = {'A': 331.2, 'T': 322.2, 'C': 307.2, 'G': 347.2}
    # Collect the weights in sequence order, skipping unknown symbols.
    known_weights = [
        nucleotide_weights[base]
        for base in sequence
        if base in nucleotide_weights
    ]
    return sum(known_weights)

def hydrophobicity(sequence):
    """Return ``(hydrophobic_fraction, hydrophilic_fraction)`` for ``sequence``.

    A and T are counted as hydrophobic, C and G as hydrophilic; each count is
    divided by the full sequence length, so other characters lower both
    fractions. An empty sequence yields ``(0.0, 0.0)`` instead of raising
    ``ZeroDivisionError``.
    """
    length = len(sequence)
    if length == 0:
        # No bases at all: both fractions are defined as zero.
        return 0.0, 0.0
    hydrophobic_nucleotides = {'A', 'T'}
    hydrophilic_nucleotides = {'C', 'G'}
    hydrophobic_score = sum(1 for nuc in sequence if nuc in hydrophobic_nucleotides)
    hydrophilic_score = sum(1 for nuc in sequence if nuc in hydrophilic_nucleotides)
    return hydrophobic_score / length, hydrophilic_score / length

def net_charge(sequence):
    """Return the signed score of ``sequence``: +1 per A/T, -1 per C/G.

    Characters other than upper-case A, T, C and G contribute nothing.
    """
    charge_by_base = {'A': 1, 'T': 1, 'C': -1, 'G': -1}
    return sum(charge_by_base.get(base, 0) for base in sequence)

def dinucleotide_frequencies(seq):
    """Return the relative frequency of each overlapping 2-base window in ``seq``.

    The result maps every dinucleotide that occurs to ``count / number_of_windows``;
    sequences shorter than two bases give an empty dict.
    """
    pairs = [seq[start:start + 2] for start in range(len(seq) - 1)]
    total_pairs = len(pairs)
    return {pair: count / total_pairs for pair, count in Counter(pairs).items()}

def sequence_entropy(seq):
    """Return the Shannon entropy (in bits) of the symbol distribution of ``seq``."""
    length = len(seq)
    symbol_counts = Counter(seq)
    # One p * log2(p) term per distinct symbol, in first-seen order.
    terms = [
        (count / length) * np.log2(count / length)
        for count in symbol_counts.values()
    ]
    return -sum(terms)
def Kmers_contribution(seq, size=6):
    """Return every overlapping window of ``size`` characters in ``seq``, lower-cased.

    The windows are returned as a list in left-to-right order; the list is
    empty when ``seq`` is shorter than ``size``.
    """
    kmers = []
    for start in range(len(seq) - size + 1):
        kmers.append(seq[start:start + size].lower())
    return kmers

def feature_extraction_pipeline(seq):
    """Turn one DNA sequence into a single-row feature DataFrame.

    Newlines are stripped from ``seq``; the sequence is then described by the
    TF-IDF encoding of its space-joined 6-mers followed by six numeric
    features (GC %, AT %, molecular weight, hydrophobic / hydrophilic
    fraction and entropy).

    Relies on the module-level name ``tfidf`` being bound to an
    already-fitted vectorizer before this function is called.
    """
    seq = seq.replace('\n', '')

    # Values are evaluated left to right, in the same order as the columns below.
    row = [
        ' '.join(Kmers_contribution(seq)),
        gc_content(seq),
        at_content(seq),
        molecular_weight(seq),
        *hydrophobicity(seq),
        sequence_entropy(seq),
    ]

    df = pd.DataFrame(columns=['Sequence', 'GC', 'AT', 'Molecular Wt', 'Hydrophobic Score', 'Hydrophilic Score', 'Sequence Entropy'])
    df.loc[len(df)] = row

    # Text part: TF-IDF weights of the k-mer "words".
    tfidf_matrix = tfidf.transform(df['Sequence']).toarray()
    tfidf_names = tfidf.get_feature_names_out().tolist()

    # Numeric part: everything except the k-mer text column.
    numeric_part = df.drop(columns=['Sequence'])

    combined = np.hstack((tfidf_matrix, numeric_part.values))
    return pd.DataFrame(combined, columns=tfidf_names + numeric_part.columns.tolist())

def transform_features(seq):
  """Placeholder with no implementation yet; ignores ``seq`` and returns ``None``."""
  return None

In [None]:
pip install dill

Collecting dill
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Downloading dill-0.3.9-py3-none-any.whl (119 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/119.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m92.2/119.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.4/119.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dill
Successfully installed dill-0.3.9


In [None]:
# Load the saved model bundle with joblib and re-serialize it with dill.
# NOTE(review): joblib.load unpickles arbitrary Python objects; only load
# files from a trusted source.
import joblib
model = joblib.load('Viral_Gene_Prediction_Model.pkl')
# Bare expression with no effect here; presumably left over from inspecting the bundle.
model
import dill
# Write the same object out again in dill format under a new file name.
with open('Viral_Gene_Prediction_Model_dill.pkl', 'wb') as file:
    dill.dump(model, file)

In [None]:
model['TF-IDF Vectorizer']

In [None]:
# Pull the two preprocessing objects out of the loaded bundle.
# `tfidf` is the global that feature_extraction_pipeline reads when it runs.
tfidf = model['TF-IDF Vectorizer']
scaler = model['Standard Scaler']

In [None]:
dna = "GAATCAGACTGCGACAGTTCGAGTTTGAAGCGAAAGCTAGCAACAGTATCAACAGGTTTTATTTTGGATTTGGAAACGAGAGTTTCTGGTCATGAAAAACCCAAAAAAGAAATCCGGAGGATTCCGGATTGTCAATATGCTAAAACGCGGAGTAGCCCGTGTGAGCCCCTTTGGGGGCTTGAAGAGGCTGCCAGCCGGACTTCTGCTGGGTCATGGGCCCATCAGGATGGTCTTGGCGATTCTAGCCTTTTTGAGGTTCACGGCAATCAAGCCATCACTGGGTCTCATCAATAGATGGGGTTCAGTGGGGAAAAAAGAGGCTATGGAAATAATAAAGAAGTTCAAGAAAGATCTGGCTGCCATGCTGAGAATAATCAATGCTAGGAAGGAGAAGAAGAGACGAGGCGCAGATACTAGTGTCGGAATTGTTGGCCTCCTGCTGACCACAGCTATGGCAGCGGAGGTCACTAGACGTGGGAGTGCATACTATATGTACTTGGACAGAAGCGATGCTGGGGAGGCCATATCTTTTCCAACCACACTGGGGATGAATAAGTGTTATATACAGATCATGGATCTTGGACACATGTGTGATGCCACCATGAGCTATGAATGCCCTATGCTGGATGAGGGGGTAGAACCAGATGACGTCGATTGTTGGTGCAACACGACGTCAGCTTGGGTTGTGTACGGAACCTGCCATCACAAAAAAGGTGAAGCACGGAGATCTAGAAGAGCTGTGACGCTCCCCTCCCATTCCACTAGGAAGCTGCAAACGCGGTCGCAGACCTGGTTGGAGTCAAGAGAATACACAAAGCACTTGATTAGAGTCGAAAATTGGATATTCAGGAACCCTGGCTTCGCGTTAGCAGCAGCTGCCATCGCTTGGCTTTTGGGAAGCTCAACGAGCCAAAAAGTCATATACTTGGTCATGATACTGCTGATTGCCCCGGCATACAGCATCAGGTGCATAGGAGTCAGCAATAGGGACTTTGTGGAAGGTATGTCAGGTGGGACTTGGGTTGATGTTGTCTTGGAACATGGAGGTTGTGTCACCGTAATGGCACAGGACAAACCGACTGTCGACATAGAGCTGGTTACAACAACAGTCAGCAACATGGCGGAGGTAAGATCCTACTGCTATGAGGCATCAATATCGGACATGGCTTCGGACAGCCGCTGCCCAACACAAGGTGAAGCCTACCTTGACAAGCAATCAGATACCCAATATGTCTGCAAAAGAACGTTAGTGGACAGAGGCTGGGGAAATGGATGTGGACTTTTTGGAAAAGGGAGCCTGGTGACATGCGCTAAGTTTGCATGCTCCAAGAAAATGACCGGGAAGAGCATCCAGCCAGAGAATCTGGAGTACCGGATAATGCTGTCAGTCCATGGCTCCCAGCACAGTGGGATGATCGTTAATGACACAGGACATGAAACTGATGAGAATAGAGCGAAGGTTGAGATAACGCCCAATTCACCAAGAGCCGAAGCCACCCTGGGGGGTTTTGGAAGCCTAGGACTTGATTGTGAACCGAGGACAGGCCTTGACTTTTCAGATTTGTATTACTTGACTATGAATAACAAGCACTGGTTGGTTCACAAGGAGTGGTTCCACGACATTCCATTACCTTGGCACGCTGGGGCAGACACCGGAACTCCACACTGGAACAACAAAGAAGCACTGGTAGAGTTCAAGGACGCACATGCCAAAAGGCAAACTGTCGTGGTTCTAGGGAGTCAAGAAGGAGCAGTTCACACGGCCCTTGCTGGAGCTCTGGAGGCTGAGATGGATGGTGCAAAGGGAAGGCTGTCCTCTGGCCACTTGAAATGTCGCCTGAAAATGGACAAACTTAGATTGAAGGGCGTGTCATACTCCTTGTGTACCGCAGCGTTCACATTCACCAAGATCCCGGCTGAAACACTGCACGGGACAGTCACAGTGGAGGTACAGTACGCAGGGACAGATGGACCCTGCAAGGTTCCAGCTCAG
ATGGCGGTGGACATGCAAACTCTGACCCCAGTTGGGAGGTTGATAACCGCTAACCCCGTAATCACTGAAAGCACTGAGAACTCTAAGATGATGCTGGAACTTGATCCACCATTTGGGGACTCTTACATTGTCATAGGAGTCGGGGAGAAGAAGATCACCCACCACTGGCACAGGAGTGGCAGTACCATTGGAAAAGCATTTGAAGCCACTGTGAGAGGTGCCAAGAGAATGGCAGTCTTGGGAGACACAGCCTGGGACTTTGGATCAGTTGGAGGCGCTCTCAACTCATTGGGCAAGGGCATCCATCAAATTTTTGGAGCAGCTTTCAAATCATTGTTTGGAGGAATGTCCTGGTTCTCACAAATTCTCATTGGAACGTTGCTGATGTGGTTGGGCCTGAACACAAAGAATGGATCTATTTCCCTTATGTGCTTGGCCTTAGGGGGAGTGTTGATCTTCTTATCCACAGCCGTCTCTGCTGATGTGGGGTGCTCGGTGGACTTCTCAAAGAAGGAAACGAGATGCGGTACAGGGGTGTTCGTCTATAACGACGTTGAAGCCTGGAGGGACAGGTACAAGTACCATCCTGACTCCCCCCGTAGATTGGCAGCAGCAGTCAAGCAAGCCTGGGAAGATGGTATCTGTGGGATCTCCTCTGTTTCAAGAATGGAAAACATCATGTGGAGATCAGTAGAAGGGGAGCTCAACGCAATCCTGGAAGAAAATGGAGTTCAACTGACGGTCGTTGTGGGATCTGTAAAAAACCCCATGTGGAGAGGTCCACAGAGACTGCCCGTGCCTGTGAACGAGCTGCCCCACGGCTGGAAGGCTTGGGGGAAATCGTACTTCGTCAGAGCAGCAAAGACAAATAACAGCTTTGTCGTGGATGGTGACACACTGAAGGAATGCCCACTCAAACATAGAGCATGGAACAGCTTTCTTGTGGAGGATCATGGGTTCGGGGTATTTCACACTAGTGTCTGGCTCAAGGTTAGAGAAGATTATTCATTAGAGTGTGATCCTGCCGTTATTGGAACAGCTGTTAAGGGAAAGGAGGCTGCACACAGTGATCTAGGCTACTGGATTGAGAGTGAGAAGAATGACACATGGAGGCTGAAGAGGGCCCACCTGATCGAGATGAAAACATGTGAATGGCCAAAGTCCCACACATTGTGGACAGATGGAATAGAAGAGAGTGATCTGATCATACCCAAGTCTTTAGCTGGGCCACTCAGCCATCACAACACCAGAGAGGGCTACAGGACCCAAATGAAAGGGCCATGGCACAGTGAAGAGCTTGAAATTCGGTTTGAGGAATGCCCAGGCACTAAGGTCCACGTGGAGGAAACATGTGGAACAAGAGGACCGTCTCTGAGATCAACCACTGCAAGCGGAAGGGTGATCGAGGAATGGTGCTGCAGGGAGTGCACAATGCCCCCACTGTCGTTCCGGGCTAAAGATGGCTGTTGGTATGGAATGGAGATAAGGCCCAGGAAAGAACCAGAAAGTAACTTAGTAAGGTCAATGGTGACTGCAGGATCAACTGATCACATGGATCACTTCTCCCTTGGAGTGCTTGTGATTCTGCTCATGGTGCAGGAAGGGCTGAAGAAGAGAATGACCACAAAGATCATCATAAGCACATCAATGGCAGTGCTGGTAGCTATGATCCTGGGAGGATTTTCAATGAGTGACCTGGCTAAGCTTGCAATTTTGATGGGTGCCACCTTCGCGGAAATGAACACTGGAGGAGATGTAGCTCATCTGGCGCTGATAGCGGCATTCAAAGTCAGACCAGCGTTGCTGGTATCTTTCATCTTCAGAGCTAATTGGACACCCCGTGAAAGCATGCTGCTGGCCTTGGCCTCGTGTCTTTTGCAAACTGCGATTTCCGCCTTGGAAGGCGACCTGATGGTTCTCATCAATGGTTTTGCTCTGGCCTGGTTGGCAATACGAGCGATGGTTGTTCCACGCACTGACAACATCACCTTGGCAATCCT
GGCTGCTCTGACACCACTGGCCCGGGGCACACTGCTTGTGGCGTGGAGAGCAGGCCTTGCTACTTGCGGGGGGTTCATGCTCCTCTCTCTGAAGGGAAAAGGCAGTGTGAAGAAGAACTTACCATTTGTCATGGCCCTGGGACTAACCGCTGTGAGGCTAGTCGACCCCATCAACGTGGTGGGACTGCTGTTGCTCACAAGGAGTGGGAAGCGGAGCTGGCCCCCTAGCGAAGTACTCACAGCTGTTGGCCTGATATGCGCATTGGCTGGAGGGTTCGCCAAGGCAGATATAGAGATGGCTGGGCCCATGGCCGCGGTTGGTCTGCTAATTGTCAGTTACGTGGTCTCAGGAAAGAGTGTGGACATGTACATTGAAAGAGCAGGTGACATCACATGGGAAAAAGATGCGGAAGTCACTGGAAACAGTCCCCGGCTCGATGTGGCACTAGATGAGAGTGGTGATTTCTCCCTGGTGGAGGATGACGGTCCCCCCATGAGAGAGATCATACTTAAAGTGGTCCTGATGACCATCTGTGGCATGAACCCAATAGCCATACCCTTTGCAGCTGGAGCGTGGTACGTATACGTGAAGACTGGAAAAAGGAGTGGTGCTCTATGGGATGTGCCTGCTCCCAAGGAAGTAAAAAAGGGGGAGACCACAGATGGAGTGTACAGAGTAATGACTCGTAGACTGCTAGGTTCAACACAAGTTGGAGTGGGAGTCATGCAAGAGGGGGTCTTTCACACTATGTGGCACGTCACAAAAGGATCCGCGCTGAGAAGCGGTGAAGGGAGACTTGATCCATACTGGGGAGATGTCAAGCAGGATCTGGTGTCATACTGTGGTCCATGGAAGCTAGATGCCGCCTGGGACGGGCACAGCGAGGTGCAGCTCTTGGCCGTGCCCCCCGGAGAGAGAGCGAGGAACATCCAGACTCTGCCCGGAATATTTAAGACAAAGGATGGGGACATTGGAGCGGTTGCGCTGGACTACCCAGCAGGAACTTCAGGATCTCCAATCCTAGACAAGTGTGGGAGAGTGATAGGACTCTATGGCAATGGGGTCGTGATCAAAAATGGGAGTTATGTTAGTGCCATCACCCAAGGGAGGAGGGAGGAAGAGACTCCTGTTGAGTGCTTCGAGCCTTCGATGCTGAAGAAGAAGCAGCTAACTGTCTTAGACTTGCATCCTGGAGCTGGGAAAACCAGGAGAGTTCTTCCTGAAATAGTCCGTGAAGCCATAAAAACAAGACTCCGTACTGTGATCTTAGCTCCAACTAGGGTTGTCGCTGCTGAAATGGAGGAAGCCCTTAGAGGGCTTCCAGTGCGTTATATGACAACAGCAGTCAATGTCACCCACTCTGGGACAGAAATCGTTGACTTAATGTGCCATGCCACCTTCACTTCACGTCTACTACAGCCAATCAGAGTCCCCAACTATAATCTGTATATTATGGATGAGGCCCACTTCACAGATCCCTCAAGTATAGCAGCAAGAGTATACATTTCAACAAGGGTTGAGATGGGCGAGGCGGCTGCCATCTTCATGACCGCCACGCCACCAGGAACCCGTGACGCATTTCCGGACTCCAACTCACCAATTATGGACACCGAAGTGGAAGTCCCGGAGAGAGCCTGGAGCTCAGGCTTTGATTGGGTGACGGACCATTCTGGAAAAACAGTTTGGTTTGTTCCAAGCGTGAGGAACGGCAATGAGATCGCAGCTTGTCTGACGAAGGCTGGAAAACGGGTCATACAGCTCAGCAGAAAGACTTTTGAGACAGAGTTCCAGAAAACAAAACATCAAGAGTGGGACTTTGTCGTGACAACTGACATTTCAGAGATGGGCGCCAACTTTAAAGCTGACCGTGTCATAGATTCCAGGAGATGCCTAAAGCCGGTCATACTTGATGGCGAGAGAGTCATTCTGGCTGGACCCATGCCTGTCACACATGCCAGCGCTGCCCAGAGGAGGGGGCGCATAGGCAGGAATCCCAACA
AACCTGGAGATGAGTATCTGTATGGAGGTGGGTGCGCAGAGACTGATGAAGACCATGCACACTGGCTTGAAGCAAGAATGCTCCTTGACAATATCTACCTCCAAGATGGCCTCATAGCCTCGCTCTATCGACCTGAGGCCGACAAAGTAGCAGCCATTGAGGGAGAGTTCAAGCTTAGGACGGAGCAAAGGAAGACCTTTGTGGAACTCATGAAAAGAGGAGATCTTCCTGTTTGGCTGGCCTATCAGGTTGCATCTGCCGGAATAACCTACACAGATAGAAGATGGTGCTTTGATGGCACGACCAACAACACCATAATGGAAGACAGTGTGCCGGCAGAGGTGTGGACCAGATACGGAGAGAAAAGAGTGCTCAAACCGAGGTGGATGGACGCCAGAGTCTGTTCAGATCATGCGGCCCTGAAGTCATTCAAGGAGTTTGCCGCTGGGAAAAGAGGAGCGGCTTTTGGAGTGATGGAAGCCCTGGGAACACTGCCGGGACACATGACAGAGAGATTCCAGGAAGCCATTGACAACCTCGCTGTGCTCATGCGGGCAGAGACTGGAAGCAGACCTTACAAAGCCGCGGCGGCCCAATTGCCGGAGACCCTAGAGACCATTATGCTTTTGGGGTTGCTGGGAACAGTCTCGCTGGGAATCTTTTTCGTCTTGATGCGGAACAAGGGCATAGGGAAGATGGGCTTTGGAATGGTGACTCTTGGGGCCAGCGCATGGCTCATGTGGCTCTCGGAAATTGAGCCAGCCAGAATTGCATGTGTCCTCATTGTTGTGTTCCTATTGCTGGTGGTGCTCATACCTGAGCCAGAAAAGCAAAGATCTCCCCAGGACAACCAAATGGCAATCATCATCATGGTAGCAGTAGGTCTTCTGGGCTTGATTACCGCCAATGAACTCGGATGGTTGGAGAGAACAAAGAGTGACCTAAGCCATCTAATGGGAAGGAGAGAGGAGGGGGCAACCATAGGATTCTCAATGGACATTGACCTGCGGCCAGCCTCAGCTTGGGCCATCTATGCTGCCCTGACAACTTTCATTACCCCAGCCGTCCAACATGCAGTGACCACTTCATACAACAACTACTCCTTAATGGCGATGGCCACGCAAGCTGGAGTGTTGTTTGGTATGGGCAAAGGGATGCCATTCTACGCATGGGACTTTGGAGTCCCGCTGCTAATGATAGGTTGCTACTCACAATTAACACCCCTGACCCTAATAGTGGCCATCATTTTGCTCGTGGCGCACTACATGTACTTGATCCCAGGGCTGCAGGCAGCAGCTGCGCGTGCTGCCCAGAAGAGAACGGCAGCTGGCATCATGAAGAACCCTGTTGTGGATGGAATAGTGGTGACTGACATTGACACAATGACAATTGACCCCCAAGTGGAGAAAAAGATGGGACAGGTGCTACTCATAGCAGTAGCCGTCTCCAGCGCCATACTGTCGCGGACCGCCTGGGGGTGGGGGGAGGCTGGGGCCCTGATCACAGCTGCAACTTCCACTTTGTGGGAAGGCTCTCCGAACAAGTACTGGAACTCCTCTACAGCCACTTCACTGTGTAACATTTTTAGGGGAAGTTACTTGGCTGGAGCTTCTCTAATCTACACAGTAACAAGAAACGCTGGTTTGGTCAAGAGACGTGGGGGTGGAACAGGAGAGACCCTGGGAGAGAAATGGAAGGCCCGCTTGAACCAGATGTCGGCCCTGGAGTTCTACTCCTACAAAAAGTCAGGCATCACCGAGGTGTGCAGAGAAGAGGCCCGCCGCGCCCTCAAGGACGGTGTGGCAACGGGAGGCCATGCTGTGTCCCGAGGAAGTGCAAAGCTGAGATGGTTGGTGGAGCGGGGATACCTGCAGCCCTATGGAAAGGTCATTGATCTTGGATGTGGCAGAGGGGGCTGGAGTTACTACGCCGCCACCATCCGCAAAGTTCAAGAAGTGAAAGGATACACAAAAGGAGGCCCTGGTCATGAAGAACCCATG
TTGGTGCAAAGCTATGGGTGGAACATAGTCCGTCTTAAGAGTGGGGTGGACGTCTTTCATATGGCGGCTGAGCCGTGTGACACGTTGCTGTGTGACATAGGTGAGTCATCATCTAGTCCTGAAGTGGAAGAAGCACGGACGCTCAGAGTCCTCTCCATGGTGGGGGATTGGCTTGAAAAAAGACCAGGAGCCTTTTGTATAAAAGTGTTGTGCCCATACACCAGCACTATGATGGAAACCCTGGAGCGACTGCAGCGTAGGTATGGGGGAGGACTGGTCAGAGTGCCACTCTCCCGCAACTCAACACATGAGATGTACTGGGTCTCTGGAGCGAAAAGCAACACCATAAAAAGTGTGTCCACCACGAGCCAGCTCCTCTTGGGGCGCATGGACGGGCCCAGGAGGCCAGTGAAATATGAGGAGGATGTGAATCTCGGCTCTGGCACGCGGGCTGTGGTAAGCTGCGCTGAAGCTCCCAACATGAAGATCATTGGTAACCGCATTGAAAGGATCCGCAGTGAGCACGCGGAAACGTGGTTCTTTGACGAGAACCACCCATATAGGACATGGGCTTACCATGGAAGCTATGAGGCCCCCACACAAGGGTCAGCGTCCTCTCTAATAAACGGGGTTGTCAGGCTCCTGTCAAAACCCTGGGATGTGGTGACTGGAGTCACAGGAATAGCCATGACCGACACCACACCGTATGGTCAGCAAAGAGTTTTCAAGGAAAAAGTGGACACTAGGGTGCCAGACCCCCAAGAAGGCACTCGTCAGGTTATGAGCATGGTCTCTTCCTGGTTGTGGAAAGAGCTAGGCAAACACAAACGGCCACGAGTCTGTACCAAAGAAGAGTTCATCAACAAGGTTCGTAGCAATGCAGCATTAGGGGCAATATTTGAAGAGGAAAAAGAGTGGAAGACTGCAGTGGAAGCTGTGAACGATCCAAGGTTCTGGGCTCTAGTGGACAAGGAAAGAGAGCACCACCTGAGAGGAGAGTGCCAGAGCTGTGTGTACAACATGATGGGAAAAAGAGAAAAGAAACAAGGGGAATTTGGAAAGGCCAAGGGCAGCCGCGCCATCTGGTATATGTGGCTAGGGGCTAGATTTCTAGAGTTCGAAGCCCTTGGATTCTTGAACGAGGATCACTGGATGGGGAGAGAGAACTCAGGAGGTGGTGTTGAAGGGCTGGGATTACAAAGACTCGGATATGTCCTAGAAGAGATGAGTCGCATACCAGGAGGAAGGATGTATGCAGATGACACTGCTGGCTGGGACACCCGCATCAGCAGGTTTGATCTGGAGAATGAAGCTCTAATCACCAACCAAATGGAGAAAGGGCACAGGGCCTTGGCATTGGCCATAATCAAGTACACATACCAAAACAAAGTGGTAAAGGTCCTTAGACCAGCTGAAAAAGGGAAGACAGTGATGGACATTATTTCAAGACAAGACCAAAGGGGGAGCGGACAAGTTGTCACTTACGCTCTCAACACATTTACCAACCTAGTGGTGCAACTCATTCGGAATATGGAGGCTGAGGAAGTTCTAGAGATGCAAGACTTGTGGCTGCTGCGGAGGTCAGAGAAAGTGACCAACTGGTTGCAGAGCAACGGATGGGATAGGCTCAAACGAATGGCAGTCAGTGGAGATGATTGCGTTGTGAAGCCAATTGATGATAGGTTTGCACATGCCCTCAGGTTCTTGAATGATATGGGGAAAGTTAGGAAGGACACACAAGAATGGAAACCCTCAACTGGATGGGACAACTGGGAAGAAGTTCCGTTTTGCTCCCACCATTTCAACAAGCTCCATCTCAAGGACGGGAGGTCCATTGTGGTTCCCTGCCGCCACCAAGATGAACTGATTGGCCGGGCCCGCGTCTCTCCAGGGGCGGGATGGAGCATCCGGGAGACTGCTTGCCTAGCAAAATCATATGCGCAGATGTGGCAGCTCCTTTATTTCCACAGAAGGGACCTCCGACTGATGGCCAATGCCAT
TTGTTCATCTGTGCCAGTTGACTGGGTTCCAACTGGGAGAACTACCTGGTCAATCCATGGAAAGGGAGAATGGATGACCACTGAAGACATGCTTGTGGTGTGGAACAGAGTGTGGATTGAGGAGAACGACCACATGGAAGACAAGACCCCAGTTACGAAATGGACAGACATTCCCTATTTGGGAAAAAGGGAAGACTTATGGTGTGGATCTCTCATAGGGCACAGACCGCGCACCACCTGGGCTGAGAACATTAAAAACACAGTCAACATGGTGCGCAGAATCATAGGTGATGAAGAAAAGTACATGGACTACCTATCCACCCAAGTTCGCTACTTGGGTGAAGAAGGGTCTACACCTGGAGTGCTGTAAGCACCAATCTTAGTGTTGTCAGGCCTGCTAGTCAGCCACAGCTTGGGGAAAGCTGTGCAGCCTGTGACCCCCCCAGGAGAAGCTGGGAAACCAAGCCCATAGTCAGGCCGAGAACGCCATGGCACGGAAGAAGCCATGCTGCCTGTGAGCCCCTCAGAGGACACTGAGTCAAAAAACCCCACGCGCTTGGAGGCGCAGGATGGGAAAAGAAGGTGGCGACCTTCCCCACCCTTCAATCTGGGGCCTGAACTGGAGATCAGCTGTGGATCTCCAGAAGAGGGACTAGTGGTTAGAGGAGACCCCCCGGAAAACGCAAAACAGCATATTGACGCTGGGAAAGACCAGAGACTCCATGAGTTTCCACCACGCTGGCCGCCAGGCACAGATCGCCGAATAGCGGCGGC"
# Build the single-row feature frame for the query sequence `dna`.
# Requires `tfidf` to be bound already (feature_extraction_pipeline reads it as a global).
df = feature_extraction_pipeline(dna)
df.head()

In [None]:
# Rebinds `df` to the scaler's output, replacing the unscaled feature frame.
# NOTE(review): if transform returns a NumPy array (scikit-learn's default), later
# cells that call df.head() or select columns by name need the feature-extraction
# cell re-run first - confirm the intended execution order.
df = scaler.transform(df)

In [None]:
# Class probabilities for the scaled query row from the bundled classifier.
# NOTE(review): column order presumably follows the bundle's 'Class Mapping' - confirm.
model['Best Model'].predict_proba(df)

In [None]:
pip install biopython

In [None]:
# `plt` must be the pyplot interface; `import matplotlib as plt` bound the
# top-level package, which has no figure()/show().
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO


def _read_fasta_sequences(path):
    """Return the sequence of every record in the FASTA file at ``path`` as a ``str``."""
    with open(path, "r") as fasta_file:
        return [str(seq_record.seq) for seq_record in SeqIO.parse(fasta_file, "fasta")]


def _numeric_feature_row(seq, label):
    """Build one ``train_df`` row: the six numeric features of ``seq`` followed by ``label``."""
    seq = seq.replace('\n', '')
    gc = gc_content(seq)
    at = at_content(seq)
    mw = molecular_weight(seq)
    hydrophobic_score, hydrophilic_score = hydrophobicity(seq)
    s_entropy = sequence_entropy(seq)
    return [gc, at, mw, hydrophobic_score, hydrophilic_score, s_entropy, label]


# One list of raw sequences per virus; the names are reused by later cells.
zika_sequences = _read_fasta_sequences("zika_sequences.fasta")
chikungunya_sequences = _read_fasta_sequences("chikungunya_sequences.fasta")
influenza_a_sequences = _read_fasta_sequences("influenza_a_sequences.fasta")
influenza_b_sequences = _read_fasta_sequences("influenza_b_sequences.fasta")
influenza_c_sequences = _read_fasta_sequences("influenza_c_sequences.fasta")

# (label, sequences) pairs in the order the rows are appended to train_df.
labelled_sequences = [
    ("Zika", zika_sequences),
    ("Chikungunya", chikungunya_sequences),
    ("Influenza A", influenza_a_sequences),
    ("Influenza B", influenza_b_sequences),
    ("Influenza C", influenza_c_sequences),
]

# Collect all rows first and build the frame once; growing a DataFrame one
# .loc row at a time copies data on every insert. Building it in one go also
# lets pandas infer numeric dtypes for the feature columns.
train_rows = []
for virus_label, sequences in labelled_sequences:
    for seq in sequences:
        train_rows.append(_numeric_feature_row(seq, virus_label))

train_df = pd.DataFrame(
    train_rows,
    columns=['GC', 'AT', 'Molecular Wt', 'Hydrophobic Score', 'Hydrophilic Score', 'Sequence Entropy', 'Virus'],
)

train_df.sample(10)

In [None]:
df.head()

In [None]:
# Keep only the six hand-crafted numeric columns.
# Assumes `df` is still the DataFrame returned by feature_extraction_pipeline
# (i.e. not the scaler output) - verify cell execution order.
df = df[['GC', 'AT', 'Molecular Wt', 'Hydrophobic Score', 'Hydrophilic Score', 'Sequence Entropy']]
df.head()

In [None]:
# Tag the query row as 'Unknown' and append it to the labelled training rows.
# Work on a copy: a plain `df_ = df` alias would also add the 'Virus' column to
# `df` itself (and triggers pandas' SettingWithCopyWarning when `df` is a slice).
df_ = df.copy()
df_['Virus'] = 'Unknown'
# ignore_index avoids the query row reusing index label 0 of the training rows.
train_df = pd.concat([train_df, df_], ignore_index=True)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import combinations

# Numeric columns of 'train_df' that are plotted against each other
features = ['GC', 'AT', 'Molecular Wt', 'Hydrophobic Score', 'Hydrophilic Score', 'Sequence Entropy']

# Every unordered pair of feature names (consumed once by the loop below)
feature_combinations = combinations(features, 2)

# 'train_df' is not modified while plotting, so split it a single time:
# labelled viruses on one side, the 'Unknown' query rows on the other.
known_points = train_df[train_df['Virus'] != 'Unknown']
unknown_points = train_df[train_df['Virus'] == 'Unknown']

# One figure per feature pair
for feature1, feature2 in feature_combinations:
    plt.figure(figsize=(8, 6))  # Adjust size as needed

    # Labelled viruses, distinguished by both colour and marker shape
    sns.scatterplot(
        data=known_points,
        x=feature1,
        y=feature2,
        hue='Virus',
        style='Virus',
        palette='deep',
    )

    # Overlay the 'Unknown' rows as large red dots
    sns.scatterplot(
        data=unknown_points,
        x=feature1,
        y=feature2,
        color='red',
        marker='o',
        s=100,
        label='Unknown',
    )

    # Axis labels, title and legend
    plt.xlabel(feature1)
    plt.ylabel(feature2)
    plt.title(f'Scatterplot of {feature1} vs {feature2} for each Virus')
    plt.legend(loc='upper right')

    plt.show()


In [None]:
def compare_sequences(seq1, seq2):
    """List the positions at which ``seq1`` and ``seq2`` differ.

    Only the overlapping prefix (the shorter length) is compared. Each entry
    is ``(position, base_in_seq1, base_in_seq2)``.
    """
    return [
        (position, original, mutated_base)
        for position, (original, mutated_base) in enumerate(zip(seq1, seq2))
        if original != mutated_base
    ]

def is_frameshift(wild_type, mutated):
    """Return ``True`` when the length difference of the two sequences is not a multiple of 3."""
    length_delta = abs(len(wild_type) - len(mutated))
    return length_delta % 3 != 0

def find_hotspots(wild_type, mutated, window_size=10, threshold=0.5):
    """Find windows where the two sequences differ at a high rate.

    Slides a window of ``window_size`` over ``wild_type`` one position at a
    time, counts mismatching positions against the same slice of ``mutated``
    and divides by ``window_size``. Windows whose rate is at least
    ``threshold`` are returned as
    ``(start, wild_type_window, mutated_window, mutation_rate)`` tuples.
    """
    hotspots = []
    last_start = len(wild_type) - window_size
    for start in range(last_start + 1):
        stop = start + window_size
        window_wt = wild_type[start:stop]
        window_mut = mutated[start:stop]
        mismatches = sum(1 for wt_base, mut_base in zip(window_wt, window_mut) if wt_base != mut_base)
        mutation_rate = mismatches / window_size
        if mutation_rate < threshold:
            continue
        hotspots.append((start, window_wt, window_mut, mutation_rate))
    return hotspots

def classify_mutation(wild_type, mutated):
    """Label the difference between two sequences using only equality and length.

    Returns:
        ``"No mutation"`` when the sequences are equal,
        ``"Point Mutation (Missense)"`` when they differ but have equal length,
        ``"Deletion Mutation"`` when ``mutated`` is shorter, and
        ``"Insertion Mutation"`` when ``mutated`` is longer.

    These four cases are exhaustive, so the former trailing
    ``"Complex Mutation"`` return could never execute and has been removed.
    """
    if wild_type == mutated:
        return "No mutation"

    wild_len, mutated_len = len(wild_type), len(mutated)
    if wild_len == mutated_len:
        return "Point Mutation (Missense)"
    if wild_len > mutated_len:
        return "Deletion Mutation"
    return "Insertion Mutation"

def visualize_mutations(wild_type, mutated):
    """Plot, for every differing position, the wild-type and the mutated base.

    Positions come from ``compare_sequences``; wild-type bases are drawn as
    green dots and mutated bases as blue dots on a shared figure, which is
    then shown.
    """
    differences = compare_sequences(wild_type, mutated)
    positions = [position for position, _, _ in differences]
    wild_bases = [wild_base for _, wild_base, _ in differences]
    mutated_bases = [mutated_base for _, _, mutated_base in differences]

    plt.figure(figsize=(10, 2))
    plt.plot(positions, wild_bases, 'go', label="Wild Type")
    plt.plot(positions, mutated_bases, 'bo', label="Mutated")
    plt.xlabel("Position")
    plt.ylabel("Base")
    plt.title("Mutations in DNA Sequence")
    plt.legend()
    plt.show()


In [None]:
# Compare the query sequence `dna` against the third Zika reference sequence.
wild_type = zika_sequences[2]
mutated = dna
# Position-by-position differences over the overlapping prefix.
mutations = compare_sequences(wild_type, mutated)
print(mutations)
# True when the length difference is not a multiple of 3.
print(is_frameshift(wild_type, mutated))
# Windows (default size 10) in which at least half of the positions differ.
hotspots = find_hotspots(wild_type, mutated)
print(hotspots)
# Coarse label based on equality and relative length only.
print(classify_mutation(wild_type, mutated))
visualize_mutations(wild_type, mutated)

In [None]:
pip install biopython ViennaRNA matplotlib requests py3Dmol biotite

In [None]:
# Transcribe the query DNA to RNA and translate it to a protein sequence with Biopython.
from Bio.Seq import Seq
dna_seq = Seq(dna)
rna = dna_seq.transcribe()
# NOTE(review): translation starts at the first base of `dna`; confirm that is the intended reading frame.
protein = rna.translate()
print(protein)
def clean_sequence(seq, valid_chars):
    """Return ``seq`` as a string with every character not in ``valid_chars`` removed."""
    kept = []
    for char in seq:
        if char in valid_chars:
            kept.append(char)
    return ''.join(kept)
# One-letter codes that are kept; every other symbol in the translated
# sequence is removed, and `protein` becomes a plain str.
valid_amino_acids = "ARNDCEQGHILKMFPSTWYV"
protein = clean_sequence(protein, valid_amino_acids)
print(protein)

In [None]:
import py3Dmol
import requests
import biotite.structure.io as bsio

# Function to render the protein structure using py3Dmol
def render_mol(pdb):
    """Create an 800x500 py3Dmol view of the PDB text ``pdb`` and return it.

    The model is drawn as a spectrum-coloured cartoon on a white background,
    zoomed to fit, with spinning switched on. The view is returned, not shown.
    """
    cartoon_style = {'cartoon': {'color': 'spectrum'}}

    viewer = py3Dmol.view(width=800, height=500)
    viewer.addModel(pdb, 'pdb')
    viewer.setStyle(cartoon_style)
    viewer.setBackgroundColor('white')
    viewer.zoomTo()
    viewer.spin(True)
    return viewer

# Default protein sequence: the cleaned translation of the query DNA
DEFAULT_SEQ = protein
# DEFAULT_SEQ = "MGSSHHHHHHSSGLVPRGSHMRGPNPTAASLEASAGPFTVRSFTVSRPSGYGAGTVYYPTNAGGTVGAIAIVPGYTARQSSIKWWGPRLASHGFVVITIDTNSTLDQPSSRSSQQMAALRQVASLNGTSSSPIYGKVDTARMGVMGWSMGGGGSLISAANNPSLKAAAPQAPWDSSTNFSSVTVPTLIFACENDSIAPVNSSALPIYDSMSRNAKQFLEINGGSHSCANSGNSNQALIGKKGVAWMKRFMDNDTRYSTFACENPNSTRVSDFRTANCSLEDPAANKARKEAELAAATAEQ"
# Protein sequence input; blank (or whitespace-only) input falls back to the default
sequence = input("Enter your protein sequence (or press Enter for default): ").strip() or DEFAULT_SEQ

# Function to predict protein structure using ESMFold
def predict_structure(sequence, window_size=400, step=200, timeout=120):
    """Fold overlapping windows of ``sequence`` via the ESM Atlas API and keep the best one.

    Each window is POSTed to the ESMFold endpoint. The returned PDB text is
    scored by its mean B-factor, which this notebook reports as the plDDT
    confidence score. The highest-scoring structure is written to
    ``best_predicted.pdb`` and rendered with ``render_mol``.

    Args:
        sequence: Protein sequence (one-letter codes).
        window_size: Number of residues sent per request (default 400).
        step: Offset between the starts of consecutive windows (default 200).
        timeout: Seconds to wait for each HTTP request before giving up on
            that window (default 120).

    Returns:
        The py3Dmol view of the best structure, or ``None`` when the sequence
        is shorter than ``window_size`` or no window produced a structure.

    Raises:
        ValueError: If ``window_size`` or ``step`` is not positive.
    """
    if window_size <= 0 or step <= 0:
        raise ValueError("window_size and step must be positive integers")

    # A full window is required for even a single request.
    if len(sequence) < window_size:
        print(f"Sequence is too short. Please provide a sequence of at least {window_size} characters.")
        return None

    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    best_b_value = -1  # Lower than any mean B-factor a valid structure yields
    best_pdb_string = None

    # Overlapping sliding windows; a tail shorter than window_size is not folded.
    for i in range(0, len(sequence) - window_size + 1, step):
        subsequence = sequence[i:i + window_size]

        # Without a timeout a stalled connection would block forever, and an
        # uncaught network error would discard results from earlier windows.
        try:
            response = requests.post(
                'https://api.esmatlas.com/foldSequence/v1/pdb/',
                headers=headers,
                data=subsequence,
                timeout=timeout,
            )
        except requests.RequestException as exc:
            print(f"Request failed for subsequence starting at index {i}: {exc}. Skipping.")
            continue

        if response.status_code != 200:
            print(f"Error fetching structure for subsequence starting at index {i}. Skipping.")
            continue

        pdb_string = response.content.decode('utf-8')

        # The structure loader reads from a path, so write the PDB to disk first.
        with open('temp_predicted.pdb', 'w') as f:
            f.write(pdb_string)

        # Mean B-factor over all atoms is used as the window's score.
        struct = bsio.load_structure('temp_predicted.pdb', extra_fields=["b_factor"])
        b_value = round(struct.b_factor.mean(), 4)

        # Keep the highest-scoring window seen so far.
        if b_value > best_b_value:
            best_b_value = b_value
            best_pdb_string = pdb_string

    if not best_pdb_string:
        print("No valid structure found.")
        return None

    # Persist and render the winning structure.
    with open('best_predicted.pdb', 'w') as f:
        f.write(best_pdb_string)

    view = render_mol(best_pdb_string)

    print("\n--- Best Predicted Protein Structure ---")
    print(f"plDDT (confidence score): {best_b_value}")
    print("Best protein structure saved as 'best_predicted.pdb'.")
    return view

# Predict and visualize structure.
# `view` is None when the sequence is too short or no window returned a structure.
view = predict_structure(sequence)
# if view:
#     view.show()


In [None]:
print(sequence[400:])

In [None]:
pip show joblib

Name: joblib
Version: 1.4.2
Summary: Lightweight pipelining with Python functions
Home-page: https://joblib.readthedocs.io
Author: 
Author-email: Gael Varoquaux <gael.varoquaux@normalesup.org>
License: BSD 3-Clause
Location: /usr/local/lib/python3.10/dist-packages
Requires: 
Required-by: imbalanced-learn, librosa, mlxtend, music21, nltk, scikit-learn


In [None]:
pip install dill

Collecting dill
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Downloading dill-0.3.9-py3-none-any.whl (119 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/119.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━[0m [32m81.9/119.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.4/119.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dill
Successfully installed dill-0.3.9


In [None]:
import dill

# Load the model from the dill file
# NOTE(review): dill.load executes pickled code; only open files from a trusted source.
with open('Viral_Gene_Prediction_Model_dill.pkl', 'rb') as file:
    dill_model = dill.load(file)

print("Model loaded using dill!")


Model loaded using dill!


In [None]:
import pandas as pd
import numpy as np
import joblib
import dill
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Extract the three fitted components from the bundle and save them as a
# plain list in the order [vectorizer, scaler, classifier].
tfidf = dill_model['TF-IDF Vectorizer']
scaler = dill_model['Standard Scaler']
best_model = dill_model['Best Model']
model_list = [tfidf, scaler, best_model]
# Consumers of Model_1.pkl must index the list positionally (0, 1, 2).
with open('Model_1.pkl', 'wb') as file:
    dill.dump(model_list, file)

In [None]:
dill_model

{'TF-IDF Vectorizer': TfidfVectorizer(),
 'Standard Scaler': StandardScaler(),
 'GC-content': <function __main__.gc_content(sequence)>,
 'AT-content': <function __main__.at_content(sequence)>,
 'Molecular Weight': <function __main__.molecular_weight(sequence)>,
 'Hydrophobic Score': <function __main__.hydrophobicity(sequence)>,
 'Sequence Entropy': <function __main__.sequence_entropy(seq)>,
 'K-mers Contribution': <function __main__.Kmers_contribution(seq, size=6)>,
 'Feature Extraction Pipeline': <function __main__.feature_extraction_pipeline(seq)>,
 'Transform Features': <function __main__.transform_features(seq)>,
 'Class Mapping': {'Chikungunya': 0,
  'Influenza A': 1,
  'Influenza B': 2,
  'Influenza C': 3,
  'Zika': 4},
 'Best Model': GradientBoostingClassifier()}

In [None]:
tfidf