# Calculate siRNA for Gene Sequences

This script takes gene sequences and calculates all possible siRNA interactions. An siRNA interaction site is defined as a 22-nt sequence of $$AWWWW-D--U------A-----$$ where $$D=[A,G,U],W=[A,U]$$ and - denotes a complement. 


The corresponding siRNA is defined as $$UWWWW-H--A------U-----$$ where $$H=[A,C,U]$$

In [1]:
import pandas as pd

In [3]:
def clean_data(filename):
    # Read in the file
    with open(filename+".txt", 'r') as file :
        filedata = file.readlines()

    outdata = ""
    for i in filedata:
        if '>' in i:
            outdata+="\n"+i
        else:
            outdata+=i.strip()
        
    # Write the file out again
    with open(filename+".txt", 'w') as file:
        file.write(outdata)
        
    # Read in the file
    with open(filename+".txt", 'r') as file :
        filedata = file.read()

    # Replace the target string
    filedata = filedata.replace('Sequence unavailable', 'Sequence_unavailable')

    # Write the file out again
    with open(filename+".txt", 'w') as file:
        file.write(filedata)
        file.write(filedata)
    

In [3]:
clean_data("Data/mart_export")

In [20]:
def approximate_seq(aprx_level, seq):
    #print(seq+" = "+str(seq.count('C')+seq.count('G')<=aprx_level))
    return seq.count('C')+seq.count('G')<=aprx_level
    

In [30]:

def is_site(sequence, site_type):
    aprx_level = 0
    match site_type:
        case "perfect-match":
            aprx_level = 0
        case "approximate-match1":
            aprx_level = 1
        case "approximate-match2":
            aprx_level = 2
        case "approximate-match3":
            aprx_level = 3
        case "approximate-match4":
            aprx_level = 4
        case _:
            print("Error: site_type = None")
    #takes 22-nt sequence and returns if the sequence is an exact site (AWWWW)
    if (sequence[0]=='A') and (approximate_seq(aprx_level,sequence[1:5])) and (sequence[6]=='A' or sequence[6]=='G' or sequence[6]=='T') and (sequence[9]=='T') and (sequence[16]=='A'):
        return True
    else:
        return False
    

In [31]:
def generate_siRNA(site):
    siRNA = ""
    for char in site:
        match char:
            case 'A':
                siRNA = siRNA + 'U'
            case 'T':
                siRNA = siRNA + 'A'
            case 'G':
                siRNA = siRNA + 'C'
            case 'C':
                siRNA = siRNA + 'G'
            case 'U':
                siRNA = siRNA + 'A'
    return siRNA[::-1]

In [39]:
def calc_siRNA(filename, aprx_level):
    gene_ids = []
    gene_sequences = []
    # Read the data from the text file
    with open(filename+'.txt', 'r') as file:
        lines = file.readlines()
        for i in range(0, len(lines), 2):  
            gene_id = lines[i].strip()
            sequence = lines[i+1].strip()  
            if sequence != 'Sequence_unavailable':
                gene_sequences.append(sequence)
                gene_ids.append(gene_id)
    # Create a DataFrame from the lists
    df = pd.DataFrame({'Gene ID': gene_ids, 'Gene Sequence': gene_sequences, 'siRNA Interaction Sites':None, 'Possible siRNA':None})
    #for each gene sequence, list siRNA sites
    for ind in df.index:
        sequence = df.loc[ind, "Gene Sequence"]
        siRNA_sites = []
        siRNAs = []
        #populate siRNA_sites with possible sites
        for i in range(0,len(sequence)-22):
            site = sequence[i:i+22]
            if(is_site(site, aprx_level)):
                siRNA_sites.append(site)
                siRNAs.append(generate_siRNA(site))
        df.loc[ind, "siRNA Interaction Sites"] = siRNA_sites
        df.loc[ind, "Possible siRNA"] = siRNAs
    df.to_csv("Data/output_"+aprx_level+".csv")
    p = sum(df.loc[ind, "siRNA Interaction Sites"]==[] for ind in df.index)/len(df.index)
    print(aprx_level +" : "+str(p))

In [44]:
calc_siRNA("Data/mart_export", "approximate-match4")

approximate-match4 : 0.011864309163163047
