## Retrieve the promoter sequences ##
#### IFNB1, IFIT1, MX1, TLR7 

In [24]:
import requests
import sys

# Set the server database to retrieve
server = "https://rest.ensembl.org"
headers = {"Accept": "application/json"}

# Set the Gene_names and ids
gene_ids = {
    "IFNB1": "ENSG00000171855",
    "IFIT1": "ENSG00000185745",
    "MX1": "ENSG00000157601",
    "TLR7": "ENSG00000196664"
}
# write a fasta file for promoter sequences
with open("Promoter_sequences.fasta", "w") as fasta_file:
    for gene_name, gene_id in gene_ids.items():
        # Set the endpoint to get 5' end expanded sequences
        endpoint = f"/sequence/id/{gene_id}?expand_5prime=3000"
        url = server + endpoint

        r = requests.get(url, headers=headers)

        if not r.ok:
            print(f"Error while retrieve {gene_name}: {r.status_code}")
            continue
        
        data = r.json() # Store extract data in json format
        sequence = data.get('seq', '') # extract the sequence from data in json format
        #print(len(sequence))
        trimmedsequence = sequence[:3000]
        #print(len(trimmedsequence))
        # Write the fasta file row
        fasta_file.write(f'>{gene_name}_{gene_id}\n{trimmedsequence}\n')
        
        # Print to check
        print(f">{gene_name}_{gene_id}\n{trimmedsequence}\n")



>IFNB1_ENSG00000171855
GTAGTTTCTTTTTTCTTTGAGAGCCATCATCACCATCATGGTTGACACCATGAACCTATCTGAAGATGTCAGCCATAGACTGCTTGATATTCTACAGGAAAGATCACAGTTTTAAGTGCAATCTACCCATGTTATTAGCAGTGTGTATCTTTCACACATTACACAGCCTCTCTAAGCCTCATTTCTCTCCTCTGTAAGATGGGGATGATAATAACCCATCTCAAATGTTTACTATGAGGATTATTCAAAGAATGGCAAATAGCAAGTGCTTAATAAATGATAACTAGTACTACCGCCACTACTGTTGTTTTTATTGTATTAGATTATGAACTCTCTAAGGACCATTTCCGGATGGAGGATAAGAGACCATTTGATGTGGGCAGTGATGAGGCCTTCTGTTGCACCTGGAAAGGTCAACTATATACAAGCCTGCAAGTCATTCTATAGGAGCAGGCCCCAGTGACCAGACTCTATAGACTGTCTCCTCTTTCCTGAGAGGGACAGCCATCTCTAGGTTGACTAACCTCTGAAGCTCCTTGCATTGGCTTTTGTGCTATGAGCCATGGATGATTCCAGACTAATCCGAGAATGCTCGTCAAAACCCCAAGGAATTACTCAAATACTGACATAACAGACATTTTTGAGTGGAAGAGCCGAGTTTTTTTTAATATTCTGAAACTCATTGTTTTTAAAATGCATGAGATGGCCAAGGTCTTGCTAAGAGCTGGCCTGCAAAGGCCAAAAGGCCAGAGAGAATGAAACCCATAGAGAGGCAGAATAACCAGAAAGGTTGGGACTCGTTTATTTTATAATGTAAATTAGTCTATTATGAAACAATACTTGTTTACTGGTGGAAAATTGGAAAATACAAAGAATAAAAGGAGGAAAAAAATCACTCTTTAGTTTCACAAGCCAAATCAAGCCACTATTAAAATGGTGGTTTACTTCCTTTTATTAATTTTCTCTACATATTTTTG

## Generate the csv files for matches (motif)
#### Use to find the exact match strategy in promoter regioin

In [79]:
from Bio import SeqIO
from Bio.Seq import Seq
import re
import csv

# Set the motif pattern
motif_sequence = "GAAACCGAAA"
reverse_motif_sequence = str(Seq(motif_sequence).reverse_complement())
forward_pattern = re.compile(r"GAAACCGAAA")
reverse_pattern = re.compile(r'TTTCGGTTTC')

# Empty list to store the match results
results = []

# Parse the promoter_sequences fasta file
# Extract the gene information, sequence then store at variable
for record in SeqIO.parse("Promoter_sequences.fasta", "fasta"):
    gene_info = record.id
    sequence = str(record.seq)
    rev_sequence = str(Seq(sequence).reverse_complement())

    # Split the gene info into gene_name and gene_id
    if "_" in gene_info:
        gene_name, gene_id = gene_info.split("_", 1)
    else:
        gene_name, gene_id = gene_info, "NA"

    # Find the matches in forward strand
    forward_match = list(forward_pattern.finditer(sequence)) # make a list to count the match numbers
    forward_position = [] # prepare the empty list to add match positions
    for match in forward_match:
        start = match.start()
        end = match.end()
        forward_position.append(f'+[{start}-{end}]')

    # Find the matches in reverse strand
    reverse_match = list(reverse_pattern.finditer(rev_sequence)) # make a list to count the match numbers
    reverse_position = [] # prepare the empty list to add match positions
    for match in reverse_match:
        start = match.start()
        end = match.end()
        reverse_position.append(f'-[{start}-{end}]')

    # Count total hits
    total_hits = len(forward_match) + len(reverse_match)
    all_positions = forward_position + reverse_position
    all_positions = " | ".join(all_positions)
    
    print(gene_name)
    print(total_hits)
    print(all_positions)

    # Add colum data into results list
    if total_hits > 0:
        results.append([
            gene_name, 
            gene_id, 
            motif_sequence, 
            total_hits, 
            all_positions, 
            sequence])
        
# Generate the csv file for hits
with open("motif_scan_results.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow([
        "Gene_name",
        "Gene_ID",
        "Motif_Sequence",
        "Number_of_Hits",
        "Position_of_Hits",
        "Promoter_Sequence"
    ])
    writer.writerows(results)

IFNB1
0

IFIT1
0

MX1
0

TLR7
0



## Search with regex
#### Not exactly match

In [80]:
from Bio import SeqIO
from Bio.Seq import Seq
import re
import csv

# Set the motif pattern
motif_sequence = "GAAACCGAAA"
reverse_motif_sequence = str(Seq(motif_sequence).reverse_complement())
forward_pattern = re.compile(r"GAAA[ATGC]{2}GAAA")
reverse_pattern = re.compile(r"TTTC[ATGC]{2}TTTC")

# Empty list to store the match results
results = []

# Parse the promoter_sequences fasta file
# Extract the gene information, sequence then store at variable
for record in SeqIO.parse("Promoter_sequences.fasta", "fasta"):
    gene_info = record.id
    sequence = str(record.seq)
    rev_sequence = str(Seq(sequence).reverse_complement())

    # Split the gene info into gene_name and gene_id
    if "_" in gene_info:
        gene_name, gene_id = gene_info.split("_", 1)
    else:
        gene_name, gene_id = gene_info, "NA"

    # Find the matches in forward strand
    forward_match = list(forward_pattern.finditer(sequence)) # make a list to count the match numbers
    forward_position = [] # prepare the empty list to add match positions
    for match in forward_match:
        start = match.start()
        end = match.end()
        forward_position.append(f'+[{start}-{end}]')

    # Find the matches in reverse strand
    reverse_match = list(reverse_pattern.finditer(rev_sequence)) # make a list to count the match numbers
    reverse_position = [] # prepare the empty list to add match positions
    for match in reverse_match:
        start = match.start()
        end = match.end()
        reverse_position.append(f'-[{start}-{end}]')

    # Count total hits
    total_hits = len(forward_match) + len(reverse_match)
    all_positions = forward_position + reverse_position
    all_positions = " | ".join(all_positions)
    
    print(gene_name)
    print(total_hits)
    print(all_positions)

    # Add colum data into results list
    if total_hits > 0:
        results.append([
            gene_name, 
            gene_id, 
            motif_sequence, 
            total_hits, 
            all_positions, 
            sequence])
        
# Generate the csv file for hits
with open("regex_motif_scan_results.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow([
        "Gene_name",
        "Gene_ID",
        "Motif_Sequence",
        "Number_of_Hits",
        "Position_of_Hits",
        "Promoter_Sequence"
    ])
    writer.writerows(results)
    
    

    


IFNB1
0

IFIT1
2
+[2697-2707] | -[293-303]
MX1
2
+[1920-1930] | -[1070-1080]
TLR7
0



## Find the matches in forward strand
#### Using two type of pattern - regex
#### forward pattern 'GAAANNGAAA'
#### reverse pattern 'TTTCNNTTTC'

In [None]:
from Bio import SeqIO
from Bio.Seq import Seq
import re
import csv

# Set the motif pattern
motif_sequence = "forward: GAAANNGAAA / reverse: TTTCNNTTTC"
forward_pattern = re.compile(r"GAAA[ATGC]{2}GAAA")
reverse_pattern = re.compile(r"TTTC[ATGC]{2}TTTC")

# Empty list to store the match results
results = []

# Parse the promoter_sequences fasta file
# Extract the gene information, sequence then store at variable
for record in SeqIO.parse("Promoter_sequences.fasta", "fasta"):
    gene_info = record.id
    sequence = str(record.seq)

    # Split the gene info into gene_name and gene_id
    if "_" in gene_info:
        gene_name, gene_id = gene_info.split("_", 1)
    else:
        gene_name, gene_id = gene_info, "NA"

    # Find the matches in forward strand
    forward_match = list(forward_pattern.finditer(sequence)) # make a list to count the match numbers
    forward_position = [] # prepare the empty list to add match positions
    for match in forward_match:
        start = match.start()
        end = match.end()
        forward_position.append(f'+[{start}-{end}]')

    # Find the matches in reverse strand
    reverse_match = list(reverse_pattern.finditer(sequence)) # make a list to count the match numbers
    reverse_position = [] # prepare the empty list to add match positions
    for match in reverse_match:
        start = match.start()
        end = match.end()
        reverse_position.append(f'-[{start}-{end}]')

    # Count total hits
    total_hits = len(forward_match) + len(reverse_match)
    all_positions = forward_position + reverse_position
    all_positions = " | ".join(all_positions)

    # Add colum data into results list
    if total_hits > 0:
        results.append([
            gene_name, 
            gene_id, 
            motif_sequence, 
            total_hits, 
            all_positions, 
            sequence])

# Generate the csv file for hits
with open("regex_motif_scan_forward_results.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow([
        "Gene_name",
        "Gene_ID",
        "Motif_Sequence",
        "Number_of_Hits",
        "Position_of_Hits",
        "Promoter_Sequence"
    ])
    writer.writerows(results)
    

    


[['IFIT1', 'ENSG00000185745', 'forward: GAAANNGAAA / reverse: TTTCNNTTTC', 4, '+[2697-2707] | -[1008-1018] | -[2961-2971] | -[2974-2984]', 'GCTGATAAGAGATTTTAGACTACAAGAGATGACCCTGGAAAAGACAGGGGAAATTTCTATGGAGCCCATTCCTTAATAAGTTGTGATTGGATATTTGAGACCCTTCAGCAGAAATGGCTTAGCATTTGTCCATATTATCATATTATAATTTTTAAAAAATACACTCAGAATAACATTCTTTTTCCATATCATCCTGTGAATGAAGAAGAGACTAAGGAATTCATTACCCATTGTGTAGGTGAGGAAACAAGGCATTGAGTCGTAAAATACTGAATCTGGGCCTCACAGAACAGGAAGTGGGTCCAGGGATTGAGTCCCATATCTCTCAGTCAGAGAATTTCAGACATACAATAGACATTAGGGATAATTGAGGCCCTGGTTCCCGGCTCAGGGCGCAGGTCTACATCAAAATCACCTGAAGATTTCTTTCTTTTTTTTTTTTTTTTTTTTTTTGAGATGGAGTTTCACTCTTGTTGCCCAGGCTGGAGTGCAATGAAATGGCACGATCTCGGCTCACCACAACCTCCACCTCCCGGGTTCAAGCGATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGATTACAGGCATGTGCCACCACGCCCGGCTAATTATGTATTTTTAGTAGAGACGGGGTTTCTCCATGTTAGTCAGGCTGGTCTCGAACTCCCAACCTCAGGTGATCCGCCCGTCTTGTCCTCCCAAAATGCTGGGATTACAGGCGTGAGCCACTGTGCCCAGGCTGAAGATTTCTTTTTAATACTGAGCCAGAGGCCTGTACCTAAGTATATGTTCAATTGTTCTGGAATGGGCATAGTTAGCATTATTTTTGAGATTTACCTGGCTGATTCTAATGTGCAGTG

## Find the matches in forward strand
#### Using two type of pattern - exact match
#### forward pattern 'GAAACCGAAA'
#### reverse pattern 'TTTCGGTTTC'

In [3]:
from Bio import SeqIO
from Bio.Seq import Seq
import re
import csv

# Set the motif pattern
motif_sequence = "forward: GAAACCGAAA / reverse: TTTCGGTTTC"
forward_pattern = re.compile(r"GAAACCGAAA")
reverse_pattern = re.compile(r"TTTCGGTTTC")

# Empty list to store the match results
results = []

# Parse the promoter_sequences fasta file
# Extract the gene information, sequence then store at variable
for record in SeqIO.parse("Promoter_sequences.fasta", "fasta"):
    gene_info = record.id
    sequence = str(record.seq)

    # Split the gene info into gene_name and gene_id
    if "_" in gene_info:
        gene_name, gene_id = gene_info.split("_", 1)
    else:
        gene_name, gene_id = gene_info, "NA"

    # Find the matches in forward strand
    forward_match = list(forward_pattern.finditer(sequence)) # make a list to count the match numbers
    forward_position = [] # prepare the empty list to add match positions
    for match in forward_match:
        start = match.start()
        end = match.end()
        forward_position.append(f'+[{start}-{end}]')

    # Find the matches in reverse strand
    reverse_match = list(reverse_pattern.finditer(sequence)) # make a list to count the match numbers
    reverse_position = [] # prepare the empty list to add match positions
    for match in reverse_match:
        start = match.start()
        end = match.end()
        reverse_position.append(f'-[{start}-{end}]')

    # Count total hits
    total_hits = len(forward_match) + len(reverse_match)
    all_positions = forward_position + reverse_position
    all_positions = " | ".join(all_positions)

    # Add colum data into results list
    if total_hits > 0:
        results.append([
            gene_name, 
            gene_id, 
            motif_sequence, 
            total_hits, 
            all_positions, 
            sequence])

# Generate the csv file for hits
with open("motif_scan_forward_results.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow([
        "Gene_name",
        "Gene_ID",
        "Motif_Sequence",
        "Number_of_Hits",
        "Position_of_Hits",
        "Promoter_Sequence"
    ])
    writer.writerows(results)
    

    


FileNotFoundError: [Errno 2] No such file or directory: 'Promoter_sequences.fasta'

jaspar module

In [None]:
# Import the packages

from pyjaspar import jaspardb
from Bio import motifs
from Bio import SeqIO
from Bio.Seq import Seq
import csv

# Fetch motifs from JASPAR database and return as Bio.motif object
def fetch_motifs_from_jaspar(tf_list, release="JASPAR2024"):

    jdb = jaspardb(release=release) # set the JASPAR object
    motif_dict = {} # create dict for storing motif

    for tf in tf_list: 
        motif_list = jdb.fetch_motifs_by_name(tf) # Fetch motifs from JASPAR by name
        if motif_list:
            jaspar_motif = motif_list[0] 
            motif_dict[tf] = jaspar_motif # Add to dict
            print(f" Fetched motif for {tf}: {jaspar_motif.matrix_id}") # Check the TF and ID
        else:
            print(f"No motif found for {tf}")

    return motif_dict

# Scan promoters (fasta file) for matches and include match
def scan_promoters_for_motifs(fasta_file, motif_dict, threshold=7.0):

    results = [] # Set the empty list to store match motif's information

    for record in SeqIO.parse(fasta_file, "fasta"): # Parse the fasta file
        gene_info = record.id # Extract the gene_info from fasta file
        promoter_seq = str(record.seq) # Extract the sequence data from fasta file
        reverse_seq = str(Seq(promoter_seq).reverse_complement()) # Reverse the sequence
        #print(len(promoter_seq))
        #print(len(reverse_seq))

        if "_" in gene_info:
            gene_name, gene_id = gene_info.split("_", 1)
        else:
            gene_name, gene_id = gene_info, "NA"

        for tf, motif in motif_dict.items():
            pssm = motif.pssm # Position-Specific Scoring Matrix
            motif_len = len(motif) # check the length of motif
            #print(motif_len)

            # Find the matches in forward strand
            forward_matches = list(pssm.search(promoter_seq, threshold=threshold))
            print(forward_matches)
            for pos, score in forward_matches:
                start = pos 
                end = pos + motif_len
                results.append([gene_name, gene_id, tf, "+", start, end]) # Add matched TF information to list

            # Find the matches in reverse strand
            reverse_matches = list(pssm.search(reverse_seq, threshold=threshold))
            print(reverse_matches)
            for pos, score in reverse_matches:
                start = len(promoter_seq) - pos - motif_len
                end = start + motif_len
                results.append([gene_name, gene_id, tf, "-", start, end]) # Add matched TF information to list

    return results

# Write results to CSV with start and end columns
def write_results_to_csv(results, filename="motif_scan_results.csv"):
    with open(filename, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([
            "Gene_Name", "Gene_ID", "TF", "Strand", "Start", "End"
        ]) # Assign the column name
        writer.writerows(results) # Write the row in csv file
    print(f"Results written to {filename}")

# Run functions
if __name__ == "__main__":
    tf_names = ["IRF3", "IRF7", "IRF9"] # Set the list to fetch multile transcription factors
    fasta_file = "Promoter_sequences.fasta" # Sequence data to scan

    motifs_dict = fetch_motifs_from_jaspar(tf_names) # Store function result in variable
    scan_results = scan_promoters_for_motifs(fasta_file, motifs_dict, threshold=7.0) # Store function result in variable
    write_results_to_csv(scan_results)


 Fetched motif for IRF3: MA1418.2
 Fetched motif for IRF7: MA0772.2
 Fetched motif for IRF9: MA0653.1
3000
3000
17
[(np.int64(2903), np.float32(7.022167))]
[(np.int64(-2920), np.float32(7.022167))]
13
[(np.int64(2888), np.float32(8.980127))]
[(np.int64(-2901), np.float32(8.980127))]
15
[]
[]
3000
3000
17
[(np.int64(-2827), np.float32(7.4404163)), (np.int64(2691), np.float32(9.355081)), (np.int64(-40), np.float32(13.019573)), (np.int64(-33), np.float32(7.6814137)), (np.int64(-27), np.float32(7.9259024))]
[(np.int64(10), np.float32(7.9259024)), (np.int64(16), np.float32(7.6814137)), (np.int64(23), np.float32(13.019573)), (np.int64(-2708), np.float32(9.355081)), (np.int64(2810), np.float32(7.4404163))]
13
[(np.int64(-1994), np.float32(14.6928)), (np.int64(1574), np.float32(8.584064)), (np.int64(-1164), np.float32(8.768341)), (np.int64(2590), np.float32(10.45305)), (np.int64(2696), np.float32(15.829915)), (np.int64(-41), np.float32(16.261517)), (np.int64(-28), np.float32(10.871973))]
[(np.

In [None]:
from pyjaspar import jaspardb
from Bio import motifs
from Bio import SeqIO
from Bio.Seq import Seq
import csv

# Fetch motifs from JASPAR database and return as Bio.motif object
def fetch_motifs_from_jaspar(tf_names, release="JASPAR2024"):

    jdb = jaspardb(release=release)
    motif = jdb.fetch_motifs_by_name(tf_names)

    if motif:
        jaspar_motif = motif[0]
        print(f" Fetched motif for {tf_names}: {jaspar_motif.matrix_id}")
        return jaspar_motif
    else:
        print(f"No motif found for {tf_names}")
        return None

# Scan promoters (fasta file) for matches and include match
def scan_promoters_for_motifs(fasta_file, motif_dict, threshold=7.0):

    results = []

    for record in SeqIO.parse(fasta_file, "fasta"):
        gene_info = record.id
        promoter_seq = str(record.seq)
        reverse_seq = str(Seq(promoter_seq).reverse_complement())
        print(len(promoter_seq))
        print(len(reverse_seq))

        if "_" in gene_info:
            gene_name, gene_id = gene_info.split("_", 1)
        else:
            gene_name, gene_id = gene_info, "NA"

            pssm = motif.pssm
            motif_len = len(motif)

            # Forward strand
            forward_matches = list(pssm.search(promoter_seq, threshold=threshold))
            print(forward_matches)
            for pos, score in forward_matches:
                start = pos
                end = pos + motif_len
                results.append([gene_name, gene_id, "IRF3", "+", start, end])

            # Reverse strand
            reverse_matches = list(pssm.search(reverse_seq, threshold=threshold))
            print(reverse_matches)
            for pos, score in reverse_matches:
                start = len(promoter_seq) - pos - motif_len
                if start < 0:
                    continue # skip invalid match
                end = start + motif_len
                results.append([gene_name, gene_id, "IRF3", "-", start, end])

    return results

# Write results to CSV with start and end columns
def write_results_to_csv(results, filename="motif_scan_IRF3.csv"):
    with open(filename, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([
            "Gene_Name", "Gene_ID", "TF", "Strand", "Start", "End"
        ])
        writer.writerows(results)
    print(f"Results written to {filename}")

# Run functions
if __name__ == "__main__":
    tf_names = "IRF3"
    fasta_file = "Promoter_sequences.fasta"

    motif = fetch_motifs_from_jaspar(tf_names)
    scan_results = scan_promoters_for_motifs(fasta_file, motifs_dict, threshold=7.0)
    write_results_to_csv(scan_results)


 Fetched motif for IRF3: MA1418.2
3000
3000
3000
3000
3000
3000
3000
3000
Results written to motif_scan_IRF3.csv


In [None]:
from pyjaspar import jaspardb
from Bio import motifs
from Bio import SeqIO
from Bio.Seq import Seq
import csv

# Fetch motifs from JASPAR database and return as Bio.motif object
def fetch_motifs_from_jaspar(tf_names, release="JASPAR2024"):

    jdb = jaspardb(release=release)
    motif = jdb.fetch_motifs_by_name(tf_names)

    if motif:
        jaspar_motif = motif[0]
        print(f" Fetched motif for {tf_names}: {jaspar_motif.matrix_id}")
        return jaspar_motif
    else:
        print(f"No motif found for {tf_names}")
        return None

# Scan promoters (fasta file) for matches and include match
def scan_promoters_for_motifs(fasta_file, motif_dict, threshold=7.0):

    results = []

    for record in SeqIO.parse(fasta_file, "fasta"):
        gene_info = record.id
        promoter_seq = str(record.seq)
        reverse_seq = str(Seq(promoter_seq).reverse_complement())
        print(len(promoter_seq))
        print(len(reverse_seq))

        if "_" in gene_info:
            gene_name, gene_id = gene_info.split("_", 1)
        else:
            gene_name, gene_id = gene_info, "NA"

            pssm = motif.pssm
            motif_len = len(motif)

            # Forward strand
            forward_matches = list(pssm.search(promoter_seq, threshold=threshold))
            print(forward_matches)
            for pos, score in forward_matches:
                start = pos
                end = pos + motif_len
                results.append([gene_name, gene_id, "IRF7", "+", start, end])

            # Reverse strand
            reverse_matches = list(pssm.search(reverse_seq, threshold=threshold))
            print(reverse_matches)
            for pos, score in reverse_matches:
                start = len(promoter_seq) - pos - motif_len
                if start < 0:
                    continue # skip invalid match
                end = start + motif_len
                results.append([gene_name, gene_id, "IRF7", "-", start, end])

    return results

# Write results to CSV with start and end columns
def write_results_to_csv(results, filename="motif_scan_IRF7.csv"):
    with open(filename, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([
            "Gene_Name", "Gene_ID", "TF", "Strand", "Start", "End"
        ])
        writer.writerows(results)
    print(f"Results written to {filename}")

# Run functions
if __name__ == "__main__":
    tf_names = "IRF7"
    fasta_file = "Promoter_sequences.fasta"

    motif = fetch_motifs_from_jaspar(tf_names)
    scan_results = scan_promoters_for_motifs(fasta_file, motifs_dict, threshold=7.0)
    write_results_to_csv(scan_results)


 Fetched motif for IRF7: MA0772.2
3000
3000
3000
3000
3000
3000
3000
3000
Results written to motif_scan_IRF7.csv


In [None]:
from pyjaspar import jaspardb
from Bio import motifs
from Bio import SeqIO
from Bio.Seq import Seq
import csv

# Fetch motifs from JASPAR database and return as Bio.motif object
def fetch_motifs_from_jaspar(tf_names, release="JASPAR2024"):

    jdb = jaspardb(release=release)
    motif = jdb.fetch_motifs_by_name(tf_names)

    if motif:
        jaspar_motif = motif[0]
        print(f" Fetched motif for {tf_names}: {jaspar_motif.matrix_id}")
        return jaspar_motif
    else:
        print(f"No motif found for {tf_names}")
        return None

# Scan promoters (fasta file) for matches and include match
def scan_promoters_for_motifs(fasta_file, motif_dict, threshold=7.0):

    results = []

    for record in SeqIO.parse(fasta_file, "fasta"):
        gene_info = record.id
        promoter_seq = str(record.seq)
        reverse_seq = str(Seq(promoter_seq).reverse_complement())
        print(len(promoter_seq))
        print(len(reverse_seq))

        if "_" in gene_info:
            gene_name, gene_id = gene_info.split("_", 1)
        else:
            gene_name, gene_id = gene_info, "NA"

            pssm = motif.pssm
            motif_len = len(motif)

            # Forward strand
            forward_matches = list(pssm.search(promoter_seq, threshold=threshold))
            print(forward_matches)
            for pos, score in forward_matches:
                start = pos
                end = pos + motif_len
                results.append([gene_name, gene_id, "IRF9", "+", start, end])

            # Reverse strand
            reverse_matches = list(pssm.search(reverse_seq, threshold=threshold))
            print(reverse_matches)
            for pos, score in reverse_matches:
                start = len(promoter_seq) - pos - motif_len
                if start < 0:
                    continue # skip invalid match
                end = start + motif_len
                results.append([gene_name, gene_id, "IRF9", "-", start, end])

    return results

# Write results to CSV with start and end columns
def write_results_to_csv(results, filename="motif_scan_IRF9.csv"):
    with open(filename, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([
            "Gene_Name", "Gene_ID", "TF", "Strand", "Start", "End"
        ])
        writer.writerows(results)
    print(f"Results written to {filename}")

# Run functions
if __name__ == "__main__":
    tf_names = "IRF9"
    fasta_file = "Promoter_sequences.fasta"

    motif = fetch_motifs_from_jaspar(tf_names)
    scan_results = scan_promoters_for_motifs(fasta_file, motifs_dict, threshold=7.0)
    write_results_to_csv(scan_results)


 Fetched motif for IRF9: MA0653.1
3000
3000
3000
3000
3000
3000
3000
3000
Results written to motif_scan_IRF9.csv


## Find only on forward strand

In [76]:
from pyjaspar import jaspardb
from Bio import motifs
from Bio import SeqIO
from Bio.Seq import Seq
import csv

# Fetch motifs from JASPAR
def fetch_motifs_from_jaspar(tf_list, release="JASPAR2024"):
    jdb = jaspardb(release=release) # set the JASPAR object
    motif_dict = {} # Create empty dict

    # Iterate through transcription factors
    for tf in tf_list:
        motif_list = jdb.fetch_motifs_by_name(tf) 
        if motif_list:
            jaspar_motif = motif_list[0] # Store 0 postion of motif_list to variable
            motif_dict[tf] = jaspar_motif # Add to dict
            print(f"Fetched motif for {tf}: {jaspar_motif.matrix_id}") # Print TF and TF id to confirm
        else:
            print(f"No motif found for {tf}") # Print this message when there is no motif information.

    return motif_dict

# Scan promoter regions using both orientations of motif on forward strand
def scan_promoters_for_motifs(fasta_file, motif_dict, threshold=7.0):
    results = []

    # Parse the fasta file which contain promoter sequences
    for record in SeqIO.parse(fasta_file, "fasta"):
        gene_info = record.id # Extract the gene information
        promoter_seq = str(record.seq) # Extract the sequences

        # Extract the gene_name and gene_id from fasta file
        if "_" in gene_info:
            gene_name, gene_id = gene_info.split("_", 1)
        else:
            gene_name, gene_id = gene_info, "NA"

        # Iterate through motif_dict
        for tf, motif in motif_dict.items():
            motif_len = len(motif)

            # Get both forward and reverse PSSMs
            pssm_fwd = motif.pssm

            # Search with forward PSSM
            forward_matches = list(pssm_fwd.search(promoter_seq, threshold=threshold)) # Scan Promoter sequences with forward motif patterns
            for pos, score in forward_matches: # Iterate through matches
                start = pos 
                end = pos + motif_len
                if start < 0 or end > len(promoter_seq):
                    continue # Skip when start value is negative or end value is over the promoter sequence length.
                results.append([gene_name, gene_id, tf, "+", start, end]) # Add matched data into empty list to write CSV file
            
    return results

# Save results to CSV
def write_results_to_csv(results, filename="motif_scan_results_threshold_1.0.csv"): 
    with open(filename, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([
            "Gene_Name", "Gene_ID", "TF", "Strand", "Start", "End"
        ]) # Write header row
        writer.writerows(results) # Write the rows in CSV file
    print(f"Results written to {filename}") # Print this message to confirm

# Run functions
if __name__ == "__main__":
    tf_names = ["IRF3", "IRF7", "IRF9"] # Set the list to fetch multile transcription factors
    fasta_file = "/Users/sang/bioinformatics_msc_project_2025/github/TFBS-explorer/outputs/Promoter_sequences.fasta" # Sequence data to scan

    motifs_dict = fetch_motifs_from_jaspar(tf_names) # Store function result in variable
    scan_results = scan_promoters_for_motifs(fasta_file, motifs_dict, threshold=1.0) # Store function result in variable
    write_results_to_csv(scan_results)

Fetched motif for IRF3: MA1418.2
Fetched motif for IRF7: MA0772.2
Fetched motif for IRF9: MA0653.1
Results written to motif_scan_results_threshold_1.0.csv


# Playground

- try using only forward strand and only forward profile

In [4]:
ifnb1 = "GTAGTTTCTTTTTTCTTTGAGAGCCATCATCACCATCATGGTTGACACCATGAACCTATCTGAAGATGTCAGCCATAGACTGCTTGATATTCTACAGGAAAGATCACAGTTTTAAGTGCAATCTACCCATGTTATTAGCAGTGTGTATCTTTCACACATTACACAGCCTCTCTAAGCCTCATTTCTCTCCTCTGTAAGATGGGGATGATAATAACCCATCTCAAATGTTTACTATGAGGATTATTCAAAGAATGGCAAATAGCAAGTGCTTAATAAATGATAACTAGTACTACCGCCACTACTGTTGTTTTTATTGTATTAGATTATGAACTCTCTAAGGACCATTTCCGGATGGAGGATAAGAGACCATTTGATGTGGGCAGTGATGAGGCCTTCTGTTGCACCTGGAAAGGTCAACTATATACAAGCCTGCAAGTCATTCTATAGGAGCAGGCCCCAGTGACCAGACTCTATAGACTGTCTCCTCTTTCCTGAGAGGGACAGCCATCTCTAGGTTGACTAACCTCTGAAGCTCCTTGCATTGGCTTTTGTGCTATGAGCCATGGATGATTCCAGACTAATCCGAGAATGCTCGTCAAAACCCCAAGGAATTACTCAAATACTGACATAACAGACATTTTTGAGTGGAAGAGCCGAGTTTTTTTTAATATTCTGAAACTCATTGTTTTTAAAATGCATGAGATGGCCAAGGTCTTGCTAAGAGCTGGCCTGCAAAGGCCAAAAGGCCAGAGAGAATGAAACCCATAGAGAGGCAGAATAACCAGAAAGGTTGGGACTCGTTTATTTTATAATGTAAATTAGTCTATTATGAAACAATACTTGTTTACTGGTGGAAAATTGGAAAATACAAAGAATAAAAGGAGGAAAAAAATCACTCTTTAGTTTCACAAGCCAAATCAAGCCACTATTAAAATGGTGGTTTACTTCCTTTTATTAATTTTCTCTACATATTTTTGCATAATCATGTTGTATGTACAATTTTATGTTCTATTTTTCAATATTAACTGGTGTCTTTCAAATTTCCTAATGACAAAAATAATATATGCTCATAATAGAACATTTTAAATGCAAATAAAACAAAATAAATGTTAAAATTTAGTAATATTTATTAAATTTTCTCCAAGTGCACGAAATTACAAATGTAACAACCTAATTCCCTAGTGGCCTAATAACCCTATTTCCCAGACCTCTTCTCATTACAAGGAAAAACTCATATGCAGATAGTTCTAAAGGTATGAAGTGAAAAGATAAAGATTTTTCTTCCTTGCTGCATCCTCACCCCATCAGCATTATTCCCCAGGGTAACTACTATTAATAGATAGTAATTCTACCCAAAGGAAAAAATCATATGCATATAACAGCATCATATGTATACCTTTCTAGTAACTTACAAAACAAATGATAATATCATATCCTTTCTTATGTGTATTGCTCTTTTCACTAAATGTATCTGTGATATGTGTCTATATCAGCTGATTGTCCTTTTTGATGGCTGAATAATATTCCATCTTGTCCACGTGATAGTATTACTTGACAAGCTCCCTGCTGATGGACATTTGTCTTTGTTACTATGATAGTAATATAATCAACATTTATATATGTTTTGTATGTATCTATAATACACATGCACATACACATGCATATTTCTGCAGGGATAGCCATAGTAAATAACTAGTAACGGTATTGCAAGTTAAAGGAACAATCTCATTGCTTGAAATTTTAAATTTTGAAATACACTGCCAATTTTCATGGTCTCTCCTTGTAAGCTAGTTTGGGCTTTCTCACAGCATGACAGGCTCAGGGCAGTCAGACCATCCTGGCCAAAGAGCAGAGTGCCACAGACCACAACTGCTTCTAATCAGCCATCTTCCCAAAGCCTTCTCTTTTTTCTATTAATAACTTTGTATGAGATTCCATCTTAATACTTTTCTGTTGTTTGGTCTTGTAAGAGCTTATTTTTCCTGAACCAGGAAGTGGTTCAGGGCGGTTTTTCTAACTTCACAGAGCTCCCTCTTCTGTTAGCTTTTGTGAAATGGTCAAAAACATAGCAGCCTGCCTTCTGAGTTCTCCATCCCACCCTGGTTGGGCCTTCTCTATCCTTGTCTGTGTTGTTTATATCCTGCTGAAGTGTGATTCCACTTGTGGCAGTTTCTCCTCTGTGTAGGATCAAAAGGGCTGTGACTGGTTGGTTTGAAAATTTCTTATACCCTAGACTATTCCAGTGCCTTTCAGAAGTTTCCAAGGCCCTCTCACACTAATCTATTATCATATTGGGCAAAACTCCTTGCAGTTTCAGCTACTATTCCCTGATTGACTTTTCAGTAAATCTATCTCTCAGTCTTTCAGTATCCAAAGAAGATTGGTTCTAGGACCACCATCCCGCTGCCTCCACAGATACCAAAATCAGAGGATGCTCAATTCCCTCTTATAAAACGTTGCAGTATTTGCATATAATCTGCACATGTATTTCTGTATATTTTAAATCATCCCTAGATTACTTATAATACCTGATACAATATAAATGCTAAATAGCTGTAACACTGTATCTTTAAAATTTACATTATTTTTTGTTGTTGTATTATTATTTTTATTGTATTTTTAAAAAATATTTTCCATCTACAGTCAGTAGAATCCACGGATACAGAACCTATGGATAGGAAGGACCAACTGTATCTTTTAGTGTTTTGAGGTTCTTGAATTCTCAGGTCGTTTGCTTTCCTTTGCTTTCTCCCAAGTCTTGTTTTACAATTTGCTTTAGTCATTCACTGAAACTTTAAAAAACATTAGAAAACCTCACAGTTTGTAAATCTTTTTCCCTATTATATATATCATAAGATAGGAGCTTAAATAAAGAGTTTTAGAAACTACTAAAATGTAAATGACATAGGAAAACTGAAAGGGAGAAGTGAAAGTGGGAAATTCCTCTGAATAGAGAGAGGACCATCTCATATAAATAGGCCATACCCATGGAGAAAGGAC"

In [45]:
ifit1 = 'GCTGATAAGAGATTTTAGACTACAAGAGATGACCCTGGAAAAGACAGGGGAAATTTCTATGGAGCCCATTCCTTAATAAGTTGTGATTGGATATTTGAGACCCTTCAGCAGAAATGGCTTAGCATTTGTCCATATTATCATATTATAATTTTTAAAAAATACACTCAGAATAACATTCTTTTTCCATATCATCCTGTGAATGAAGAAGAGACTAAGGAATTCATTACCCATTGTGTAGGTGAGGAAACAAGGCATTGAGTCGTAAAATACTGAATCTGGGCCTCACAGAACAGGAAGTGGGTCCAGGGATTGAGTCCCATATCTCTCAGTCAGAGAATTTCAGACATACAATAGACATTAGGGATAATTGAGGCCCTGGTTCCCGGCTCAGGGCGCAGGTCTACATCAAAATCACCTGAAGATTTCTTTCTTTTTTTTTTTTTTTTTTTTTTTGAGATGGAGTTTCACTCTTGTTGCCCAGGCTGGAGTGCAATGAAATGGCACGATCTCGGCTCACCACAACCTCCACCTCCCGGGTTCAAGCGATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGATTACAGGCATGTGCCACCACGCCCGGCTAATTATGTATTTTTAGTAGAGACGGGGTTTCTCCATGTTAGTCAGGCTGGTCTCGAACTCCCAACCTCAGGTGATCCGCCCGTCTTGTCCTCCCAAAATGCTGGGATTACAGGCGTGAGCCACTGTGCCCAGGCTGAAGATTTCTTTTTAATACTGAGCCAGAGGCCTGTACCTAAGTATATGTTCAATTGTTCTGGAATGGGCATAGTTAGCATTATTTTTGAGATTTACCTGGCTGATTCTAATGTGCAGTGAGGGTTGAGATCTGCCAACCTATTCCAAATGACTTCTCAGCTGAGGAAATTGAGATTCATAGAGGTGAAGCAGTTTATCATAACTGGTCAGTCCCTTGCAGAGCCCAGCTTAGACTCCTGGGAATGAACTCACAACCTAGTGCTGAAATTTCATTTTCTAAATGTTTGATCCTCTCTGGTTCCCATTTTCTTTTTTATCTGTAAACACACACACACACGCGCGCGATTGGCTTTTCTTTCCTTAATGCTGTTCTATCCTGTGTCCCTCATGGGTTTGTTTAGCCCAAAGAGACACTAAGCAAGATTCTGAATACCTAATCAGCCTGGGCCATAGCCCCGTAGAACCAAGACTGCTGCCAAATTCACAGCAATGGCAAGATAAACTTTTTGTTCATATCTCTGCATGCTATCTAGTGCATTGGCAAATTGAAAACCCATTTTTTCCCATAAGAATTCCAAAGACATCTTGCTCCCCAAAAGATGTTTTCTAATTCATCTGTTTCAGCATTTGCTTATGGCCTGATCATGTATCTGAGTGCAAGACAAAACCTGTCCTATTAATTCAGAAATTCTTATTTGTGTAGAACTTTCTAACTAGTTGTTGAACTCTCTGTGGAAGCTTACAGAATTGGTGAGAGCTGACATGGGAGATGAGTCTGGCAGGGAGTTGAGCAGAAACTGTTTAGATCTGGCAGTGTGCCCAGTTAAGTGTTTCCAAAAGGAAAATACTCTTGCACCTGCTCCTCTTCTGCTTCTTATGTTAAAAGACAAAGTAGGTAGGTTTTTTTAAAGGAAACATTGTTTTTATATATTGTTATGATTATTTGAGACAGGGGCTCACTCTGTTGCCCAGGCTGGAGTGCAGTGGTGTGACCATGGCTCACTGCAGCCTCTACCTCCTGGGCTCAAGTGAGCCTCCCACCTCAGCCTCCTGAGTAGCTGAGACCACAGACACACACCACCACACCCAGCTTTTTTTTTCAGTTTTTTTGTAGAGTTGGGGTCTCCCTATGTTGTCCAGGCTGGTCTACAATTCCTGGGCTCAAGTGATCCTCCCGCCTTGCCTTCCAAAATACTGGGACTTACAGGCCTGAGCCAATGCACCCGGACAACAATTTTTTATTTTTTATTTATTTATTTTTTTAGACGGAGTCTCGCTCTGTCACCAGACTGGAGTACAGTGGTGTGATCTCGGCTCACTGCAACCTCTGCCTCCCAGGTTCAAGCAATTCCCCTGCCTCAGCCTCTCGAGTAGGTGGGACTACAGGTGCACACCACCACACCCAGCTAATTTTTTGTATTTTAGTAGAGAGGGGGTTTCACCATGTTGGCCACGATGGTCTCCATCTCCTGACCTTGTCATCCGCCCACCTTGGCCTCCCAAAGTGCTGGGACTACAGGCATGAGCCACCGCACCCAGCCAAGAATCATTATTTTTAACTTGATGACTGAAAATAATAATAATAATAGTTACCACTTATTTGCATGCTTCTATGTGCCAGGTAGTTGCTAACTATTTAAACTCAAATTCCATGAACTGTAGTGGAGGTTGTACTGGAATTTGATTCAGAATGACAGTGTCCATGATGGAGCAATAGAGGGCTCTCTATTTCAAACCATACCTCCTTGCTTTTACCTCCTGCCTAAGTCATCAGGGGTTAGAAGGCTTTCTAGGTATTGGTCTCTTTCCTTCATTCCTAAACCAGATTGGTTGCTTATTTCCGTCAAGCTGAAACCAAAAGTAAGCAACCAAAAAGCAACCAGCAACCAAAAGCCTTGTTACTCAATTAATTAAGAGTAGATTTTTATATTTGATAGTAGGTTCCTTCTAAATATAGAAACTGAAAATAGAGCTATCTCCTTCAATTCTCCTTTTTCTGTGTATTCATCCAGAATCCAGCCACCAACTGCCACAATAGGCAGCAATGGACTGATGTTCTTTAGGGAGGACGTGAATCTCGTTCCAAATGCTGGCCAGTCATTGGGTTTCTGCAGCACTAGAAACATCTATGGTTGCAGGTCTGCAGTTTATCTGTTTTAAAATAGAAACAAAGTTTCATTCCCCACCCCCCCCCGTCAGCAGGAATTCCGCTAGCTTTAGTTTCACTTTCCCCTTTCGGTTTCCCTAGGTTTCCAACTT'

In [46]:
MX1 = 'AATGAAATAATGGCATTCACAGCAACCTGGATGGAGTTGGAGACCATTATTCTAAGTGAAGTAACTCGGGAATAAAAACCAAACATTGTATATTCTCACTCATAAGTGAGATTATACAAGCTATGAGGATGCAAAGTCATAAGAATGATACAATAAACATTGGGAACTCAGGGGGAAGGGTGAGGGTGGGGTGAGGGATAAAAGACTAGACATCGGGTACAGTATACACTGCTTGGGTGATGGGTGCACCAAAATCCCAGAAATCACGATTAAAGAACTTGTCAATGTAACCAAACACCACCTGTTCCTCAAAAACTATTGAAATAAAAAATAAATACATAAAAGAAGTTGAGGTTAAATTAATCTAAGTAGGGAGTGCATTTGGGCCAAGCTTGAGGATTGCAACGTTGGAGCACAGATTTAAGTTGCCCTGAATCTACACTCTGATAAGTGGCATTTATAAGTGGATTTTTAAAGGCAAAAGGGGGAGACAGGGAGAGGGCTGATACAAAGCTGTTTGTCAGGAATTCTTATTGGTTTACAGAAATAACATTGATAAGTGCTTATTAGTCCATTTTCATGCTGCTGATAAAGACATACCTCAGACTGGGCAATTTACAAAAGAAAGAGGTTTATTGGACCTATAGTTCCACATGGCTGAGGAGGCCTCACAATCATGGTGGAAGGCAAGGAGGAGCAAGTCACATCTTACATGGATGACAGCAGGAAAAAAGAGAGAGCTTGTGCAGAGAAACTCCCTTTTTTAAAACCATCAGATCTTGTGAGACCCGTTCAGGATCATGAGAACAGCACAGGAAAGACCCACCCCCAAGATTCAATCATCTCCCACCGGGTCTCTCCCACAACATGAGTGAATTATGGGAGCTACAGGATGAGATTTGGGTGAGGACACAGAGCCAAACCATATCAGAGTGATTGACTATACATTGTTAAGCTACAGGGTGTGGATTACAGCGTCCAGTGTGGCATTATTAGGTTAATCCATAGCTATTAATACCTGTGGCAATAGTAAGAGGTTTCAAGAGATGAACACACAGCTGAAAGTGGGGAGTAGGGAGTGGTTGCTGTCTTATTTTAATGTCTCTCTGGGCCTGATAATTTTAAAAGCCTTGCATTCTTCGGATACAAGTTCCTTTCTTTTTTTCAAGAACAACAGAGAAAGGCTGCCTGGCTACCTTTACCTGCAGCACCTCTGATAGTGGCTGCTTGGCAGCTGGTTAGGTAAACCCAATTTCTACTGGATAAACTTCATGGACATTCACCCATGTTAGTACTGGGGAATGCTCTGAAATAGCAAACTTGTCCATACAGCTCCCTTGCTGCTCAGAGGGTCTGGAGAGGAACAGCAGAGGCCCAGCCCCAAATACTTCTTGTACCGGCTTCTATTATGAGAGGTGATGGCACCTTTAGGCACTTATATTTCAGGTAGAAGGAGCAGGAATAAAGAAGAGAGGGCCACAATCTGAAATATCAGTTCTGGATCTCCCTGGGGGCAAGACACTGGCCCTTAGCTGAGCTATTCTGTCATCACTGTGAGCCCTCATGCTCGACCTCACCGGAGCACCCCTCGATCATGTGCTGAATGCCTAGCACGCGCCTGGCACTGTGGGGTACAGAAATCTAGAGCACAGAGCTGAGTTTTCAGGGCTTAGCTGTCCTCACATCTGTGACACAAACAAGGATCAAGGAAAGTTTTGTTTTTGTTATTTTTATTTTTTTTTAAGTTGGGGCCTTACTCTGTAGCCCAAGCTAGAGTGCAGTGGTGCAAACATAGCTCATTGCACCCTCGAACTCCTGGGCTCAAGTCATCCTCCCACCTTGGCCTCCAGAGTAGCTGGAAGCCCAGCTCAAGGAAAGCATTAACATGTTCTACTTTTCACCATAACTCCTGTGTATTGGGAAATGGAAAATGAGCTTTTGAAGTGGCCTATTCGCTTCCTCCATGAGTACTTCCTGAACACTTTCCTCTTGGCCACTGCTCCCAAGACCCCCTCAAAGTCTCTCTCCTGGACTTCCAGCTTCTGGAATGAGCCCCAAGAGTGGCACTTAGCTCCAGCTCCTAGTAGCTCACTCCTTCCTTGCTCACCTGTGCCCCCCACCTGTCTCCTTAGGGGCAGTGGCTTTGTCTTACTCATTTGCATATCACAAATGTCCAGCATAGAAGCCAGCATGGGGCCCAGTAAGTGTTTGTGGAGAAAGAAAGGGAAGAGGTGAAGAAGGGAGGAGACGGAGGGAGAAAAGGAGGGCAAGAGAGGGGATGTCATGTCATGTCATGTCATGTCATGTCAGTTCTCCCTTCCTAGATCTGGACAAGGGAAAGAAAAGCAAGCAAGCCTGACCGCCTCAGTGCAGACCCTCTAAGATGCAAGCCTTCCCCAGGTAAGACAGTTCCGCAGGGTGCTTCTGTGGGCTGGCTTTCTGAGCAGCCATCTCAAAGTATGCGAAAGAAATAGATTTTGGGCCGGGCACGGTGGCTCATGCCTGTAATCCCAGCACTTTAGGAGGCCGAGGCGTGTGGATCACCTGAAGTCAGGAGTTCAAGACCAGCCTGGCCAACATGGTGAGACCTTGTATCTACTAAAAATACAAAAACTAGCCGGGCTTGGTGGCGCATGCCTGTAGTCCCAGCTACTCAGGAGGCTGAGGCAGGAGAATCGCTTGAACCCAGGAGGTAAATGTTGCAGTAAGCCGAGATTGCCCCACTGCACTCCAGCCTGGATGACAAGAGTGAGACTCCATCTCAAAAAAAAAAAAAAAAAAAGATTTTAGGGGTAAAATATTTTGCTTTCCTTCAGTGGCTGGGCCACCTGTACACAATGCCTGAACCCAGGTCCTGCAGCCCCCAGCTTGGCTCTCCAAAGCTCACCAGTATCAAAGGATTTCAGCTTCTGCTCCTCTCCGCAGGGCTCTCTGGTGGGCCCCAGACAGAGATGAAGGAAAAGTCTGGCCTTGACCTTGAGGACCAAAAGCGACCACGTC'

In [47]:
TLR7 = 'ATGGGCGTGGGACCCTCCGAGCCAGGCGCGGGATATAATCTCCTGGTGTGCCATTTGCTAAGACCATTGGAAAAGCACAGTATTAGGGTGGGAGTGACCCGATTTTCCAGGTGCTGTGTGTCACAGCTTTGCTTGGCTATGAAAGGGAATTCCCTGACCCCTTGTGCTTCCCAGGTGAGGCAATGCCTCGCCCTGCTTTGGCTCATGCTCGGTGTGCTGCACCCACTGTCCTGCGCCCACTGTCTGACAAGCCCCAGTGAGATGAACCCAGTACCTCAGTTGGAAATGCAGAAATCACCCATCTTCTGCATTGCTCACGCTGGGAGCTGTAGACTGGAGCTGTTCCTATTCAGCCATCTTGGAACCACCCTACAGTTTCTTTCAATACTTTTCATGTTCAATGTCTTACACTTAAATTTTTCAACTACATGAAATTTATTTTGATATAAGAAATGAGGTTGGGCCAGGCGCAGTGGCTCACACCTGTTATCCCAAAACTTTGGGAGGCTGAAGTGGGCGAATCATGAGATCAGGAGTTCAAAACCAGCCTGGCCAATATAGTGAAACCCCACCTCTACAGAAAATACAAAAAATTAGCCGAGCGTGGTGGCGGGTGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCTGGGAGGTGGAGGTTGCAGTGAGCAGAGATTGCGCCACTGCACTCCAGCCTGGGTGACAGAGTGAGACTACATCTCAAAAAATAAATAAAAAGACAAATGAGGTTGGACAATAGCTTGATTTTTTAAAAACTCCAAGCCAGTTGCCCAATAATCCATTTTGCCATTGATGTGACTTGAAACTAAACACCCATATGTGCTTTTACTTAATTCTGCTCACTTCTCAGTTTCACTGATGCACTGGTACTATGCCATTGGAGGCAATACAGCATTCTAATGCATTTACAGATATGGAAGAGAAATCCTCCCCTTTTATTTTCATTCAGAAATTGCCTGGCTATTCTCACATGTGGTCTTACCCCAGTCAGACCCAACACCTCACTTTTATAACAAATATTTGGTAATGCACCCTTTGTTATATGAAAGGAGATATTTGTGGATAATGTAACCCCAGTCTTCATGATAAACAAAAAGGCCCAGCTGATTTCCAAAATGCACCCCAGTTTAGAATCAGTCTGGCAAGTATCACATGAAATCCTATTGGTATTTGATTGGGATCACACAGCATTTGCAAATCAATTTAAGCAACATTTCTATTTTTACAATATTGTGGCCTCTAGCCCCAATAACAATTATTTCTCTTCATTTACTTATATCTTTGTGTCTGGGCAGGGTCCTGCCAGGAAACAGACGGCATGTTAAAGTGAGAAACTGATGAGTTCAGCAAAGTGACTATTTATATATTTGGACAGGATTTAAGGAAGTAAGAAAGGATGATGCAACACTTCAGAGGGGAGTCTTTCCGACCTCAGGCTGAAGGAGAAGGAACGATTACTGGAATTCAGGGAGGAGAGCATCACCAAACAAGAGCTTCATTAGAGGACTGCAGCCAACGCAGGGCCAGGCGGAGGGAGCCAGAGGGAGGCAGGCTCTGCTCTCCCTCTTCCTGCCCTTCAGTTTCCAACCACGGCCTCCTATTGGCCAAACCCAACCAGAAGCCAGCGAGCAAGGGGCTACTGATGAAGCACATATAGCTCAGCCTCCAGAGACACAGAACAGGATAAAAGGCTGAGACAGTGGGTCTGGTGGGGCAAAGAGAAAGCTTGCACCTGCCAGGTAAAGCATTATAGTCCCCATCCTCCCCCCACCACCTTAGTTCTTGTGCATTTCCCATCAGTTTTCTTCCAAAGCATTTCAGATCTCACTGATTTGAATGGGACCTCTTCTTCTATTACCAATTTGGATCAGTAATTATTTATGTATGAGAAAACTATTGATTTTTACATATCGTTGCATAGCAGAGTGGTTTAATAAGGAGACATTTGGTTTTTACTGCCTGTGTGGGTATCACTTGCTATGTGACTTGAGGCAAATCCAATATTTCTTCTGTTATAAATTCCAGTATTTGTAAAAAATGGGTAATAAGATCTCTATTTTATATAGTTTTAGTATTTAATGAGATAATACATATAAAGTCATTAAAACAGTGTCTGGCTCATAAAAAACCCTCAATAAATGTCACTTATTACTGTATCTGGTTTTTGAGCTGCTCTATTGCACCATTGAGTTTTCAGCCCAGTATATGTTAACCCTGATCATTATCTGCAGAAGTCCCCGTGCCACACTCTACATCATCCAAATTCTCTCCAGGTGGACTAAGTAGATTAAAGAACTTTAAACATAACTACCATATTTTGGCTCTATCTACAAAATGTCCAATAATCAGTTAAGAAAGGAACAATTCTCTTGGGGCCCACACTTTGAGAAGCAAATGCAGCTGAACTTTTTTAGAGGAAAGTGAGTGAACCAACTGGTAGCTTTGCCACTGCTTAAAAACCAGCATCCTTTCCAGCTGGGTCTAAGACAGAATAAGGTAAATTTAGATATGTCTCTAATATATCTATAGAACAGTGGTTCTCAACCCGGGGTGTTTTTGCCCCTTAGGGGATAATTTGCAATGTCTGGAGACATCTGTGATTGTCATAACTGGAAGGGGGCAGTGCTATTGGCATCTAGTGGGTATAGAGCAAGGGTGCTACCAAATATCCTATGGTGCAACAGAGAATTATCTGGTCAAAAATGTAAATAGTGCTGAGGGTGAGAAACCCTGCTATAAAAACGAAAGAAATTTGGTCTACAGAGTTGTTTGGATTTAGACAAGACGTTGCCCCAATAGTGGTGATAGAAATAAGAGGAACCCCGTGCTTTTGCAAAGCCCATATCTGGGGTGGCTTAAATAATCATGCTCCTCCCCATCCCCCGACCTGATCTTTGTAGTTGGAAACTCCAGGGCTGGCTGCCTGTAGTCTTTGTGACTACACTTCCTGCCTCCCATC'

In [5]:
positions = [751,768]

In [7]:
start, stop = positions

'GAGAATGAAACCCATAG'

In [8]:
ifnb1[start:stop]

'GAGAATGAAACCCATAG'

In [9]:
len(ifnb1[start:stop])

17

In [12]:
from Bio.Seq import Seq

ifnb1_dna = Seq(ifnb1)

In [14]:
ifnb1_dna

Seq('GTAGTTTCTTTTTTCTTTGAGAGCCATCATCACCATCATGGTTGACACCATGAA...GAC')

In [23]:
ifnb1_dna[3000-2249:3000-2232]

Seq('GAGAATGAAACCCATAG')

In [13]:
ifnb1_dna_reverse = ifnb1_dna.reverse_complement()

In [15]:
ifnb1_dna_reverse

Seq('GTCCTTTCTCCATGGGTATGGCCTATTTATATGAGATGGTCCTCTCTCTATTCA...TAC')

In [20]:
positions2 = [2232,2249]
start2, end2 = positions2

In [21]:
ifnb1_dna_reverse[start2:end2]

Seq('CTATGGGTTTCATTCTC')

## Threshold 3.0

In [49]:
match1 = [751, 768] # IRF3
match2 = [2903, 2920] # IRF3
match3 = [2909, 2926] # IRF3
match4 = [1092, 1105] # IRF7
match5 = [1097, 1110] # IRF7
match6 = [1149, 1162] # IRF7
match7 = [2694, 2709] # IRF9

In [50]:
start1, end1 = match1 # IRF3
start2, end2 = match2 # IRF3
start3, end3 = match3 # IRF3
start4, end4 = match4 # IRF7
start5, end5 = match5 # IRF7
start6, end6 = match6 # IRF7
start7, end7 = match7 # IRF9

In [None]:
# threshold 3.0
print(ifnb1[start1:end1]) # IFNB1
print(ifnb1[start2:end2]) # IFNB1
print(ifnb1[start3:end3]) # IFNB1
print(ifnb1[start4:end4]) # IFNB1
print(ifnb1[start5:end5]) # IFNB1
print(ifnb1[start6:end6]) # IFNB1
print(ifit1[start7:end7]) # IFIT1

GAGAATGAAACCCATAG
CATAGGAAAACTGAAAG
AAAACTGAAAGGGAGAA
ATAAAACAAAATA
ACAAAATAAATGT
CGAAATTACAAAT
ATAGAAACTGAAAAT


## Threshold 5.0

In [54]:
match1_t5 = [2903, 2920] # IRF3 in IFNB1
match2_t5 = [1149, 1162] # IRF7 in IFNB1
match3_t5 = [2585, 2602] # IRF3 in IFIT1
match4_t5 = [2691, 2708] # IRF7 in IFIT1
match5_t5 = [1281,1294] # IRF7 in IFIT1
match6_t5 = [2694,2709] # IRF9 in IFIT1
match7_t5 = [2738,2751] # IRF7 in TLR7
match8_t5 = [2777,2790] # IRF7 in TLR7

In [None]:
t5_start1, t5_end1 = match1_t5 # IRF3 in IFNB1
t5_start2, t5_end2 = match2_t5 # IRF7 in IFNB1
t5_start3, t5_end3 = match3_t5 # IRF3 in IFIT1
t5_start4, t5_end4 = match4_t5 # IRF7 in IFIT1
t5_start5, t5_end5 = match5_t5 # IRF7 in IFIT1
t5_start6, t5_end6 = match6_t5 # IRF9 in IFIT1
t5_start7, t5_end7 = match7_t5 # IRF7 in TLR7
t5_start8, t5_end8 = match8_t5 # IRF7 in TLR7

In [None]:
print(ifnb1[t5_start1:t5_end1]) # IFNB1 - IRF3
print(ifnb1[t5_start2:t5_end2]) # IFNB1 - IRF7
print(ifit1[t5_start3:t5_end3]) # IFIT1 - IRF3
print(ifit1[t5_start4:t5_end4]) # IFIT1 - IRF7
print(ifit1[t5_start5:t5_end5]) # IFIT1 - IRF7
print(ifit1[t5_start6:t5_end6]) # IFIT1 - IRF9
print(TLR7[t5_start7: t5_end7]) # TLR7 - IRF7
print(TLR7[t5_start8: t5_end8]) # TLR7 - IRF7

CATAGGAAAACTGAAAG
CGAAATTACAAAT
CAAGCTGAAACCAAAAG
AATATAGAAACTGAAAA
GCAAATTGAAAAC
ATAGAAACTGAAAAT
CAAAAATGTAAAT
TAAAAACGAAAGA


## Threshold 7.0

In [60]:
match1_t7 = [2903,2920] # IRF3 - IFNB1
match2_t7 = [2888,2901] # IRF7 - IFNB1
match3_t7 = [2691,2708] # IRF3 - IFIT1
match4_t7 = [1574,1587] # IRF7 - IFIT1
match5_t7 = [2590,2603] # IRF7 - IFIT1
match6_t7 = [2696,2709] # IRF7 - IFIT1
match7_t7 = [2694,2709] # IRF9 - IFIT1
match8_t7 = [2738,2751] # IRF7 - TLR7
match9_t7 = [2777,2790] # IRF7 - TLR7

In [61]:
t7_start1, t7_end1 = match1_t7
t7_start2, t7_end2 = match2_t7
t7_start3, t7_end3 = match3_t7
t7_start4, t7_end4 = match4_t7
t7_start5, t7_end5 = match5_t7
t7_start6, t7_end6 = match6_t7
t7_start7, t7_end7 = match7_t7
t7_start8, t7_end8 = match8_t7
t7_start9, t7_end9 = match9_t7

In [63]:
print(ifnb1[t7_start1:t7_end1]) # IRF3 - IFNB1
print(ifnb1[t7_start2:t7_end2]) # IRF7 - IFNB1
print(ifit1[t7_start3:t7_end3]) # IRF3 - IFIT1
print(ifit1[t7_start4:t7_end4]) # IRF7 - IFIT1
print(ifit1[t7_start5:t7_end5]) # IRF7 - IFIT1
print(ifit1[t7_start6:t7_end6]) # IRF7 - IFIT1
print(ifit1[t7_start7:t7_end7]) # IRF9 - IFIT1
print(TLR7[t7_start8:t7_end8]) # IRF7 - TLR7
print(TLR7[t7_start9:t7_end9]) # IRF7 - TLR7

CATAGGAAAACTGAAAG
CTAAAATGTAAAT
AATATAGAAACTGAAAA
CCAAAAGGAAAAT
TGAAACCAAAAGT
AGAAACTGAAAAT
ATAGAAACTGAAAAT
CAAAAATGTAAAT
TAAAAACGAAAGA


## Threshold 10.0

In [66]:
match1_t10 = [2590,2603] # IRF7 - IFIT1
match2_t10 = [2696,2709] # IRF7 - IFIT1

In [67]:
t10_start1, t10_end1 = match1_t10
t10_start2, t10_end2 = match2_t10


In [68]:
print(ifit1[t10_start1:t10_end1])
print(ifit1[t10_start2:t10_end2])

TGAAACCAAAAGT
AGAAACTGAAAAT
