This notebook has the following dependencies. Make sure that these are installed in your environment when you launch this notebook.
+ pandas
+ biopython
+ numpy
+ openpyxl

For more information refer to the [website](https://gitlab.com/NCDRlab/easy_hcr)

In [4]:
import csv
import os

import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from insitu_probe_generator.maker37cb import maker
from insitu_probe_generator.start import start

In [2]:
# Set the quality control parameters for the probeset
# remove_gc: when True, a later cell drops every probe pair flagged in the
#   GC_QC column; set it to False to keep those pairs.
# probepair_number_cutoff: upper bound on the number of probe pairs kept
#   (applied as a row slice in the cut-off cell further down).
# blast_on_genome: NOTE(review): not referenced by any other cell visible in
#   this notebook -- confirm whether it is still needed.

remove_gc = True
probepair_number_cutoff = 33
blast_on_genome = True

In [None]:
# Run the probe generator

# start() returns an indexable result; entries 0-13 are unpacked, in order,
# into the settings that maker() takes as positional arguments.
strt = start()
(
    name, fullseq, amplifier, pause, choose, polyAT, polyCG,
    BlastProbes, db, dropout, show, report, maxprobe, numbr,
) = [strt[position] for position in range(14)]
string_output = maker(
    name, fullseq, amplifier, pause, choose, polyAT, polyCG,
    BlastProbes, db, dropout, show, report, maxprobe, numbr,
)

In [None]:
# Get and re-use the parameters specified for the probe generator

# `amplifier` is already bound by the probe-generator cell, so only the gene
# name needs an alias here.
gene_name = name

print("Gene: ", gene_name)
print("Amplifier: ", amplifier)

In [None]:
# Save the generated probes as CSV file

csv_path = f"output/{name}_probes.csv"

# Create the output folder if it is missing so the write below cannot fail
# with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(csv_path), exist_ok=True)

# The context manager closes the file even if the write raises.
with open(csv_path, "w") as probe_file:
    probe_file.write(string_output)

# Read the csv back in as table (first column becomes the index)

df = pd.read_csv(
    csv_path,
    index_col=0,
)
print("Gene: ", gene_name)
print("Amplifier: ", amplifier)
print("Probes:")
df.head()

In [None]:
# Generate probe identifiers of the form <gene>_<amplifier>_PP_<n>, with n
# counting the probe pairs from 1. The order-form cell relies on the last two
# "_"-separated fields being "PP" and the number.

probe_identifiers = [
    f"{gene_name}_{amplifier}_PP_{pair_number}"
    for pair_number in range(1, len(df) + 1)
]
df.index = probe_identifiers
df.head()

In [None]:
# Flag GC-ending probe pairs for later:
# a pair is flagged when the first probe ends in GC/CG or the second probe
# starts with GC/CG.

first_probe_hit = df["Probe"].str.endswith("GC") | df["Probe"].str.endswith("CG")
second_probe_hit = df["Probe.1"].str.startswith("GC") | df["Probe.1"].str.startswith("CG")
df["GC_QC"] = first_probe_hit | second_probe_hit

total_pairs = len(df)
flagged_pairs = int((df["GC_QC"] == True).sum())
print("Number of probe pairs:", total_pairs)
print(f"{flagged_pairs} GC probe pairs")
print(f"Kept {total_pairs - flagged_pairs} no-GC probe pairs")
df


In [None]:
# Generate BLAST sequences: both probes of a pair joined by an "NN" linker

first_half = df["Probe"]
second_half = df["Probe.1"]
df["blast"] = first_half + "NN" + second_half
df.head(20)

In [None]:
# Create a simple two-column dataframe (identifier, query sequence) for BLAST
# and write it as a headerless, tab-separated text file

df_blast = pd.DataFrame(
    {
        "Identifier": df.index.values,
        "Probe": df.blast.values,
    }
)
df_blast.to_csv(
    f"output/{gene_name}_probes.txt",
    sep='\t',
    index=False,
    header=False,
    quoting=csv.QUOTE_NONE,
)
df_blast.head()

In [None]:
# Wrap every BLAST query sequence in a SeqRecord so it can be written as fasta

probe_sequences = [
    SeqRecord(Seq(sequence), id=identifier)
    for sequence, identifier in zip(df_blast.Probe.values, df_blast.Identifier.values)
]

# Show the first 5 as an example
probe_sequences[:5]

In [None]:
# Save fasta file in output folder (create this output folder in the same folder as the probe generator)

fasta_path = f"output/{gene_name}_to_blast.fasta"

# SeqIO.write accepts the whole list of records, so one call writes them all.
with open(fasta_path, "w") as output_handle:
    SeqIO.write(probe_sequences, output_handle, "fasta")

In [None]:
# Load in blastn
from Bio.Blast.Applications import NcbiblastnCommandline

In [None]:
# Load query file
# fasta_string is an alias of fasta_path; the bare expression on the last line
# displays the path as the cell output.
fasta_string = fasta_path
fasta_path

'output/OvTH_to_blast.fasta'

In [None]:
# Load custom database file
# db_path is handed to blastn as its `db` argument in the set-up cells below.
db_path = "input/vulgaristranscriptome.fasta"

# Alternative database location (disabled):
#db_path = "input/blast2go/vulgaristranscriptome.fasta"
# db_string is an alias of db_path.
db_string = db_path

In [None]:
# Path of the BLAST output (XML, one file per gene name); blastn writes to it
# and the parsing cell reads it back

blast_path = f"output/{gene_name}_blast_output.xml"

In [None]:
# Set up blast with default parameters (word size needs to be specified due to short query length)
# outfmt 5 selects XML output, which the NCBIXML parser reads later on.
cline = NcbiblastnCommandline(
    query=fasta_path,
    db=db_path,
    evalue=0.05,
    out=blast_path,
    outfmt=5,
    penalty="-3",
    reward="2",
    gapopen="5",
    gapextend="2",
    word_size="11",
)

In [None]:
# Less stringent blast (higher e-value - blast2go = 0.1)
# This cell assigns `cline` again: when the cells are run top to bottom, this
# is the command that actually gets executed.
cline = NcbiblastnCommandline(
    query=fasta_path,
    db=db_path,
    evalue=0.2,
    out=blast_path,
    outfmt=5,
    word_size="11",
)

In [None]:
# Print blast command (the command line that `cline` will run)
print(cline)

blastn -out output/OvTH_blast_output.xml -outfmt 5 -query output/OvTH_to_blast.fasta -db input/vulgaristranscriptome.fasta -evalue 0.2 -word_size 11


In [None]:
# Run blast
# Calling the command object executes it; the two returned values are kept as
# stdout and stderr and printed in the following cells.
stdout, stderr = cline()

In [None]:
# Check stdout (the first value returned by the blast run)
print(stdout)




In [None]:
# Check stderr (the second value returned by the blast run)
print(stderr)




In [None]:
# Open the BLAST XML output for parsing; the handle is closed in the next cell
result_handle = open(blast_path)

In [None]:
# Parse every BLAST record into a list. Using the open handle as a context
# manager closes it on exit, even if parsing raises.
with result_handle:
    blast_records = list(NCBIXML.parse(result_handle))

In [None]:
# Generate a table of blast results grouped per probe pair # you can check the transcript ids here

# One (probe id, hit name) pair per alignment: the probe id is the query text
# before the first space, the hit name is the last "|"-separated field of the
# alignment title.
hit_pairs = [
    (record.query.split(" ")[0], alignment.title.split("|")[-1])
    for record in blast_records
    for alignment in record.alignments
]
queries = [probe_id for probe_id, _ in hit_pairs]
names = [hit_name for _, hit_name in hit_pairs]

df_blast_results = pd.DataFrame(queries, columns=["Probe"])
df_blast_results["Name"] = names
df_blast_results

In [None]:
# Analyse blast results - shows the number of hits per probe pair

# Name the counts column explicitly. pandas >= 2.0 labels the value_counts()
# result "count" instead of "Probe", so renaming a "Probe" column afterwards
# would leave `n_hits` undefined and break the threshold cell.
df_hits = df_blast_results["Probe"].value_counts().rename("n_hits").to_frame()

# Keep only the identifier part (text before the first space) as the index.
# A plain comprehension also works when there are no hits at all.
df_hits.index = [str(probe_id).split(" ")[0] for probe_id in df_hits.index]

df_hits

In [None]:
# Removal of all probes that have more hits than the number of transcripts
# Probes with more than `hit_threshold` hits are excluded (currently: more
# than 2, i.e. 3 hits or more)

hit_threshold = 2

too_many_hits = (df_hits["n_hits"] > hit_threshold).values
to_exclude = df_hits.index[too_many_hits].tolist()

print("Probes to remove:")
print(to_exclude)

In [None]:
# If you want to manually remove additional probes, paste their identifiers
# between the brackets as quoted, comma-separated strings
# (identifiers look like <gene>_<amplifier>_PP_<n>).
# This cell builds the full list of probes to remove: the BLAST-based
# exclusions in `to_exclude` followed by the manual ones.

manual_exclude = [
    
]

probes_to_remove = to_exclude + manual_exclude

probes_to_remove

In [None]:
# Remove unwanted probes: keep every row whose identifier is not listed in
# probes_to_remove

keep_mask = ~df.index.isin(probes_to_remove)
on_topic_indexes = np.flatnonzero(keep_mask).tolist()
df_qc_blast = df.iloc[on_topic_indexes]

kept_identifiers = df_qc_blast.index.unique().values
print(f"{len(on_topic_indexes)} probes kept:\n", kept_identifiers)

df_qc_blast.head()

In [None]:
# Remove probes ending in GC or CG

# The comparison with True is deliberate: only the boolean True enables the
# filter, so a string such as 'False' in the config cell leaves it disabled.
if remove_gc == True:
    orig_number = len(df_qc_blast)
    print("Number of probe pairs:", orig_number)
    # Keep the pairs that were not flagged by the GC check.
    df_qc_blast = df_qc_blast.loc[df_qc_blast.GC_QC != True]
    print(f"Removed {orig_number - len(df_qc_blast)} probe pairs")
    print(f"Preserved {len(df_qc_blast)} no-GC probe pairs")

# Only the last top-level expression of a cell is displayed; inside the `if`
# block the table was never shown.
df_qc_blast.head()

In [None]:
# If there are more than 50 probe pairs, keep only every other row (1st, 3rd,
# 5th, ...); afterwards cut off at probepair_number_cutoff

n_probes = len(df_qc_blast)
print("Total number of probes: ", n_probes)

every_other_row = df_qc_blast.iloc[::2]
if len(every_other_row) > 25:
    df_qc_blast = every_other_row
    print(f"Kept odd rows ({len(df_qc_blast)} probes)")

df_qc_blast = df_qc_blast.iloc[:probepair_number_cutoff]

# n_probes is reused by the order-form cell to label the pool.
n_probes = len(df_qc_blast)
new_length = n_probes
print("Number of kept probes: ", n_probes)

In [None]:
# Generate IDT order form: two oligos per probe pair, all sharing one pool name

# Collect the oligos as plain records and build the frame once.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# is quadratic anyway.
idt_records = []
# The loop variable is deliberately not called `name`: that would overwrite
# the gene name set by the probe-generator cell and used for the CSV path.
for probe_id, row in df_qc_blast.iterrows():
    # Identifiers look like <gene>_<amplifier>_PP_<n>; drop the trailing "PP"
    # and number, then append the number of kept probe pairs.
    pool_name = "_".join(probe_id.split("_")[:-2]) + f"_{n_probes}PP"
    idt_records.append({
        "Pool name": pool_name,
        "Sequence": row.Initiator + row.Spacer.upper() + row.Probe,
    })
    idt_records.append({
        "Pool name": pool_name,
        "Sequence": row["Probe.1"] + row["Spacer.1"].upper() + row["Initiator.1"],
    })

# Passing `columns` keeps both headers even when no probe pair is left.
df_idt = pd.DataFrame(idt_records, columns=["Pool name", "Sequence"])
print(f"Generated an order of {len(df_qc_blast)} probe pairs ({len(df_idt)} oligos)")
df_idt

In [None]:
# Export order form
# Writes df_idt to an Excel workbook in the output folder; index=False leaves
# the row index out of the sheet.

idt_path = f"output/{gene_name}_idt_order.xlsx"

df_idt.to_excel(idt_path, index=False)