### All Imports

In [1]:
from General.args import *
from General.utils import *
from PD_var_ILP.run_var_ilp import *
from PD_mul_ILP.run_mul_ilp import *
from PD_mul_greedy.run_mul_greedy import *
from PD_single_LPath.run_pd_single_LPath import *
import sys

### Set up gurobi optimization library


First, create a file called gurobi.json with the details of your gurobi license in the main directory. The file should be in the following format:

```json
{
  "WLSACCESSID": "XXXXX",
  "WLSSECRET": "XXXXX",
  "LICENSEID": 12345
}
```


### Global program cmd arguments

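Parse the program arguments, passing only the input (`--file_path`) and output (`--output`) locations; every other argument keeps its default value.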

In [2]:
# Emulate a command-line invocation before calling get_args() (which
# presumably parses sys.argv): keep the program name in slot 0 and append
# the tutorial's input and output locations.
sys.argv = [sys.argv[0]] + [
    "--file_path", "test_inputs",
    "--output", "Test_outputs",
]

# Parsed arguments object that is handed to every run_* call in this notebook.
args = get_args()


### Define global variables for upstream and downstream regions

In [3]:

# Define global upstream and downstream regions.
# These two fixed nucleotide sequences flank every mutreg region in this
# notebook: each full sequence is built as
# UPSTREAM_NT + <mutreg region> + DOWNSTREAM_NT.
UPSTREAM_NT = 'GCTAGTGGTGCTAGCCCCGCGAAATTAATACGACTCACTATAGGGTCTAGAAATAATTTTGTTTAACTTTAAGAAGGAGATATACAT'
DOWNSTREAM_NT = 'GGAGGGTCTGGGGGAGGAGGCAGTGGCATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCG'


### Run PD-Single-LPath

#### Define sequence
First, we define our mutreg region sequence. As a small test dataset, we take the first 100 nucleotides of the coding sequence of the human CXDAR protein gene.

In [4]:
# Protein label and its mutreg region (nucleotide sequence) used as the
# single-protein test input.
protein_name = 'CXDAR'
mutreg_nt = (
    'ATGGCGCTCCTGCTGTGCTTCGTGCTCCTGTGCGGAGTAGTGGATTTCGCCAGAAGTTTGAGTATCACTACTCCTGAAGAGATGATTGAAAAAGCCAAAG'
)

# Full sequence = shared upstream flank + mutreg region + shared downstream flank
full_sequence = "".join((UPSTREAM_NT, mutreg_nt, DOWNSTREAM_NT))

#### Run longest path algorithm

We will run PD-Single-LPath, which runs a longest-path algorithm on the primer graph (a DAG) to find the most efficient primers.

In [5]:
# Run PD-Single-LPath on the single test protein.
# Returns a results DataFrame and the selected primers; per the recorded
# output it also saves a CSV summary and a JSON file under the output folder.
run_df, primer_paths = run_longest_path(
    full_sequence,
    mutreg_nt,
    protein_name,
    args,
)

print("Results DataFrame:")
# Bare last expression so the notebook renders the DataFrame as a table.
run_df

Creating primer df
Creating graph
Saved CSV summary to: Test_outputs/PD_single_LPath_results.csv
Saved path details to: Test_outputs/PD_single_LPath_selected_primers.json
Results DataFrame:


Unnamed: 0,protein_name,graph_nodes,graph_edges,graph_time_sec,longest_path_efficiency,total_time_sec,num_primers
0,CXDAR,2550,160566,1.835,1.511491,6.713,4


#### Selected primer path and primer efficiency

Print all selected primers in longest path and their efficiency

In [6]:
# List each selected primer on its own line, in the order returned by
# run_longest_path.
print("Primer Paths:")
for primer in primer_paths:
    print(primer)

# The recorded run_df has a single row, so the column product equals that
# row's longest-path efficiency.
print("Primer effiency")
path_efficiency = run_df['longest_path_efficiency'].prod()
print(path_efficiency)

Primer Paths:
(-80, -60, 'f')
(96, 115, 'r')
(70, 96, 'f')
(244, 265, 'r')
Primer effiency
1.5114909044453393


### Run PD-var-ILP

We will run PD-var-ILP on multiple variants of the same protein.

For this part, we will use the same sequence of the CXDAR protein as PD-single-LPath.
We will use the default number of variants, which is 3, and the default maximum overlap length, which is 6.

In [None]:
# Run PD-var-ILP on the same CXDAR input used for PD-Single-LPath.
# Three values come back: a results DataFrame, the primers chosen by the ILP,
# and the primers chosen by the greedy baseline (both iterated per variant in
# the comparison cell below).
results_df, ilp_path, greedy_path = run_var_ilp(
    full_sequence,
    mutreg_nt,
    protein_name,
    args,
)

print("Variable ILP Results DataFrame:")
# Bare last expression so the notebook renders the DataFrame as a table.
results_df

Creating primer df
Creating graph
Running greedy algorithm
Running ILP
Set parameter WLSAccessID
Set parameter WLSSecret
Set parameter LicenseID to value 2499075
Academic license 2499075 - for non-commercial use only - registered to jo___@live.biu.ac.il
Number of Constraints: 5520
Average Vars Per Constraint 80.94384057971014
Graph edges:  160566
Finished ILP Variable Creations
0
3
7
11
15
19
23
27
31
35


#### Compare PD-Var-ILP with greedy baseline

We will compare the primers found by the ILP and by the greedy approach and their total efficiencies.

In [8]:
def _print_variant_paths(title, variant_paths):
    """Print a title, then each variant's primer path under a 1-based 'Variant N' label."""
    print(title)

    for variant_number, path in enumerate(variant_paths, start=1):
        print("Variant", variant_number)
        print(path)


# ILP selection and its total efficiency (product over the results column)
_print_variant_paths("ILP chosen primers:", ilp_path)
print("ILP primer efficiency: ", results_df['ILP_primer_efficiency'].prod())


# Greedy baseline selection and its total efficiency
_print_variant_paths("Greedy chosen primers:", greedy_path)
print("Greedy primer efficiency: ", results_df['greedy_primer_efficiency'].prod())

ILP chosen primers:
Variant 1
["(-86, -68, 'f')", "(87, 109, 'r')", "(59, 85, 'f')", "(244, 263, 'r')"]
Variant 2
["(-74, -54, 'f')", "(106, 124, 'r')"]
Variant 3
["(-58, -34, 'f')", "(123, 141, 'r')"]
ILP primer efficiency:  2.4237929216219447
Greedy chosen primers:
Variant 1
[(-80, -60, 'f'), (96, 115, 'r'), (70, 96, 'f'), (244, 265, 'r')]
Variant 2
[(-58, -34, 'f'), (123, 141, 'r')]
Variant 3
[(-39, -9, 'f'), (136, 156, 'r')]
Greedy primer efficiency:  1.514172485959763


### Running PD-mul-ILP

We will run PD-mul-ILP on multiple non-homologous proteins. For this part, we will add a second partial 100-nt sequence, taken from the SHP2 protein.

In [9]:
# Two test proteins; CXDAR reuses the same mutreg region as the
# single-protein sections above.
protein_name1, protein_name2 = 'CXDAR', 'SHP2'
mutreg_nt1 = 'ATGGCGCTCCTGCTGTGCTTCGTGCTCCTGTGCGGAGTAGTGGATTTCGCCAGAAGTTTGAGTATCACTACTCCTGAAGAGATGATTGAAAAAGCCAAAG'
mutreg_nt2 = 'ATGACATCGCGGAGATGGTTTCACCCAAATATCACTGGTGTGGAGGCAGAAAACCTACTGTTGACAAGAGGAGTTGATGGCAGTTTTTTGGCAAGGCCTA'

## Add upstream and downstream sequences to each mutreg region
full_sequence1, full_sequence2 = (
    UPSTREAM_NT + region + DOWNSTREAM_NT
    for region in (mutreg_nt1, mutreg_nt2)
)

## create protein dataset
# Parallel lists: position i refers to the same protein in all three.
protein_names = [protein_name1, protein_name2]
mutreg_regions = [mutreg_nt1, mutreg_nt2]
sequences_nt = [full_sequence1, full_sequence2]


#### Run PD-mul-ILP on 2 non-homologous protein parts
PD-mul-ILP uses an integer linear program with forbidden pair constraints on all cross-hybridizing pairs.

In [10]:

# Run PD-mul-ILP on the two-protein dataset.
# Returns a results DataFrame and the chosen primers keyed by protein
# (the cell that prints them iterates with .items()).
# NOTE(review): mutreg regions are passed before the full sequences here,
# whereas the other run_* calls in this notebook pass the sequence(s) first;
# confirm this matches run_mul_ilp's signature.
run_df, primer_paths = run_mul_ilp(
    mutreg_regions,
    sequences_nt,
    protein_names,
    args,
)

print("PD-mul_ILP Results DataFrame:")
# Bare last expression so the notebook renders the DataFrame as a table.
run_df

[STEP] Creating graphs for 2 proteins...
Creating Graph for protein:  0 Protein name:  CXDAR
Creating primer df
Creating graph
Creating Graph for protein:  1 Protein name:  SHP2
Creating primer df
Creating graph
[DONE] Graphs created in 20.00 sec (peak 136.5 MB).
[STEP] Finding forbidden pairs across proteins...
Finding intra-protein forbidden pairs constraints
[1/2] Processing CXDAR...
[2/2] Processing SHP2...
Number of intra-protein forbidden pairs constraints:  0
Finding inter-protein forbidden pairs for 1 combinations...
[1/1] Fractions of pairs considered
Number of inter-protein forbidden pairs constraints:  0
[DONE] Forbidden pairs in 24.85 sec (intra=0, inter=0).
[STEP] Running ILP model...
Creating ILP model...
Set parameter WLSAccessID
Set parameter WLSSecret
Set parameter LicenseID to value 2499075
Academic license 2499075 - for non-commercial use only - registered to jo___@live.biu.ac.il
Adding constraints to ILP model...
Gurobi Optimizer version 12.0.3 build v12.0.3rc0 (lin

Unnamed: 0,num_proteins,graph_time_sec,ilp_num_vars,ilp_num_constraints,ilp_intra_forbidden_cnt,ilp_inter_forbidden_cnt,forbidden_time_sec,ilp_setup_time_sec,ilp_optimize_time_sec,ilp_feasibility,total_primer_efficiency,num_primers
0,2,20,321132,5100,0,0,24.847219,13.202425,1.366277,FEASIBLE,3.375891,8


#### Primer selection and total efficiency

We will look at the primer paths found for each protein by PD-mul-ILP, and the total efficiency of the selected primers.

In [11]:
# primer_paths is a mapping here: protein name -> primers chosen for it.
print("Chosen primers:")
for protein_label, protein_primers in primer_paths.items():
    print("Protein:", protein_label)
    print(protein_primers)

# Product over the results column; header and value go on separate lines.
total_efficiency = run_df['total_primer_efficiency'].prod()
print("Total primer efficiency:", total_efficiency, sep="\n")

Chosen primers:
Protein: CXDAR
["(-80, -60, 'f')", "(96, 115, 'r')", "(70, 96, 'f')", "(244, 265, 'r')"]
Protein: SHP2
["(-80, -60, 'f')", "(98, 118, 'r')", "(69, 92, 'f')", "(244, 264, 'r')"]
Total primer efficiency:
3.3758909432490496


### Running PD-mul-Greedy

We will run PD-mul-Greedy, which uses a greedy algorithm to find efficient primer paths for each protein, on multiple non-homologous proteins.
We will use the same proteins as we used for the PD-mul-ILP run. 

In [12]:

# Run PD-mul-Greedy on the same two-protein dataset used for PD-mul-ILP.
# Returns a results DataFrame and the chosen primers keyed by protein
# (the cell that prints them iterates with .items()).
# Note that run_df is rebound here and no longer holds the PD-mul-ILP results.
run_df, paths = run_mul_greedy(
    sequences_nt,
    mutreg_regions,
    protein_names,
    args,
)

print("PD-mul_greedy Results DataFrame:")
# Bare last expression so the notebook renders the DataFrame as a table.
run_df

[INFO] Processing protein 0/2: CXDAR
Creating primer df
Creating graph
Creating primer df
Creating graph
 Saved summary to: Test_outputs/summary.csv
 Saved per-protein metrics to: Test_outputs/per_protein_metrics.csv
 Saved paths to: Test_outputs/primers_per_protein.json
PD-mul_greedy Results DataFrame:


Unnamed: 0,num_proteins,total_primer_efficiency,greedy_time_sec,cross_hybridizations_cnt,proteins_with_reiterations_cnt,total_reiterations,unresolved_proteins_cnt,unresolved_proteins
0,2,3.375891,13.527,0,0,0,0,


#### Primers found for each protein and total efficiency

We will examine the total efficiency and the primers found for each of the proteins by the greedy algorithm.

In [13]:
# paths maps each protein name to the primers the greedy algorithm chose for it.
for protein_label, protein_primers in paths.items():
    print("Protein:", protein_label)
    print(protein_primers)

# Product over the results column; header and value go on separate lines.
total_efficiency = run_df['total_primer_efficiency'].prod()
print("Total primer efficiency:", total_efficiency, sep="\n")


Protein: CXDAR
[(-80, -60, 'f'), (96, 115, 'r'), (70, 96, 'f'), (244, 265, 'r')]
Protein: SHP2
[(-80, -60, 'f'), (98, 118, 'r'), (69, 92, 'f'), (244, 264, 'r')]
Total primer efficiency:
3.3758909432490496
