### All Imports

In [1]:
from General.args import *
from General.utils import *
from PD_var_ILP.run_var_ilp import *
from PD_mul_ILP.run_mul_ilp import *
from PD_mul_greedy.run_mul_greedy import *
from PD_single_LPath.run_pd_single_LPath import *
import sys

### Set up gurobi optimization library


First, create a file called `gurobi.json` in the main directory containing the details of your Gurobi license. The file should have the following format:

```json
{
  "WLSACCESSID": "XXXXX",
  "WLSSECRET": "XXXXX",
  "LICENSEID": 12345
}
```


### Global program cmd arguments

Set arguments to their default values

In [2]:
# Emulate a command-line invocation: keep the program name and append the
# option/value pairs for this notebook run (presumably get_args() reads
# them from sys.argv -- confirm in General.args).
sys.argv = [sys.argv[0]] + [
    "--file_path", "test_inputs",
    "--output", "Test_outputs",
]

# Build the shared argument object that is passed to every run below.
args = get_args()


### Define global variables for upstream and downstream regions

In [None]:

# Flanking sequences shared by the examples in this notebook: UPSTREAM_NT is
# placed before, and DOWNSTREAM_NT after, each mutreg region.
# The literals are split across lines purely for readability; adjacent string
# literals are concatenated, so the values are unchanged.
UPSTREAM_NT = (
    'GCTAGTGGTGCTAGCCCCGCGAAATTAATACGACTCACTATAGGG'
    'TCTAGAAATAATTTTGTTTAACTTTAAGAAGGAGATATACAT'
)
DOWNSTREAM_NT = (
    'GGAGGGTCTGGGGGAGGAGGCAGTGGC'
    'ATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAAC'
    'GGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCG'
)


### Run PD-Single-LPath

#### Define sequence
First, we define our mutreg region sequence. We take the first 100 nucleotides of the coding sequence of the CXDAR human protein gene as a small test dataset.

In [3]:
# Example target: the protein's name and the mutreg region to design primers for.
protein_name = "CXDAR"
mutreg_nt = "ATGGCGCTCCTGCTGTGCTTCGTGCTCCTGTGCGGAGTAGTGGATTTCGCCAGAAGTTTGAGTATCACTACTCCTGAAGAGATGATTGAAAAAGCCAAAG"
# Flank the mutreg region with the shared upstream and downstream sequences.
full_sequence = "".join((UPSTREAM_NT, mutreg_nt, DOWNSTREAM_NT))

#### Run longest path algorithm

We will run PD-Single-LPath, which runs a longest-path algorithm on the primer graph (a DAG) to find the most efficient primers.

In [None]:
# Run PD-Single-LPath on the flanked sequence; the call yields a results
# table and the selected primer paths.
run_df, primer_paths = run_longest_path(
    full_sequence,
    mutreg_nt,
    protein_name,
    args,
)

print("Results DataFrame:")
run_df  # bare last expression: rendered as the cell's output in a notebook

#### Selected primer path and primer efficiency

Print all selected primers in the longest path and their efficiency.

In [None]:
# List every primer path selected by the longest-path run, one per line.
print("Primer Paths:")
for selected_path in primer_paths:
    print(selected_path)

# Show the efficiency column of the results table.
print("Primer effiency")
run_df['longest_path_efficiency']  # bare last expression: shown as the cell's output

### Run PD-var-ILP

We will run PD-var-ILP on multiple variants of the same protein.

For this part, we will use the same sequence of the CXDAR protein as PD-single-LPath.
We will use the default number of variants, which is 3, and the default maximum overlap length, which is 6.

In [None]:
# Run PD-var-ILP on the same flanked sequence; the call yields a results
# table plus the primer paths chosen by the ILP and by the greedy baseline.
results_df, ilp_path, greedy_path = run_var_ilp(
    full_sequence,
    mutreg_nt,
    protein_name,
    args,
)
print("Variable ILP Results DataFrame:")
results_df  # bare last expression: rendered as the cell's output in a notebook

#### Compare PD-Var-ILP with greedy baseline

We will compare the primers found by the ILP and by the greedy approach and their total efficiencies.

In [None]:
# One row per method: heading, per-variant primer paths, efficiency label and
# the results_df column holding that method's efficiency.
method_reports = (
    ("ILP chosen primers:", ilp_path,
     "ILP primer efficiency: ", 'ILP_primer_efficiency'),
    ("Greedy chosen primers:", greedy_path,
     "Greedy primer efficiency: ", 'greedy_primer_efficiency'),
)

# Print the same report for each method, ILP first and greedy second.
for heading, variant_paths, efficiency_label, efficiency_column in method_reports:
    print(heading)
    # Variants are reported with 1-based numbering.
    for variant_number, variant_primers in enumerate(variant_paths, start=1):
        print("Variant", variant_number)
        print(variant_primers)
    print(efficiency_label, results_df[efficiency_column])

### Running PD-mul-ILP

We will run PD-mul-ILP on multiple non-homologous proteins. For this part, we will add a second partial 100-nt sequence, taken from the SHP2 protein.

In [8]:
# Names and mutreg regions of the two example proteins.
protein_name1, protein_name2 = "CXDAR", "SHP2"
mutreg_nt1 = "ATGGCGCTCCTGCTGTGCTTCGTGCTCCTGTGCGGAGTAGTGGATTTCGCCAGAAGTTTGAGTATCACTACTCCTGAAGAGATGATTGAAAAAGCCAAAG"
mutreg_nt2 = "ATGACATCGCGGAGATGGTTTCACCCAAATATCACTGGTGTGGAGGCAGAAAACCTACTGTTGACAAGAGGAGTTGATGGCAGTTTTTTGGCAAGGCCTA"

## create protein dataset
protein_names = [protein_name1, protein_name2]
mutreg_regions = [mutreg_nt1, mutreg_nt2]
# Flank every mutreg region with the shared upstream and downstream sequences.
sequences_nt = [UPSTREAM_NT + region + DOWNSTREAM_NT for region in mutreg_regions]
# Keep the per-protein names available as well.
full_sequence1, full_sequence2 = sequences_nt



#### Run PD-mul-ILP on 2 non-homologous protein parts
PD-mul-ILP uses an integer linear program with forbidden pair constraints on all cross-hybridizing pairs.

In [None]:

# Run PD-mul-ILP on the protein dataset; the call yields a results table and
# the primer paths chosen per protein.
# NOTE(review): the first two arguments here are (mutreg regions, full
# sequences), while the run_mul_greedy call in this notebook passes
# (full sequences, mutreg regions) -- confirm against run_mul_ilp's signature.
run_df, primer_paths = run_mul_ilp(
    mutreg_regions,
    sequences_nt,
    protein_names,
    args,
)

print("PD-mul_ILP Results DataFrame:")
run_df  # bare last expression: rendered as the cell's output in a notebook

#### Primer selection and total efficiency

We will look at the primer paths found for each protein by PD-mul-ILP, and the total efficiency of the selected primers.

In [None]:
print("Chosen primers:")

# primer_paths maps each protein to its chosen primer path.
for protein_label, chosen_path in primer_paths.items():
    print("Protein:", protein_label)
    print(chosen_path)

# Multiply the entries of the efficiency column into one overall value.
total_efficiency = run_df['total_primer_efficiency'].prod()
# Label and value go on separate lines.
print("Total primer efficiency:", total_efficiency, sep="\n")

### Running PD-mul-Greedy

We will run PD-mul-Greedy, which uses a greedy algorithm to find efficient primer paths for each protein, on multiple non-homologous proteins.
We will use the same proteins as we used for the PD-mul-ILP run.

In [None]:

# Run PD-mul-Greedy on the same protein dataset; the call yields a results
# table and the primer paths found per protein.
# NOTE(review): the first two arguments here are (full sequences, mutreg
# regions), the reverse of the run_mul_ilp call in this notebook -- confirm
# against run_mul_greedy's signature.
run_df, paths = run_mul_greedy(
    sequences_nt,
    mutreg_regions,
    protein_names,
    args,
)

print("PD-mul_greedy Results DataFrame:")
run_df  # bare last expression: rendered as the cell's output in a notebook

#### Primers found for each protein and total efficiency

We will examine the total efficiency and the primers found for each of the proteins by the greedy algorithm

In [None]:
# paths maps each protein to the primer path found by the greedy run.
for protein_label, greedy_primers in paths.items():
    print("Protein:", protein_label)
    print(greedy_primers)

# Multiply the entries of the efficiency column into one overall value.
total_efficiency = run_df['total_primer_efficiency'].prod()
# Label and value go on separate lines.
print("Total primer efficiency:", total_efficiency, sep="\n")
