<a href="https://colab.research.google.com/github/OrensteinLab/PrimerDesigner/blob/main/Mutagenesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Notes:**
1. Use a VM with at least 150 GB of RAM.
2. Enter your own Gurobi (gurobipy) license credentials in the "Creating Gurobipy Solver" section before running.

# Imports


In [None]:
!pip install gurobipy  # install gurobipy, if not already installed



In [None]:
!pip install -U bokeh seaborn pandas
!pip install git+https://github.com/FordyceLab/seequence.git#egg=seequence
!pip install primer3-py biopython pandarallel

Collecting seequence
  Cloning https://github.com/FordyceLab/seequence.git to /tmp/pip-install-gcfi6ujf/seequence_89cece5317984d70a92b8b2a85a2abee
  Running command git clone --filter=blob:none --quiet https://github.com/FordyceLab/seequence.git /tmp/pip-install-gcfi6ujf/seequence_89cece5317984d70a92b8b2a85a2abee
  Resolved https://github.com/FordyceLab/seequence.git to commit 3ea730537fcf5b7ef807ebf6e057f5bf4e875bb9
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
# IMPORTS
import time

import random as rand

from seequence.view import *
from seequence.color import *

from pandarallel import pandarallel as pl
pl.initialize()

import primer3 as p3
from Bio.Seq import Seq
from Bio.SeqUtils import GC, seq1, seq3
from Bio.SeqUtils.CodonUsage import SharpEcoliIndex, SynonymousCodons

import itertools as it
import numpy as np
import pandas as pd
pd.set_option('display.precision', 1)

import matplotlib.pyplot as plt
import seaborn as sns
from bokeh.models.annotations import Span
from bokeh.models import Select
from bokeh.layouts import column
from IPython.display import display, clear_output

import networkx as nx
import time
import gurobipy as gp

def revcomp(seq):
  """Return the reverse complement of the nucleotide string `seq` as a plain `str`."""
  template = Seq(seq)
  return str(template.reverse_complement())
def translate(seq):
  """Return the translation of the nucleotide string `seq` as a plain `str`."""
  coding = Seq(seq)
  return str(coding.translate())

# clear_output()
print('Ready')

INFO: Pandarallel will run on 1 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Ready


# Setup

## Codon Chart

In [None]:
# One row per sense codon: (1-letter aa, 3-letter aa, codon, SharpEcoliIndex value).
# The 'STOP' entry of SynonymousCodons is skipped.
lst = [
  (seq1(aa3), aa3, codon, SharpEcoliIndex[codon])
  for aa3, codons in SynonymousCodons.items()
  if aa3 != 'STOP'
  for codon in codons
]
# Sort by amino acid, then by descending index value within each amino acid.
codon_df = (
  pd.DataFrame(lst, columns=['c1','c3','codon','freq'])
  .sort_values(['c1','freq'], ascending=[True,False])
)
codon_df

Unnamed: 0,c1,c3,codon,freq
30,A,ALA,GCT,1.0e+00
27,A,ALA,GCA,5.9e-01
29,A,ALA,GCG,4.2e-01
28,A,ALA,GCC,1.2e-01
1,C,CYS,TGC,1.0e+00
...,...,...,...,...
55,V,VAL,GTG,2.2e-01
54,V,VAL,GTC,6.6e-02
52,W,TRP,TGG,1.0e+00
60,Y,TYR,TAC,1.0e+00


## Creating Sequence

In [None]:
upstream_nt = 'ATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCTAGTGGTGCTAGCCCCGCGAAATTAATACGACTCACTATAGGGTCTAGAAATAATTTTGTTTAACTTTAAGAAGGAGATATACATATG'
mutreg_nt = 'CAAAGCCCAGCACCTGCCGCAGCGCCTGCCCCTGCGGCACGTTCCATCGCAGCTACGCCTCCTAAACTGATCGTGGCAATTAGCGTGGACCAGTTTAGTGCAGACTTGTTCTCGGAGTATCGTCAATATTACACCGGAGGTTTAAAGCGTCTTACATCCGAAGGAGCTGTGTTCCCACGTGGTTATCAGAGTCATGCGGCAACAGAAACGTGTCCTGGTCACTCAACGATCCTGACAGGATCACGTCCGTCACGTACGGGTATTATCGCTAATAACTGGTTCGACTTGGACGCAAAGCGTGAGGATAAAAATCTGTACTGTGCTGAGGATGAATCCCAACCCGGTAGTTCGTCTGACAAGTACGAAGCTTCGCCACTGCACTTAAAGGTACCCACCCTGGGGGGACGCATGAAAGCCGCCAATCCTGCGACTCGTGTCGTCTCTGTTGCCGGCAAGGATCGCGCGGCCATTATGATGGGTGGCGCCACAGCGGATCAGGTCTGGTGGTTAGGGGGGCCTCAGGGGTATGTTTCGTATAAGGGTGTAGCGCCAACTCCCCTTGTAACACAGGTCAATCAGGCCTTTGCACAGCGCTTAGCTCAGCCGAACCCGGGATTTGAGTTGCCTGCTCAGTGCGTCAGCAAGGACTTTCCTGTTCAAGCGGGAAATCGCACAGTGGGTACCGGCCGCTTCGCCCGTGATGCTGGTGACTACAAAGGTTTTCGCATTTCCCCGGAGCAGGATGCTATGACGCTTGCATTCGCTGCCGCGGCCATTGAAAATATGCAATTAGGGAAGCAGGCCCAGACCGATATTATTAGCATTGGACTGAGCGCTACGGATTACGTGGGACACACCTTCGGCACGGAGGGTACGGAGAGTTGCATCCAAGTGGATCGTTTAGACACGGAGCTTGGTGCATTCTTTGATAAACTGGATAAGGATGGGATTGACTACGTAGTAGTGCTGACTGCAGATCATGGAGGACACGATCTGCCCGAACGTCATCGTATGAATGCCATGCCGATGGAACAGCGCGTAGACATGGCCCTGACACCTAAAGCTCTGAATGCTACCATCGCTGAGAAAGCTGGCCTTCCGGGCAAAAAGGTTATTTGGTCAGATGGACCTTCTGGCGATATTTACTATGATAAGGGCCTTACAGCCGCTCAACGTGCCCGTGTTGAAACCGAGGCGTTAAAATACTTGCGCGCGCATCCCCAAGTACAGACTGTATTCACTAAGGCGGAAATCGCGGCTACCCCTTCTCCGTCGGGACCACCTGAGAGCTGGAGTTTGATCCAGGAAGCTCGCGCGTCATTTTACCCGTCGCGCTCCGGGGACCTGTTACTTTTATTGAAACCTCGTGTGATGAGCATTCCTGAGCAAGCAGTCATGGGCTCGGTTGCAACCCATGGATCTCCATGGGATACGGATCGCCGTGTGCCTATCCTGTTTTGGCGCAAAGGTATGCAGCATTTCGAACAACCCTTAGGAGTAGAGACTGTTGATATTTTGCCCTCCTTGGCTGCACTTATTAAGCTTCCTGTTCCTAAGGATCAGATCGACGGCCGCTGTCTGGACTTGGTCGCCGGCAAGGATGATTCCTGTGCTGGACAG'
#mutreg_nt = 'CAAAGCCCAGCACCTGCCGCAGCGCCTGCCCCTGCGGCACGTTCCATCGCAGCTACGCCTCCTAAACTGATCGTGGCAATTAGCGTGGACCAGTTTAGTGCAGACTTGTTC'
downstream_nt = 'GGAGGAGGGTCTGGGGGAGGAGGCAGTGGCATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACC'

# Position of the mutagenesis region inside the full construct (0-based, nucleotide units).
mutreg_start = len(upstream_nt)
mutreg_l = len(mutreg_nt)
mutreg_stop = mutreg_start + mutreg_l

# Full construct (upstream + mutagenesis region + downstream) and the protein
# obtained by translating the mutagenesis region.
sequence_nt = ''.join((upstream_nt, mutreg_nt, downstream_nt))
mutreg_aa = translate(mutreg_nt)

## Mutations to Introduce

In [None]:
# INPUT: mutations
def _scan_mutations(wt_protein, target_aa, lib_i, fallback_aa='A'):
  """Build a single-residue scan over `wt_protein`.

  Every position is mutated to `target_aa`; positions whose wildtype residue already is
  `target_aa` are mutated to `fallback_aa` instead.

  Returns a DataFrame with one row per residue and the columns
  lib_i (library #), wt_aa (wildtype), pos_aa (0-based position), mut_aa (mutation).
  """
  residues = list(wt_protein)
  return pd.DataFrame({
      'lib_i': [lib_i]*len(residues),
      'wt_aa': residues,
      'pos_aa': list(range(len(residues))),
      'mut_aa': [target_aa if aa != target_aa else fallback_aa for aa in residues],
  })

# GENERATE: (target amino acid, library #) per scan. V and P scans go to library 0, G scan to library 1.
scans = [('V', 0), ('P', 0), ('G', 1)]
# A single concat of non-empty frames keeps integer dtypes and gives every row a unique index label.
mut_df = pd.concat([_scan_mutations(mutreg_aa, aa, lib) for aa, lib in scans], ignore_index=True)

# v DO NOT EDIT v --------------------------------------------------------------
# nt (codon) info
mut_df['start_nt'] = 3*mut_df.pos_aa
mut_df['stop_nt'] = mut_df.start_nt+3
mut_df['wt_nt'] = mut_df.apply(lambda row: mutreg_nt[row.start_nt:row.stop_nt], axis=1) #finds codon between start & stop for a given row
mut_df['mut_nt'] = codon_df.set_index('c1').query('freq==1.0').loc[mut_df.mut_aa,'codon'].values #finds the codon (with freq 1) that produces the new mutated aa

mut_df = mut_df.sort_values(['lib_i','pos_aa','mut_aa']).reset_index(drop=True)
mut_df

Unnamed: 0,lib_i,wt_aa,pos_aa,mut_aa,start_nt,stop_nt,wt_nt,mut_nt
0,0,Q,0,P,0,3,CAA,CCG
1,0,Q,0,V,0,3,CAA,GTT
2,0,S,1,P,3,6,AGC,CCG
3,0,S,1,V,3,6,AGC,GTT
4,0,P,2,A,6,9,CCA,GCT
...,...,...,...,...,...,...,...,...
1615,1,S,535,G,1605,1608,TCC,GGT
1616,1,C,536,G,1608,1611,TGT,GGT
1617,1,A,537,G,1611,1614,GCT,GGT
1618,1,G,538,A,1614,1617,GGA,GCT


## Precalculate All Primers

In [None]:
# PARAM: primer lengths (inclusive)
# NOTE: the same two names are assigned again (to the same values) in the "Create Graph" section.
primer_lmin, primer_lmax = 18, 30

# PARAM: pcr conditions (temp impacts dg calc / whether structure is found)
# This one analysis object is reused for every Tm / hairpin / homodimer / heterodimer call below.
# NOTE(review): the units of the concentration arguments are not stated here — confirm against the primer3-py docs.
pcr = p3.thermoanalysis.ThermoAnalysis(dna_conc= 250,
                                       mv_conc= 50,
                                       dv_conc= 0,
                                       dntp_conc= 0,
                                       tm_method= 'santalucia',
                                       salt_correction_method= 'owczarzy',
                                       temp_c= 25)

def calcOffTarget(primer, seq, start):
  """Return the highest heterodimer Tm between `primer` and the sequence flanking it.

  `primer` must expose `.seq`, `.start` and `.stop`; `start` is the coordinate assigned to
  `seq[0]` in the primer's coordinate system. The part of `seq` left of the primer, the part
  right of it, and the reverse complements of both are each scored against `primer.seq`
  with the module-level `pcr` object; the maximum 'tm' is returned.
  """
  left_flank = seq[:primer.start-start]
  right_flank = seq[primer.stop-start:]
  targets = (left_flank, right_flank, revcomp(left_flank), revcomp(right_flank))
  # ot_dg (minimum 'dg' over the same four results, scaled by 1e-3) is intentionally not computed.
  return max(pcr.calcHeterodimer(primer.seq, target).todict()['tm'] for target in targets)

def n_subsequences(sequence, lmin, lmax):
  """Print and return the number of rows `subsequences(sequence, lmin, lmax)` produces.

  Lengths longer than `sequence` contribute zero (not a negative count), matching
  `subsequences`, whose inner range is simply empty for such lengths.
  """
  total = sum(max(0, len(sequence) - l + 1) for l in range(lmin, lmax+1))
  print(total)
  return total

def subsequences(sequence, lmin, lmax):
  """Enumerate every contiguous slice of `sequence` whose length is in [lmin, lmax].

  Returns a DataFrame with columns seq, start, stop, len, where seq == sequence[start:stop].
  Rows are ordered by length first, then by start position.
  """
  n = len(sequence)
  rows = [
    [sequence[first:first+width], first, first+width, width]
    for width in range(lmin, lmax+1)
    for first in range(n-width+1)
  ]
  return pd.DataFrame(rows, columns=['seq','start','stop','len'])

# Build the table of every candidate primer (forward and reverse) over the full construct.
# convention: start index of r-primers will be 3' (i.e. start < stop)
primer_f = pd.DataFrame(columns=['seq','start','stop','fr','len'])
primer_f[['seq','start','stop','len']] = subsequences(sequence_nt, primer_lmin, primer_lmax)
primer_f['fr'] = 'f'

#Shifting so that 0 is at the start of mutreg (upstream has negative values)
primer_f['start'] = primer_f.start - mutreg_start
primer_f['stop'] = primer_f.stop - mutreg_start

#Creating reverse primers at same locations (same start/stop, reverse-complemented sequence)
primer_r = primer_f[['seq','start','stop','fr','len']].copy()
primer_r['fr'] = 'r'
primer_r['seq'] = primer_r.seq.apply(revcomp)

#Concatenating Forward & Reverse
primer_df = pd.concat([primer_f,primer_r])
primer_df.sort_values(by=['start','stop','fr'], inplace=True)

#Calculating "Cost" Values (inputs to primer_cost)
primer_df['gc'] = primer_df.seq.apply(GC)
primer_df['tm'] = primer_df.seq.apply(pcr.calcTm)
# Hairpin and homodimer results are converted to dicts so 'tm' and 'dg' can be pulled out below.
res = primer_df.seq.parallel_apply(lambda s: pcr.calcHairpin(s).todict())
primer_df['hp_tm'] = res.apply(lambda res: res['tm'])
# dg is scaled by 1e-3 — presumably cal/mol -> kcal/mol; TODO confirm against the primer3-py docs
primer_df['hp_dg'] = res.apply(lambda res: res['dg']*1e-3)
res = primer_df.seq.parallel_apply(lambda s: pcr.calcHomodimer(s).todict())
primer_df['ho_tm'] = res.apply(lambda res: res['tm'])
primer_df['ho_dg'] = res.apply(lambda res: res['dg']*1e-3)

# Off-target Tm calculation is disabled; the lines below are kept for reference only.
# pl.initialize(progress_bar=True)
# tstart2 = time.time()
# primer_df['ot_tm'] = primer_df.parallel_apply(lambda p: calcOffTarget(p, sequence_nt, -mutreg_start), axis=1)
# print(time.time()-tstart2)

primer_df.columns.to_list()

  return lib.map_infer(values, mapper, convert=convert)
  res = primer_df.seq.parallel_apply(lambda s: pcr.calcHairpin(s).todict())
  res = primer_df.seq.parallel_apply(lambda s: pcr.calcHomodimer(s).todict())


['seq',
 'start',
 'stop',
 'fr',
 'len',
 'gc',
 'tm',
 'hp_tm',
 'hp_dg',
 'ho_tm',
 'ho_dg']

In [None]:
# PARAM: primer cost function
def primer_cost(primer):
  """Return the non-negative cost of one primer row (lower is better).

  `primer` must expose `tm`, `hp_dg`, `ho_dg` and `len`. Penalties:
    * Tm below 58 (shortfall ** 1.5),
    * hairpin dG below -5 (shortfall ** 1.2),
    * homodimer dG below -5 (shortfall ** 1.2),
    * a tiny length term (len * 1e-5) that breaks ties in favour of shorter primers.
  GC content currently contributes nothing.
  """
  tm_floor = 58
  hairpin_dg_floor = -5
  homodimer_dg_floor = -5

  tm_shortfall = max(0, tm_floor - primer.tm)
  hairpin_excess = max(0, hairpin_dg_floor - primer.hp_dg)
  homodimer_excess = max(0, homodimer_dg_floor - primer.ho_dg)

  # Summed in the order tm, gc (always 0), hairpin, homodimer, length.
  return (
    tm_shortfall**1.5
    + 0
    + hairpin_excess**1.2
    + homodimer_excess**1.2
    + primer.len*1e-5
  )


# Score every candidate primer and keep a log10 copy of the score.
primer_df['cost'] = primer_df.parallel_apply(primer_cost, axis=1)
primer_df['log10cost'] = np.log10(primer_df['cost'])

# Flatten the index, split the rows by strand into standalone tables, then index the
# combined table by (start, stop, fr) for the per-primer cost lookup in Primer.
primer_df = primer_df.reset_index()
primer_f = primer_df[primer_df['fr'] == 'f'].reset_index(drop=True)
primer_r = primer_df[primer_df['fr'] == 'r'].reset_index(drop=True)
primer_df = primer_df.set_index(['start','stop','fr'])
primer_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,index,seq,len,gc,tm,hp_tm,hp_dg,ho_tm,ho_dg,cost,log10cost
start,stop,fr,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
-150,-132,f,0,ATTTGAATGTATTTAGAA,18,16.7,31.6,0.0,0.0,-53.9,-1.7,135.6,2.1e+00
-150,-132,r,0,TTCTAAATACATTCAAAT,18,16.7,31.6,0.0,0.0,-85.4,-0.7,135.6,2.1e+00
-150,-131,f,1963,ATTTGAATGTATTTAGAAA,19,15.8,33.0,0.0,0.0,-73.8,-1.5,125.3,2.1e+00
-150,-131,r,1963,TTTCTAAATACATTCAAAT,19,15.8,33.0,0.0,0.0,-94.2,-0.7,125.3,2.1e+00
-150,-130,f,3925,ATTTGAATGTATTTAGAAAA,20,15.0,34.2,0.0,0.0,-70.5,-1.7,116.3,2.1e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1811,1829,r,1961,GTGGGCCAGGGCACGGGC,18,83.3,67.2,51.2,-2.5,22.3,-9.2,5.7,7.5e-01
1811,1830,f,3924,GCCCGTGCCCTGGCCCACC,19,84.2,69.5,38.3,-1.0,7.0,-6.2,1.2,8.2e-02
1811,1830,r,3924,GGTGGGCCAGGGCACGGGC,19,84.2,69.5,51.2,-2.5,22.3,-9.2,5.7,7.5e-01
1812,1830,f,1962,CCCGTGCCCTGGCCCACC,18,83.3,66.8,38.9,-0.9,7.0,-6.2,1.2,8.2e-02


# Algorithm


## Create Graph

In [None]:
# Inclusive length bounds (nt) used by `actions` when chaining primers.
# overlap: from the start of the next forward primer to the stop of the previous reverse primer.
overlap_lmin = 45
overlap_lmax = 50
# oligo: from the start of a forward primer to the stop of its paired reverse primer.
oligo_lmin = 195
oligo_lmax = 205
# primer: length of each individual primer.
primer_lmin = 18
primer_lmax = 30

In [None]:
class Primer:
  """A candidate primer identified by its footprint [start, stop) and its strand.

  Attributes:
    start, stop: footprint coordinates (start < stop is asserted).
    is_r: True for a reverse primer, False for a forward primer.
    l: footprint length (stop - start).
    w: cost, read from the 'cost' column of the module-level `primer_df`,
       which is indexed by (start, stop, 'f' | 'r').
  """
  def __init__(self,start,stop,is_r=False):
    assert start < stop
    self.start, self.stop = start, stop
    self.is_r = is_r
    self.l = stop-start
    self.w = primer_df.at[self.tup(),'cost']

  def _strand(self):
    # Single-letter strand code used in graph node keys and in the primer_df index.
    return "r" if self.is_r else "f"

  def __str__(self):
    return f'{self.start} {self.stop} {self.is_r}'
  def __repr__(self):
    return f'{self._strand()}({self.start},{self.stop})'
  def tup(self):
    """Return the (start, stop, strand) key used for graph nodes and primer_df lookups."""
    return (self.start, self.stop, self._strand())


def actions(primer):
  """Yield the primers that may directly follow `primer` (its graph neighbours).

  * Forward primer -> reverse primers that end `oligo_l` nt after the forward start,
    for every oligo length (longest first) and every primer length (shortest first).
  * Reverse primer -> forward primers that start `overlap_l` nt before the reverse stop,
    for every overlap length (longest first) and every primer length (shortest first).
    A forward candidate is dropped unless it ends at least `primer.start % 3` nt before
    the reverse primer's start; since that remainder is never negative, this also
    rejects any candidate extending past the reverse primer's start.
  """
  primer_lengths = range(primer_lmin, primer_lmax+1)

  if not primer.is_r:  # fwd -> rev
    for oligo_l in range(oligo_lmax, oligo_lmin-1, -1):
      stop = primer.start + oligo_l
      for primer_l in primer_lengths:
        yield Primer(stop - primer_l, stop, is_r=True)

  else:  # rev -> fwd
    for overlap_l in range(overlap_lmax, overlap_lmin-1, -1):
      start = primer.stop - overlap_l
      for primer_l in primer_lengths:
        stop = start + primer_l
        gap = primer.start - stop
        if gap < primer.start%3:
          continue
        yield Primer(start, stop)

In [None]:
def dfs(primer):
  """Grow the module-level graph `G` depth-first from `primer`.

  A reverse primer starting at or beyond the end of the mutagenesis region is terminal:
  it is linked to the sink node 'd' with weight 0. Otherwise an edge weighted by the
  successor's cost is added to every successor from `actions`, and the search recurses
  only into successors that were not already nodes of `G`.
  """
  here = primer.tup()
  if primer.is_r and (primer.start >= mutreg_l):  # base case (end)
    G.add_edge(here,'d', weight=0.)
    return

  for next_primer in actions(primer):
    there = next_primer.tup()
    seen_before = G.has_node(there)  # must be checked before add_edge creates the node
    G.add_edge(here, there, weight=next_primer.w)
    if not seen_before:
      dfs(next_primer)

def paths_ct(G, u, d):
    """Return the number of distinct directed paths from node `u` to node `d` in `G`.

    Counts are memoised in the 'npaths' attribute of each visited node. The memo is
    detected by the presence of that key, so nodes may carry other attributes.
    Note that cached counts are specific to the `d` they were computed for.
    """
    if u == d:
        return 1
    attrs = G.nodes[u]
    if 'npaths' not in attrs:
        attrs['npaths'] = sum(paths_ct(G, c, d) for c in G.successors(u))
    return attrs['npaths']

In [None]:
#Takes ~2 minutes
G = nx.DiGraph()

# primers_init = [Primer(p[0],p[1]) for p in primer_df.query('fr=="f" and stop<=0').cost.groupby(level=0).idxmin()]  ## best primers for given start
# Seed the graph with every forward primer that ends at or before the start of the mutagenesis region.
upstream_fwd = primer_f.query('stop<=0')[['start','stop']]
primers_init = [Primer(row.start, row.stop) for _, row in upstream_fwd.iterrows()]
for primer in primers_init:
  G.add_edge('s', primer.tup(), weight=primer.w)  # source -> first primer, weighted by that primer's cost
  dfs(primer)  # expand everything reachable from this primer

print(f'nodes: {G.number_of_nodes()}')
print(f'edges: {G.number_of_edges()}')
print(f"paths: {paths_ct(G,'s','d'):.2e}")

nodes: 45293
edges: 3911721
paths: 4.37e+45


## Creating Gurobipy Solver

In [None]:
import os

# Gurobi Web License Service credentials are read from environment variables so that no
# secret is stored in the notebook. Set GUROBI_WLSACCESSID, GUROBI_WLSSECRET and
# GUROBI_LICENSEID before running this cell.
# SECURITY: credentials that were previously committed in this cell should be treated as
# leaked and revoked/reissued.
_required_env = ("GUROBI_WLSACCESSID", "GUROBI_WLSSECRET", "GUROBI_LICENSEID")
_missing_env = [name for name in _required_env if not os.environ.get(name)]
if _missing_env:
  raise RuntimeError("Missing Gurobi license environment variable(s): " + ", ".join(_missing_env))

params = {
"WLSACCESSID": os.environ["GUROBI_WLSACCESSID"],
"WLSSECRET": os.environ["GUROBI_WLSSECRET"],
"LICENSEID": int(os.environ["GUROBI_LICENSEID"]),  # the license id is an integer parameter
}
env = gp.Env(params=params)

# Create the model within the Gurobi environment
model = gp.Model('min-sum', env=env)

Set parameter WLSAccessID
Set parameter WLSSecret
Set parameter LicenseID to value 2423852
Academic license - for non-commercial use only - registered to mathwiz108@gmail.com


## Parameters & Variables

In [None]:
# Number of source-to-sink paths requested from the solver (flow out of 's' / into 'd').
num_proteins = 10
# Bins (windows) are built for lengths strictly greater than this value.
allowed_overlap = 6

#+1 is to make it inclusive
nt_first = -len(upstream_nt)
nt_last = len(mutreg_nt) + len(downstream_nt)
nt_range = (nt_first, nt_last+1) #range of nucleotides
l_range = (allowed_overlap+1, primer_lmax+1) #range of bin lengths

In [None]:
#file_name = f"drive/MyDrive/graph_mini{i+1}.bin"
# Edge view with attribute dicts, and every primer node (the 's' / 'd' terminals are left out).
graph_edges = G.edges(data=True)
graph_nodes = [node for node in G.nodes if node not in ('s', 'd')]

## Graph --> Lists

In [None]:
def create_bins():
  """Group primer nodes by the sequence windows ("bins") they fully contain.

  A bin is keyed by (start, stop, strand). One is created for every start in `nt_range`,
  every length in `l_range` and both strands; each node of `graph_nodes` is then added
  to every same-strand bin lying entirely inside the node's footprint.

  Returns a dict {bin_key: [nodes]} holding only the non-empty bins.
  """
  bins = {}
  for first in range(*nt_range):
    for width in range(*l_range):
      for strand in ('f', 'r'):
        bins[(first, first+width, strand)] = []

  for node in graph_nodes:
    node_start, node_stop, strand = node[0], node[1], node[2]
    for first in range(node_start, node_stop):
      for width in range(*l_range):
        last = first + width
        if last > node_stop:  # longer windows from this start would overrun the primer too
          break
        bins[(first, last, strand)].append(node)

  return {key: members for key, members in bins.items() if members}

In [None]:
all_bins = create_bins()
# Each bin becomes one constraint; each node in a bin contributes to that constraint.
n_bins = len(all_bins)
n_memberships = sum(len(val) for _, val in all_bins.items())
print("Number of Constraints:", n_bins)
print("Average Vars Per Constraint", 1/n_bins * n_memberships)

Number of Constraints: 87746
Average Vars Per Constraint 101.60122398741822


In [None]:
#Converting Graphs to Lists
# ij holds one (tail, head) key per edge, with node keys stringified; w_ij maps each key to the edge weight.
ij = gp.tuplelist()
w_ij = gp.tupledict()

for tail, head, attrs in graph_edges:
  key = (str(tail), str(head)) #i, j
  ij.append(key)
  w_ij[key] = attrs['weight']

print("Finished Conversion")

Finished Conversion


## ILP Formulation

In [None]:
#Creating Variables
# One binary decision variable per edge key in `ij`; the edge weight from `w_ij` is its objective coefficient.
x = model.addVars(ij, obj=w_ij, vtype=gp.GRB.BINARY)
print("Finished Variable Creations")

Finished Variable Creations


In [None]:
#Intersection Constraints
# For every bin, at most one edge may leave the set of primers that cover the bin.
# Progress is printed roughly every 4%; the step is clamped to 1 so that fewer than
# 25 bins no longer triggers a modulo-by-zero.
progress_step = max(1, len(all_bins)//25)
for cnt, nodes in enumerate(all_bins.values()):
  if cnt % progress_step == 0:
    print(int(cnt / len(all_bins) * 100))
  model.addConstr(gp.quicksum(x.sum(str(node), '*') for node in nodes) <= 1)
print("Finished Intersection constraints!")

0
3


KeyboardInterrupt: ignored

In [None]:
#Single Path Constraints
# Flow conservation: (edges out) - (edges in) is +num_proteins at the source 's',
# -num_proteins at the sink 'd', and 0 at every primer node.
# tupledict.sum builds each expression in one call instead of Python's sum() adding
# variables one at a time.
for n in graph_nodes + ['s', 'd']: #adding s & d back just here
  v = str(n)
  if v == 's':
    net_outflow = num_proteins
  elif v == 'd':
    net_outflow = -1 * num_proteins
  else:
    net_outflow = 0
  model.addConstr(x.sum(v, '*') - x.sum('*', v) == net_outflow, v)  # constraint is named after its node

In [None]:
# Update the model before reading its size, then report the counts.
# NOTE(review): presumably required because Gurobi applies model changes lazily — confirm in the Gurobi docs.
model.update()
print(f"Total Constraints: {model.numConstrs}")
print(f"Total Variables: {model.numVars}")

Total Constraints: 133039
Total Variables: 3911721


In [None]:
# Solve the ILP; Gurobi writes its own progress log to the cell output.
model.optimize()
print("finished optimization")

Gurobi Optimizer version 10.0.2 build v10.0.2rc0 (linux64)

CPU model: Intel(R) Xeon(R) CPU @ 2.20GHz, instruction set [SSE2|AVX|AVX2]
Thread count: 2 physical cores, 4 logical processors, using up to 4 threads

Academic license - for non-commercial use only - registered to marcusbl@mit.edu
Optimize a model with 133039 rows, 3911721 columns and 745496732 nonzeros
Model fingerprint: 0xe8df0133
Variable types: 0 continuous, 3911721 integer (3911721 binary)
Coefficient statistics:
  Matrix range     [1e+00, 1e+00]
  Objective range  [2e-04, 2e+02]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 1e+01]
Presolve removed 0 rows and 0 columns (presolve time = 45s) ...
Presolve removed 0 rows and 0 columns (presolve time = 51s) ...
Presolve removed 0 rows and 0 columns (presolve time = 56s) ...
Presolve removed 0 rows and 0 columns (presolve time = 118s) ...
Presolve removed 689 rows and 0 columns (presolve time = 122s) ...
Presolve removed 689 rows and 0 columns (presolve time = 

In [None]:
def post_processing(variables, n_paths=None):
  """Reassemble the selected edges into source-to-sink node paths.

  Args:
    variables: mapping {(tail, head): var}, where `var.X` is the solved value of the
      edge's binary variable (e.g. the tupledict returned by `model.addVars`).
    n_paths: number of paths to assemble; defaults to the module-level `num_proteins`.

  Returns:
    A list of `n_paths` lists, each starting at 's' and extended edge by edge.

  Raises:
    ValueError: if some selected edge can never be attached to any path (previously
      this situation looped forever).
  """
  from collections import deque

  if n_paths is None:
    n_paths = num_proteins
  all_proteins = [['s'] for _ in range(n_paths)]
  # Binary values come back as floats, so threshold them rather than comparing with 0.
  pending = deque(index for index, var in variables.items() if var.X > 0.5)

  stalled = 0  # consecutive edges re-queued since the last successful attachment
  while pending:
    edge = pending.popleft()
    for protein_list in all_proteins:
      if edge[0] == protein_list[-1]:
        protein_list.append(edge[1])
        stalled = 0
        break
    else:
      pending.append(edge)
      stalled += 1
      if stalled >= len(pending):
        # Every remaining edge has been tried since the last attachment: no progress is possible.
        raise ValueError(f"{len(pending)} selected edge(s) cannot be attached to any path")

  return all_proteins

# Rebuild the selected paths and print each one with its node count (terminals included).
actual_values = post_processing(x)
print(len(actual_values))
for number, vals in enumerate(actual_values, start=1):
  print(f"Protein #{number} ({len(vals)})")
  print(vals)
  print()

10
Protein #1 (26)
['s', "(-150, -131, 'f')", "(27, 45, 'r')", "(-1, 17, 'f')", "(175, 197, 'r')", "(147, 170, 'f')", "(333, 351, 'r')", "(301, 328, 'f')", "(487, 506, 'r')", "(460, 479, 'f')", "(641, 665, 'r')", "(617, 636, 'f')", "(795, 813, 'r')", "(766, 789, 'f')", "(945, 968, 'r')", "(918, 944, 'f')", "(1100, 1123, 'r')", "(1078, 1097, 'f')", "(1257, 1276, 'r')", "(1230, 1250, 'f')", "(1413, 1435, 'r')", "(1390, 1409, 'f')", "(1570, 1590, 'r')", "(1545, 1569, 'f')", "(1722, 1743, 'r')", 'd']

Protein #2 (26)
['s', "(-136, -115, 'f')", "(40, 59, 'r')", "(13, 31, 'f')", "(192, 212, 'r')", "(165, 183, 'f')", "(346, 370, 'r')", "(323, 343, 'f')", "(501, 519, 'r')", "(474, 492, 'f')", "(660, 679, 'r')", "(631, 650, 'f')", "(808, 835, 'r')", "(785, 805, 'f')", "(963, 986, 'r')", "(939, 959, 'f')", "(1118, 1138, 'r')", "(1092, 1111, 'f')", "(1271, 1290, 'r')", "(1245, 1264, 'f')", "(1431, 1449, 'r')", "(1404, 1424, 'f')", "(1589, 1609, 'r')", "(1564, 1582, 'f')", "(1742, 1760, 'r')", 'd'