In [None]:
import pandas as pd
import numpy as np
import time

In [None]:
%%capture text
# %%capture stores this cell's output in the variable `text` instead of displaying it.
# Download the LKH-3.0.7 source archive.
!wget http://webhotel4.ruc.dk/~keld/research/LKH-3/LKH-3.0.7.tgz
# Unpack the archive.
!tar xvfz LKH-3.0.7.tgz;
# Build inside the LKH-3.0.7 directory; get_tsp_solution later runs ./LKH from there.
!cd LKH-3.0.7; make clean; make;


# Initialization
### The best initial string I found was the known 5913 superpermutation (after relabeling).
### Here the mandatory '12ABCDE' permutations appear more uniformly than in the 5906 superpermutation.


In [None]:
def is_permutation(s):
    """Return True when no element of ``s`` occurs more than once."""
    distinct = set(s)
    return len(s) == len(distinct)

def get_permutations(s, length = 7):
    """Return the distinct repetition-free windows of ``s``.

    Every substring of ``length`` consecutive characters is examined from left
    to right; a window is kept when ``is_permutation`` accepts it and it has not
    been seen before.

    Args:
        s: the string to scan.
        length: window size (7 by default).

    Returns:
        List of windows in order of first occurrence, without duplicates.
    """
    answer = []
    seen = set()  # constant-time duplicate check; ``answer`` keeps the order
    for i in range(len(s)-length+1):
        x = s[i:i+length]
        if x not in seen and is_permutation(x):
            seen.add(x)
            answer.append(x)
    return answer

def make_double(old_list,new_char):
    """Extend every string of ``old_list`` by one character.

    For each string ``w`` the word ``w + new_char + w`` is scanned with
    ``get_permutations`` using a window one longer than ``w``. The windows found
    are collected in order, skipping any that were already collected.

    ``old_list`` must be non-empty: the window size is taken from its first entry.
    """
    window = len(old_list[0]) + 1
    collected = []
    for word in old_list:
        doubled = word + new_char + word
        for candidate in get_permutations(doubled, window):
            if candidate in collected:
                continue
            collected.append(candidate)
    return collected

# Characters are added one at a time, in this order, by repeated doubling.
abc = '3456712'
all_permutations = ['']
for char in abc:
    all_permutations = make_double(all_permutations, char)

# The mandatory permutations are the ones that begin with '12'.
mandatory = [perm for perm in all_permutations if perm.startswith('12')]

print(len(all_permutations), len(mandatory))

In [None]:
def distance(s1, s2):
    """Number of characters of ``s2`` that must be appended after ``s1``.

    The longest overlap (at most 7 characters) between the end of ``s1`` and the
    start of ``s2`` is searched; the result is 7 minus that overlap, i.e. a value
    from 0 (the last 7 characters of ``s1`` equal the first 7 of ``s2``) to 7
    (no overlap at all).

    Only the last 7 characters of ``s1`` and the first 7 of ``s2`` are compared.
    Per the original author's note, this also works for sequences containing a
    wildcard as long as the wildcard is not among the first six or last six
    characters of ``s1`` or ``s2``.
    """
    for overlap in range(7, 0, -1):
        # Compare the trailing ``overlap`` characters of s1 with the head of s2.
        if s1[len(s1)-overlap:] == s2[:overlap]:
            return 7 - overlap
    return 7

def total_distance(list_of_permutations):
    """Return 7 plus the sum of ``distance`` over all consecutive pairs.

    This equals the length of the string produced by gluing the entries in order
    with maximal overlaps when the first entry has 7 characters.
    """
    steps = sum(
        distance(left, right)
        for left, right in zip(list_of_permutations, list_of_permutations[1:])
    )
    return 7 + steps

# NOTE: the value of this call is discarded; it is not the last expression of the cell.
total_distance(all_permutations)
def permutations_to_string(list_of_permutations):
    """Glue the entries together in order, merging the overlap between neighbours.

    Starts with the first entry and, for every following entry, appends only the
    last ``distance(previous, entry)`` of its first 7 characters plus anything
    beyond those 7.
    """
    pieces = [list_of_permutations[0]]
    for previous, current in zip(list_of_permutations, list_of_permutations[1:]):
        gap = distance(previous, current)
        pieces.append(current[7-gap:])
    return ''.join(pieces)

# Glue the permutations in their generated order into one string, then report its length
# next to total_distance() and the number of distinct permutation windows it contains.
superpermutation = permutations_to_string(all_permutations)
print(len(superpermutation), total_distance(all_permutations))
print(len(get_permutations(superpermutation)))

In [None]:
#Checking uniform distribution of mandatory permutations
# Build each slice once, as a set, instead of re-slicing and scanning a list
# for every mandatory permutation.
first_part = set(all_permutations[:1680])
second_part = set(all_permutations[1680:3360])
third_part = set(all_permutations[3360:])

M1, M2, M3 = [], [], []
for x in mandatory:
    if x in first_part:
        M1.append(x)
    elif x in second_part:
        M2.append(x)
    elif x in third_part:
        M3.append(x)
print(len(M1), len(M2), len(M3))

In [None]:
# Each group is one slice of all_permutations together with every mandatory
# permutation, de-duplicated and sorted.
mandatory_set = set(mandatory)
group1 = sorted(set(all_permutations[:1680]) | mandatory_set)
group2 = sorted(set(all_permutations[1680:3360]) | mandatory_set)
group3 = sorted(set(all_permutations[3360:]) | mandatory_set)
print(len(group1), len(group2), len(group3))
print('We will use these groups in the next step')

# Finding 2480
### Here we use the LKH package as explained in Chris Deotte's notebook: <a href="https://www.kaggle.com/cdeotte/santa-2021-tsp-baseline-2500">Santa 2021 TSP Baseline - [2500] by CHRIS DEOTTE</a>

### There is a small change in how the final string is computed:
### The solver returns a cyclic tour p(1),p(2),...,p(n).
### If distance(p(n), p(1)) is less than 7, one can usually find another p(t) with distance(p(t),p(t+1)) = 7.
### Then glue p(t+1), p(t+2),..., p(t-1), p(t) to shorten the length.
### An equivalent approach would be to use a dummy node.

In [None]:
# The function used in Chris Deotte's notebook with small changes
def get_tsp_solution(group, seed = 1, time = 100):
    
    # CREATE DISTANCE MATRIX
    SIZE = len(group)
    M = np.zeros((SIZE, SIZE), dtype='int8')
    for j in range(SIZE):
        #if j%25==0: print(j,', ',end='')
        for k in range(SIZE):
            M[j,k] = distance(group[j],group[k])
            
    # WRITE PROBLEM FILE
    f = open(f'group.par','w')
    f.write("PROBLEM_FILE = ../distances.atsp\n")
    f.write("TOUR_FILE = ../output.txt\n")
    f.write(f"OPTIMUM = {SIZE}\n")
    f.write("MOVE_TYPE = 5\n")
    f.write("PATCHING_C = 3\n")
    f.write("PATCHING_A = 2\n")
    
    f.write("SEED = "+str(seed)+"\n")
    f.write("RUNS = 1\n")
    f.write("TIME_LIMIT = "+str(time)+"\n") #seconds
    f.close()
    
    # WRITE PARAMETER FILE
    f = open(f'distances.atsp','w')
    f.write("NAME: distances\n")
    f.write("TYPE: ATSP\n")
    f.write("COMMENT: Asymmetric TSP\n")
    f.write(f"DIMENSION: {SIZE}\n")
    f.write("EDGE_WEIGHT_TYPE: EXPLICIT\n")
    f.write("EDGE_WEIGHT_FORMAT: FULL_MATRIX\n")
    f.write("EDGE_WEIGHT_SECTION\n")
    for j in range(SIZE):
        #if j%25==0: print(j,', ',end='')
        for k in range(SIZE):
            f.write(f"{M[j,k]:2d} ") 
        f.write("\n")
    f.close()
    
    # EXECUTE TSP SOLVER
    !cd LKH-3.0.7; ./LKH ../group.par
    
    # READ RESULTING ORDER
    with open('output.txt') as f:
        lines = f.readlines()
    for i,ln in enumerate(lines):
        if 'TOUR_SECTION' in ln: break
    perms = [int(x[:-1]) for x in lines[i+1:-2] ]
    
    
    best_d = 0
    best_start = 0
    for k in range(len(perms)):
        t1 = (k+1)%(len(perms))
        s1 = group[perms[k]-1]
        s2 = group[perms[t1]-1]
        d  = distance(s1,s2)
        if d > best_d:
            best_d = d
            best_start=t1
    result = group[ perms[best_start]-1 ]
    for k in range(len(perms)-1):
        ind1 = (best_start+k)%(len(perms))
        ind2 = (ind1+1)%(len(perms))
        s1 = group[ perms[ind1]-1 ]
        s2 = group[ perms[ind2]-1 ]
        d = distance(s1,s2)
        assert(d!=0)
        result += s2[7-d:]
    
    return result

In [None]:
def get_best_solution(group, number_of_tries, time, desired_length):
    """Run ``get_tsp_solution`` with seeds 1..number_of_tries and keep the shortest string.

    The starting point is the plain concatenation of ``group``. The search stops
    early as soon as an improving result has length ``<= desired_length``.

    Args:
        group: list of strings to be covered.
        number_of_tries: maximum number of seeds to try.
        time: time limit in seconds passed to each solver run.
        desired_length: length at which the search stops early.

    Returns:
        The shortest string found.
    """
    best_string = ''.join(group)
    for seed in range(1, number_of_tries + 1):
        print(f"Seed number = {seed}")
        candidate = get_tsp_solution(group, seed = seed, time=time)
        print(f"Length of String = {len(candidate)}")
        if len(candidate) >= len(best_string):
            continue
        best_string = candidate
        if len(best_string) <= desired_length:
            break
    return best_string

In [None]:
%%capture step1
## Using the groups of permutations we found in the initialization step
# Everything printed by this cell is captured into `step1`.
# Each call tries up to 15 seeds with a 50-second solver time limit and stops early
# once an improving string of length <= 2480 is found.
begin = time.time()
string1 = get_best_solution(group1, number_of_tries = 15, time = 50, desired_length = 2480)
string2 = get_best_solution(group2, number_of_tries = 15, time = 50, desired_length = 2480)
string3 = get_best_solution(group3, number_of_tries = 15, time = 50, desired_length = 2480)
end = time.time()

In [None]:
# Save the stdout captured from the step-1 cell (variable `step1`) to a text file.
with open('output_step1.txt', 'w') as f:
    f.write(step1.stdout)

In [None]:
# Report the wall-clock time of step 1 and the length of each string found.
elapsed_seconds = int(end-begin)
print(f'Time to find the three strings = {elapsed_seconds} seconds.')
print(len(string1), len(string2), len(string3))

In [None]:
def is_good_solution(s1,s2,s3):
    """Check a wildcard-free triple of strings.

    Returns True when every entry of ``mandatory`` occurs in each of the three
    strings and the three strings together contain 5040 distinct permutations.
    Otherwise prints the reason and returns False.
    """
    covered = [get_permutations(s) for s in (s1, s2, s3)]

    for perm in mandatory:
        if not all(perm in found for found in covered):
            print("Some mandatory permutations are missing")
            return False

    distinct = len(set(covered[0] + covered[1] + covered[2]))
    if distinct != 5040:
        print('Missing '+str(5040-distinct)+' permutations')
        return False

    return True

print('Checking the solution!')

# Validate the three strings of step 1 against the mandatory and full-coverage rules.
value = is_good_solution(string1, string2, string3)

print(f'Solution is correct = {value}')

# Finding 2440 (no wildcards)
### Observation: There are overlaps between the permutations covered by string1, string2 and string3.
### Fix two of the strings and get an improved Travelling Salesman Problem to replace the third.
### Then iterate.
### We get lucky (due to the symmetric initial strings), and this gives an optimal "no wildcard" solution in just three iterations

In [None]:
def make_new_group(s1, s2, s3):
    """Return the permutations a replacement for ``s1`` has to contain.

    These are all mandatory permutations plus every permutation that occurs in
    neither ``s2`` nor ``s3``, listed in the order of ``all_permutations``.

    Args:
        s1: the string to be replaced; not inspected, kept for a uniform signature.
        s2, s3: the two strings that stay fixed.
    """
    # Sets make each membership test constant-time inside the loop over all permutations.
    found_permutations = set(get_permutations(s2))
    found_permutations.update(get_permutations(s3))
    required = set(mandatory)
    return [
        x for x in all_permutations
        if x in required or x not in found_permutations
    ]

def improve_string(s1, s2, s3, number_of_tries, time, desired_length):
    """Search for a replacement of ``s1`` while ``s2`` and ``s3`` stay fixed.

    Builds the reduced group with ``make_new_group`` and hands it to
    ``get_best_solution`` together with the remaining arguments.
    """
    return get_best_solution(
        make_new_group(s1, s2, s3), number_of_tries, time, desired_length
    )

# Compare how many permutations string1 currently covers with the size of the
# reduced problem that would replace it.
n_old = len(get_permutations(string1))
n_new = len(make_new_group(string1, string2, string3))

print(f'Number of permutations in string1 = {n_old}')
print(f'Number of permutations for the new TSP problem to shorten string1 = {n_new}')

In [None]:
%%capture step2
# Everything printed by this cell is captured into `step2`.
# The strings are improved one after another: each later call already receives the
# strings improved before it, so the order of these three lines matters.
begin = time.time()
string1A = improve_string(string1, string2, string3, number_of_tries = 5, time = 50, desired_length = 2440)
string2A = improve_string(string2, string3, string1A, number_of_tries = 5, time = 50, desired_length = 2440)
string3A = improve_string(string3, string1A, string2A, number_of_tries = 5, time = 50, desired_length = 2440)
end = time.time()

In [None]:
# Save the stdout captured from the step-2 cell (variable `step2`) to a text file.
with open('output_step2.txt', 'w') as f:
    f.write(step2.stdout)

In [None]:
# Report timing and lengths of step 2, then validate the improved triple.
elapsed_seconds = int(end-begin)
print(f'Time to find the three strings = {elapsed_seconds} seconds.')
print(len(string1A), len(string2A), len(string3A))
print('Checking the solution!')
value = is_good_solution(string1A, string2A, string3A)
print(f'Solution is correct = {value}')

# Finding patterns

## Let's investigate the 2440 (no wildcard) solution. We can make two observations:

### Substrings starting at a mandatory '12ABCDE' and ending before the next mandatory always have length 7 or 47.

### These 47 long subsequences are very similar and most of them have the form

### 12ABCDE21ABCDE2A1BCDE2AB1CDE2ABC1DE2ABCD1E2ABCD

In [None]:
# Start index of every occurrence of '12' in string1A.
positions_of_mandatory = [
    idx for idx in range(len(string1A)-1) if string1A.startswith('12', idx)
]
print(positions_of_mandatory) 

In [None]:
def finding_gaps(s):
    """Return the differences between consecutive start positions of '12' in ``s``."""
    starts = [idx for idx in range(len(s)-1) if s.startswith('12', idx)]
    return [later - earlier for earlier, later in zip(starts, starts[1:])]
# Show the gap lists of the three improved strings, one per line.
for improved in (string1A, string2A, string3A):
    print(finding_gaps(improved))

In [None]:
print('We have only gaps of length 7 and 47')

# Collect every stretch of string1A that starts at a '12' and is exactly 47
# characters long up to the next '12' (or up to the end of the string).
Long_Strings = []
for i in range(len(positions_of_mandatory)):
    index1 = positions_of_mandatory[i]
    if i < len(positions_of_mandatory)-1:
        index2 = positions_of_mandatory[i+1]
    else: 
        # The last stretch ends where the string ends. (The previous code used
        # len(positions_of_mandatory), a count rather than a position, so a
        # trailing 47-long stretch was never detected.)
        index2 = len(string1A)
    if index2-index1==47:
        Long_Strings.append(string1A[index1:index2])
        
print('Number of Long strings :',len(Long_Strings))

# Normalise each long string by renaming the five characters after the leading
# '12' to A..E, then count how often each normalised pattern occurs.
Patterns = {}
for p in Long_Strings:
    Dict = {p[2]:'A', p[3]:'B', p[4]:'C', p[5]:'D', p[6]:'E'}
    for k in Dict:
        p = p.replace(k, Dict[k])
    Patterns[p] = Patterns.get(p, 0) + 1
print('Frequency of patterns of Long strings', Patterns)

# Sorting (count, pattern) pairs puts the most frequent pattern last.
List = sorted((count, p) for p, count in Patterns.items())
pattern = List[-1][1]
print('The most common pattern is :', pattern) 

In [None]:
print('Do we have similar patterns in the other strings as well?\n')

def get_long_string(s): # s is a length 5 permutation:
    """Instantiate the typical 47-character pattern for ``s``.

    The placeholders A, B, C, D, E of the pattern are replaced by ``s[0]`` ..
    ``s[4]``. All five placeholders are substituted simultaneously, so the
    result is also correct when ``s`` itself contains the letters A-E.

    Raises:
        IndexError: if ``s`` has fewer than 5 characters.
    """
    typical_pattern ='12ABCDE21ABCDE2A1BCDE2AB1CDE2ABC1DE2ABCD1E2ABCD'
    abc = 'ABCDE'
    # A single translate() pass cannot re-replace a character that was just
    # substituted, unlike a chain of str.replace() calls.
    mapping = {ord(abc[i]): s[i] for i in range(5)}
    return typical_pattern.translate(mapping)

print('Long string for length 5 permutation 34567 = ' + get_long_string('34567')+'\n')

# For each mandatory permutation, record its last five characters in the list of
# the first string (checked in the order 1, 2, 3) that contains its typical long string.
P1, P2, P3 = [], [], []
targets = ((string1A, P1), (string2A, P2), (string3A, P3))
for x in mandatory:
    y = x[2:] # a length 5 permutation on 3,4,5,6,7
    z = get_long_string(y)
    for text, bucket in targets:
        if z in text:
            bucket.append(y)
            break

print('Number of long substrings found : ',  len(P1), len(P2), len(P3))

number_of_same_patterns = len(P1)+len(P2)+len(P3)

print(f'\nOut of all the 120 length-47-substrings we have {number_of_same_patterns} of the same pattern')

# Using the wildcards
### The idea is to replace a long string (length 47) in some stringX (length 2440)
### '12ABCDE21ABCDE2A1BCDE2AB1CDE2ABC1DE2ABCD1E2ABCD' with a 48 long sequence
### '12ABCDE21ABCDE2A1BCDE2AB1CDE2ABC1DE2ABCD182ABCED',
### then delete '12ABCED' from stringX.
### For every wildcard this would add 1 character and delete 7, but some permutations might go missing.
### This seems to work better if the long string starting with '12ABCED' is in stringY or in stringZ and we use the same modifications for them.

In [None]:
def transpose(s):
    """Return the first three characters of ``s`` followed by ``s[4]`` and ``s[3]``."""
    head = s[:3]
    fifth, fourth = s[4], s[3]
    return head + fifth + fourth
print('Transpose of ABCDE = ' +transpose('ABCDE')+'\n')
print('We look for some length 5 permutations for ABCDE. We need 2 for each 2440 string.\n' )

# Pxy lists the entries of Px whose transpose lies in Py;
# Pyx lists those transposes at the same index.
P12 = [x for x in P1 if transpose(x) in P2]
P21 = [transpose(x) for x in P12]

P13 = [x for x in P1 if transpose(x) in P3]
P31 = [transpose(x) for x in P13]

P23 = [x for x in P2 if transpose(x) in P3]
P32 = [transpose(x) for x in P23]

print('Length of the groups : ', len(P12), len(P13), len(P23))
print('Given integers a, b, c we will use  wildcards for P12[a], P21[a], P13[b], P31[b], P23[c], P32[c]')
print('We start with the choices a = 0, b = 0, c = 0')

In [None]:
def get_wildcard_long_string(s):
    """Return the 48-character wildcard variant of the long string for ``s``.

    The first 40 characters of ``get_long_string(s)`` are kept and followed by
    '182' and ``transpose(s)``; '8' is the wildcard character.
    """
    prefix = get_long_string(s)[:40]
    return prefix + '182' + transpose(s)
print('Wildcard string for 34567 : ', get_wildcard_long_string('34567'))

def use_wildcard(s, w1, w2): # s is the 2440 long string, w1, w2 are length 5 permutations
    """Insert two wildcards into ``s`` and drop the permutations they make redundant.

    For ``w1`` and then ``w2`` the typical long string is replaced by its
    wildcard variant; afterwards every occurrence of '12' + transpose(w1) and of
    '12' + transpose(w2) is removed. ``s`` itself is not modified.
    """
    wild = s
    for w in (w1, w2):
        wild = wild.replace(get_long_string(w), get_wildcard_long_string(w))
    for w in (w1, w2):
        wild = wild.replace('12'+transpose(w), '')
    return wild


# First attempt: use index 0 of every pairing list.
wild_string1 = use_wildcard(string1A, P12[0], P13[0])
wild_string2 = use_wildcard(string2A, P21[0], P23[0])
wild_string3 = use_wildcard(string3A, P31[0], P32[0])
print('\nThe shortened lengths are : ', len(wild_string1), len(wild_string2), len(wild_string3))

print('\nWe have the right length but some permutations are missing\n')

def get_permutations_wild(s):
    """Return the distinct permutations covered by ``s``, where '8' is a wildcard.

    Every window of 7 characters is examined from left to right. A window
    without '8' counts if ``is_permutation`` accepts it. In a window containing
    '8', each digit of '1234567' is substituted for '8' in turn and every
    substitution that is a permutation counts.

    Returns:
        List of permutations in order of first occurrence, without duplicates.
    """
    answer = []
    seen = set()  # constant-time duplicate check; ``answer`` keeps the order
    for i in range(len(s)-6):
        x = s[i:i+7]
        if '8' in x:
            candidates = [x.replace('8', digit) for digit in '1234567']
        else:
            candidates = [x]
        for y in candidates:
            if y not in seen and is_permutation(y):
                seen.add(y)
                answer.append(y)
    return answer

def is_good_solution_wild(s1,s2,s3):
    """Check a triple of strings that may contain the wildcard '8'.

    Returns True when every entry of ``mandatory`` is covered by each of the
    three strings and together they cover 5040 distinct permutations. Otherwise
    prints the reason and returns False.
    """
    covered = [get_permutations_wild(s) for s in (s1, s2, s3)]

    for perm in mandatory:
        if not all(perm in found for found in covered):
            print("Some mandatory permutations are missing")
            return False

    distinct = len(set(covered[0] + covered[1] + covered[2]))
    if distinct != 5040:
        print('Missing '+str(5040-distinct)+' permutations') 
        return  False

    return True 

is_good_solution_wild(wild_string1, wild_string2, wild_string3);

In [None]:
print('We now find a, b, c with the smallest number of permutations missing')
best_params = (0,0,0)
# Start from infinity so the first combination is always recorded; a fixed
# sentinel would report a wrong result if every combination missed that many or more.
smallest_missing = float('inf')

def _wild_candidates(base, firsts, seconds):
    """Map (i, j) to (length, permutation set) of use_wildcard(base, firsts[i], seconds[j])."""
    table = {}
    for i, w1 in enumerate(firsts):
        for j, w2 in enumerate(seconds):
            candidate = use_wildcard(base, w1, w2)
            table[i, j] = (len(candidate), set(get_permutations_wild(candidate)))
    return table

# Each wildcard string depends on only two of the three indices, so it is built
# and scanned once per index pair instead of once per (a, b, c) triple.
candidates1 = _wild_candidates(string1A, P12, P13)  # keyed by (a, b)
candidates2 = _wild_candidates(string2A, P21, P23)  # keyed by (a, c)
candidates3 = _wild_candidates(string3A, P31, P32)  # keyed by (b, c)

for a in range(len(P12)):
    for b in range(len(P13)):
        len1, g1 = candidates1[a, b]
        for c in range(len(P23)):
            len2, g2 = candidates2[a, c]
            len3, g3 = candidates3[b, c]
            if len1 != 2428 or len2 != 2428 or len3 != 2428:
                print('Problem')
            missing = 5040-len(g1 | g2 | g3)
            if missing < smallest_missing:
                print(missing)
                smallest_missing = missing
                best_params = (a,b,c)  
print('Best parameters are ', best_params)
print('Still missing', smallest_missing, 'permutations')

In [None]:
print('We construct the corresponding strings.', 'These will be modified later to get a solution.', sep='\n')
# Rebuild the three wildcard strings for the best index triple found by the search.
a, b, c = best_params
wild_string1 = use_wildcard(string1A, P12[a], P13[b])
wild_string2 = use_wildcard(string2A, P21[a], P23[c])
wild_string3 = use_wildcard(string3A, P31[b], P32[c])

# Finding 2429
### We use the same ideas as earlier: Fix two of the wild strings, wild_stringX, wild_stringY and modify wild_stringZ to cover the missing permutations.
### Here we fix the neighborhoods of the wildcards (a length 13 substring where the wildcard is in the middle).
### In that way we can use the same TSP program with minimal modifications.
### There are 3 options for Z. (In this example wild_string1 -> wild_string1B gives the shortest solution).
### Note that using a different version of the LKH program (or different seed numbers) may give you different answers for this part.

In [None]:
def make_new_group_wild(s1, s2, s3): # Assuming that there are two wildcards in s1 and they are at least 13 units apart
    """Return the TSP group for replacing ``s1`` while ``s2`` and ``s3`` stay fixed.

    The group starts with two fixed 13-character pieces of ``s1``, each centred
    on one of its first two wildcards. It then contains, in the order of
    ``all_permutations``, every permutation that is not already covered by
    those two pieces and that is either mandatory or covered by neither ``s2``
    nor ``s3``.

    Raises:
        AssertionError: if ``s1`` does not contain two wildcards at least 13
            positions apart with 6 characters of context on both sides.
    """
    # Sets make each membership test constant-time inside the loop below.
    found_permutations = set(get_permutations_wild(s2))
    found_permutations.update(get_permutations_wild(s3))
    t1 = s1.find('8')
    t2 = s1.find('8', t1+1)  # -1 when there is no second wildcard; rejected by the assert
    assert(t1 >=  6 and t2 <= len(s1)-7 and t2-t1 >= 13)
    wild1 = s1[t1-6:t1+7]
    wild2 = s1[t2-6:t2+7]
    new_group = [wild1, wild2]
    extra_permutations = set(get_permutations_wild(wild1))
    extra_permutations.update(get_permutations_wild(wild2))
    required = set(mandatory)
    for x in all_permutations:
        if x in extra_permutations:
            continue
        if x in required or x not in found_permutations:
            new_group.append(x)
    return new_group

def improve_string_wild(s1, s2, s3, number_of_tries, time, desired_length = 2428):
    """Search for a replacement of the wildcard string ``s1`` with ``s2`` and ``s3`` fixed.

    Builds the group with ``make_new_group_wild`` and hands it to
    ``get_best_solution`` together with the remaining arguments.
    """
    return get_best_solution(
        make_new_group_wild(s1, s2, s3), number_of_tries, time, desired_length
    )

In [None]:
%%capture step3
# Everything printed by this cell is captured into `step3`.
# Three independent attempts: each one replaces a different wildcard string while the
# other two original wildcard strings stay fixed (2 seeds, 200-second time limit each).
begin = time.time()
wild_string1B = improve_string_wild(wild_string1, wild_string2, wild_string3, 2, 200, desired_length = 2428)
wild_string2B = improve_string_wild(wild_string2, wild_string3, wild_string1, 2, 200, desired_length = 2428)
wild_string3B = improve_string_wild(wild_string3, wild_string1, wild_string2, 2, 200, desired_length = 2428)
end = time.time()

In [None]:
# Save the stdout captured from the step-3 cell (variable `step3`) to a text file.
with open('output_step3.txt', 'w') as f:
    f.write(step3.stdout)

In [None]:
print(f'Time to find the three candidate strings = {int(end-begin)} seconds.')
print(len(wild_string1B), len(wild_string2B), len(wild_string3B))
print('We use the shortest, together with two of the length 2428 strings')
t1 = len(wild_string1B)
t2 = len(wild_string2B)
t3 = len(wild_string3B)

# Each row: (replacement string, the two wildcard strings that were kept fixed for it).
options = [
    (wild_string1B, wild_string2, wild_string3),
    (wild_string2B, wild_string1, wild_string3),
    (wild_string3B, wild_string1, wild_string2),
]
# min() returns the first minimal entry, so ties go to the earlier row.
shortest = min(range(3), key=lambda idx: (t1, t2, t3)[idx])
wild_string1C, wild_string2C, wild_string3C = options[shortest]

In [None]:
print('Checking the solution!\n')
# Validate the chosen triple with the wildcard-aware checker.
value = is_good_solution_wild(wild_string1C, wild_string2C, wild_string3C)
print(f'Solution is correct = {value}\n')
print('The lengths are : ',len(wild_string1C), len(wild_string2C), len(wild_string3C))

# Finding 2428
### Manually shorten the longest string to have only the usual substrings between mandatory permutations.
### This might delete some permutations, and these can be moved to one of the other groups.
### Note that this last part might be different if you get a longer solution in the previous step (in that case you may need to iterate this approach more than once)

In [None]:
# Look for two occurrences of '12' that start exactly 8 positions apart.
# Gap stays 0 if none is found; if several are found, the last one wins.
Gap = 0
for start in range(len(wild_string1C)-10):
    if wild_string1C.startswith('12', start) and wild_string1C.startswith('12', start+8):
        print('Found a length 8 gap starting at :', start)
        Gap = start
print(Gap)

In [None]:
# The 15 characters at the gap: 7 characters, one extra digit, 7 more characters.
first_mandatory = wild_string1C[Gap:Gap+7]
extra_digit = wild_string1C[Gap+7]
second_mandatory = wild_string1C[Gap+8:Gap+15]
print(wild_string1C[Gap:Gap+15], 'Split into 2 mandatory and a digit :', first_mandatory, extra_digit, second_mandatory) 
print('Delete this digit to get a shorter sequence\n')
# Drop the single character at index Gap+7.
wild_string1D = wild_string1C[:Gap+7]+wild_string1C[Gap+8:]
print(len(wild_string1D), len(wild_string2C), len(wild_string3C))
print('\nWe again have the right length but some permutations are missing\n')
is_good_solution_wild(wild_string1D, wild_string2C, wild_string3C);
print('\nLuckily now there are only two missing permutations\n')
print('Iterating one last time in the next cell')

In [None]:
%%capture step4
# Everything printed by this cell is captured into `step4`.
# Two independent attempts with the shortened wild_string1D fixed: one replaces the
# second string, the other replaces the third (1 seed, 50-second time limit,
# default desired_length of improve_string_wild).
begin = time.time()
wild_string2D = improve_string_wild(wild_string2C, wild_string1D, wild_string3C, 1, 50)
wild_string3D = improve_string_wild(wild_string3C, wild_string1D, wild_string2C, 1, 50)
end = time.time()

In [None]:
# Save the stdout captured from the step-4 cell (variable `step4`) to a text file.
with open('output_step4.txt', 'w') as f:
    f.write(step4.stdout)

In [None]:
print(f'Time to calcutate the new strings  = {int(end-begin)} seconds.')
print('We get two possible solutions')
print('Replacing the second string gives a triple with lengths :', len(wild_string1D), len(wild_string2D), len(wild_string3C))
print('Replacing the third string gives a triple with lengths :', len(wild_string1D), len(wild_string2C), len(wild_string3D))

# The first string is the same in both candidate triples; take the triple whose
# replaced string is shorter (ties go to replacing the second string).
solution1 = wild_string1D
if len(wild_string2D) <= len(wild_string3D):
    solution2, solution3 = wild_string2D, wild_string3C
else:
    solution2, solution3 = wild_string2C, wild_string3D
print('Lengths of the best triple found', len(solution1), len(solution2), len(solution3))
print('Checking the solution!')
value = is_good_solution_wild(solution1, solution2, solution3)
print(f'Solution is correct = {value}')

In [None]:
# Display the lengths of the three final strings as the cell output.
len(solution1), len(solution2), len(solution3)

In [None]:
# Display the result of the wildcard-aware check for the final triple as the cell output.
is_good_solution_wild(solution1, solution2, solution3)

# Write Submission CSV

In [None]:
# CONVERT NUMBERS TO EMOJIS
replace_dict = {
 '1': '🎅',
 '2': '🤶',
 '8': '🌟',
 '3': '🦌',
 '4': '🧝',
 '5': '🎄',
 '6': '🎁',
 '7': '🎀'}

# One translation table replaces every digit in a single pass per string.
emoji_table = str.maketrans(replace_dict)
solution1 = solution1.translate(emoji_table)
solution2 = solution2.translate(emoji_table)
solution3 = solution3.translate(emoji_table)

In [None]:
# WRITE SUBMISSION CSV
# One row per string in a single 'schedule' column; the index is not written.
submission = pd.DataFrame({'schedule': [solution1, solution2, solution3]})
submission.to_csv('submission.csv',index=False)
submission.head()