In [80]:
import random

# Settings -- edit these values to change the generated data set.
# Number of bases in each of the two reads that make up a read pair.
READ_LENGTH = 20
# Number of bases skipped between the end of the first read and the start of
# the second read of a pair.
READ_DISTANCE = 10
# Per-base probability that a read base is replaced by a different base.
ERROR_RATE = 0.05
# Inclusive bounds of the random step between the start positions of
# consecutive read pairs.
SHIFT_MIN = 1
SHIFT_MAX = 5
# When true, the list of read pairs is shuffled before it is printed and saved.
SHUFFLE_READS = True
# Length of the randomly generated reference sequence.
REF_LENGTH = 500
# Size of the variant in bases: a random integer from 15 to 30 inclusive,
# drawn once when this line runs.
VARIANT_SIZE = random.randint(15, 30)
# Margin used when drawing the random variant index: the index is taken from
# [VARIANT_PADDING, REF_LENGTH - VARIANT_PADDING - 1]. Tandem duplication raises
# the lower bound by VARIANT_SIZE; translocation uses a fixed index instead.
VARIANT_PADDING = 30
# Kind of variant to simulate:
# 0: Deletion
# 1: Insertion
# 2: Tandem Duplication
# 3: Inversion
# 4: Translocation (any value other than 0-3 is also handled as translocation)
MODE = 0

# Base names of the output files; '.txt' is appended when saving.
# An empty string skips writing the corresponding file.
readsPath = 'read'
anwserPath = 'anwser'
refPath = 'ref'

# Generate reference & variant
def generateReference(length):
    """Return a random DNA sequence of `length` bases.

    Each base is picked independently with `random.choice` from A, C, G, T,
    one call per base, so the result for a given seed is reproducible.
    A `length` of zero (or less) yields an empty string.
    """
    alphabet = ['A', 'C', 'G', 'T']
    # Join once instead of growing the string with += on every iteration.
    return ''.join(random.choice(alphabet) for _ in range(length))

# Build the random reference and draw the default position of the variant.
ref = generateReference(REF_LENGTH)
index = random.randint(VARIANT_PADDING, len(ref) - VARIANT_PADDING - 1)

# Apply the variant selected by MODE to get the mutated sequence.
if MODE == 0:
    # Drop VARIANT_SIZE bases starting at index.
    variantName = 'Deletion'
    mutated = ref[:index] + ref[index + VARIANT_SIZE:]
elif MODE == 1:
    # Splice VARIANT_SIZE fresh random bases in at index.
    variantName = 'Insertion'
    mutated = ref[:index] + generateReference(VARIANT_SIZE) + ref[index:]
elif MODE == 2:
    # Re-draw the index so that VARIANT_SIZE bases exist before it, then
    # repeat those bases at index.
    index = random.randint(VARIANT_PADDING + VARIANT_SIZE, len(ref) - VARIANT_PADDING - 1)
    variantName = 'Tandem Duplication'
    mutated = ref[:index] + ref[index - VARIANT_SIZE:index] + ref[index:]
elif MODE == 3:
    # Reverse the VARIANT_SIZE bases starting at index.
    variantName = 'Inversion'
    mutated = ref[:index] + ref[index:index + VARIANT_SIZE][::-1] + ref[index + VARIANT_SIZE:]
else:
    # Replace the last VARIANT_SIZE bases with fresh random bases.
    index = len(ref) - VARIANT_SIZE
    variantName = 'Translocation'
    mutated = ref[:index] + generateReference(VARIANT_SIZE)

# Column ruler as wide as the longer sequence: every fifth column carries a
# digit cycling 1..9, 0; all other columns are blank.
rulerWidth = max(len(mutated), len(ref))
ruler = ''.join(
    str((column // 5 + 1) % 10) if column % 5 == 4 else ' '
    for column in range(rulerWidth)
)

# Answer text, one item per line: description, size, ruler, reference,
# mutated sequence, and a caret marking the variant index.
anwser = '\n'.join([
    variantName + ' at index ' + str(index),
    'With size ' + str(VARIANT_SIZE),
    ruler,
    ref,
    mutated,
    ' ' * index + '^',
])

# Generate reads
# Read pairs are collected here as [read1, read2] lists; the name is later
# rebound to the joined text that gets printed and saved.
reads = []
# Sequence the reads are sampled from: the fifth line of the answer text,
# which holds the sequence with the variant applied.
physical = anwser.split('\n')[4]

# For each base, the bases a sequencing error may substitute for it.
_SUBSTITUTIONS = {
    'A': ['C', 'G', 'T'],
    'C': ['A', 'G', 'T'],
    'G': ['A', 'C', 'T'],
    'T': ['A', 'C', 'G'],
}


def _readWithErrors(start):
    """Copy READ_LENGTH bases of `physical` from `start`, injecting errors.

    Each base is replaced, with probability ERROR_RATE, by one of the three
    other bases chosen at random.
    """
    bases = []
    for position in range(start, start + READ_LENGTH):
        base = physical[position]
        if random.random() < ERROR_RATE:
            base = random.choice(_SUBSTITUTIONS[base])
        bases.append(base)
    return ''.join(bases)


def generateRead(index):
    """Sample one read pair from `physical` starting at `index`.

    The first read covers READ_LENGTH bases from `index`; the second covers
    READ_LENGTH bases starting READ_DISTANCE bases after the first read ends.
    The pair is appended to the global `reads` list as [read1, read2], and a
    line showing both reads at their positions is appended to the global
    `anwser` text.

    Raises IndexError if the pair would extend past the end of `physical`.
    """
    # `anwser` is rebound below, so it needs the global declaration;
    # `reads` is only mutated in place and does not.
    global anwser

    read1 = _readWithErrors(index)
    read2 = _readWithErrors(index + READ_LENGTH + READ_DISTANCE)

    reads.append([read1, read2])
    # Indent by `index` so the reads line up under the sequence they came from.
    anwser += '\n' + ' ' * index + read1 + ' ' * READ_DISTANCE + read2

# Tile the physical sequence with read pairs, advancing the start position by
# a random step each time, and stop before the last possible start.
pairSpan = 2*READ_LENGTH + READ_DISTANCE
lastStart = len(physical) - pairSpan
index = 0
while index < lastStart - 1:
    generateRead(index)
    index += random.randint(SHIFT_MIN, SHIFT_MAX)

# Always finish with the pair that ends exactly at the end of the sequence.
index = lastStart
generateRead(index)

if SHUFFLE_READS:
    random.shuffle(reads)

# One pair per line, the two reads separated by a comma.
reads = '\n'.join(map(','.join, reads))

print('-----------Reads-----------')
print(reads)
print('\n-----------Anwser-----------')
print(anwser)

# Save files (an empty path skips that file)
for path, payload in ((readsPath, reads), (anwserPath, anwser), (refPath, ref)):
    if path:
        with open(path+'.txt','w') as f:
            f.write(payload)


-----------Reads-----------
TCATCTTGTTCCAAATGGAT,TCACTCTGTACACCGGTCTA
CATATCCAGAAGGGTCACCT,GGATCATCTGATTGCCGCTG
TTGCGCGCCTGCCTTTGATC,TCCAGACGGATCACCTTTGA
GAAAGCGTAAATGTTATGGC,GCCTACTCGGGACAGGCGAC
GCGTCTCGGTTGAGGTTGGG,TTTCATTTTCGTTTGTTCAA
TGAAGGCCGATGGGATCTAT,CCGAGCATGCCGTACGACTG
GCCGTACGACTGGAGCGATA,CAGACATGACGACAGTTTAG
GAGGATTAGGCGTCTCGGTT,ATCCGACTTTTTCACCTTCG
GATGCCCCAGACATGACGAA,CTTAGCCGATTTCACTACCG
GATCCGGAACCATATCCAGA,TTGAATTCTTGGATCATCTG
TGACGCGTGGGTATCGCACG,ATCTTGTTTCAAATGGATAC
ATCGGAGATATCTACGACTT,TGTAAATGATAGGGCCGACG
GTCTTGGCCTACGCGGGACA,TCCGGAGGATTAGGCGTCTC
TCCATCGGAGATATCAACGA,AAGCGTAAATGGTAGGACCG
CGACGTCTAGGCCTACTCGG,CTGATCCGGAGGATTAGGCG
TATTTAAGTGACACTGTACA,GGAGATATCTTCGACTTAAC
AGCATGCCGTACGACTGCAG,TGCCTCAGACATGACGACAG
GTACTATATAAGTGACTCCG,CATCGGAGATATCTTCGAGG
TGAGACGTCAAGCTTGCGCG,ATCCGGAACCATATCCAGAC
TCCGGAGGATTAGGCGTCTC,AGCGATCCGACTTTTTCACT
CGACCGTTCCTATTGAGACG,GCGCCTGCCTTTTATCCGGA
CAAGCGACCGAGTCCGGAGG,TCGGTTCAGGTTGGGGATCC
GTTCCTTAGCCGATTTCACT,TTACCCTGTCTTCTGTAGCG
TCTTCG