In [None]:
# Wisecaver et al. 2022
# Goal: Telomere searching in scaffolded assemblies of P. parvum isolates
# Author: A Pendleton
# Date: 2022-09-07

In [None]:
from Bio.Seq import Seq

______

# Goal
Can we extract the termini of scaffolds to find telomeric repeats? 

**Question**: Do we have end-end chromosomal assemblies for some of these scaffolds? 

______

## Input files

### Define genome fasta file and its index
The index file is generated through the command `samtools faidx <FASTA>` which will generate a `.fai` file that is simply the chromosomes/scaffolds and their lengths.

In [None]:
#Path to the scaffolded genome assembly (FASTA)
genomeFasta = '../../../figshare/scaffolded_assemblies/UTEX2797_scaffolds_v1.fasta'

#The samtools index sits next to the FASTA and is named <FASTA>.fai
genomeFastaIndex = f'{genomeFasta}.fai'

______

# Step 1 - Run TRF
Tandem repeat finder will identify repeat stretches that we can then parse to identify canonical telomeric repeats. 

The TRF program simply needs the FASTA file to run. 

An example submit script for running the program on a cluster managed by the Slurm workload manager is below:

#### Script usage:

    `sbatch run_TRF.sh <FASTA>` 

### Resulting `.dat` outfile from TRF below
This is for the UTEX2797 scaffolded assembly v1.

In [None]:
#Path to the TRF results (.dat) file that is parsed in Step 2
trf_outFile = '../../../figshare/annotation/telomeres/UTEX2797_scaffolds_v1.fasta.2.7.7.80.10.50.500.dat'

In [None]:
#Check that this file exists
#.  This is an IPython shell escape: {trf_outFile} is filled in with the Python
#.  variable before `ls` is run, so it only works inside IPython/Jupyter
! ls {trf_outFile}

______

# Step 2 - Parse TRF output files

### Define possible telomere sequence
Based on literature, the haptophytic telomere sequence is the same as human:
    
    'TTAGGG'
    
However, TRF may report the repeat unit as any rotation of TTAGGG (e.g. TAGGGT, sliding one base pair at a time), or as the reverse complement and its rotations (CCCTAA, CCTAAC, etc.).

What we need to do is build the array of all possible telomeric sequences.

In [None]:
#Store forward strand based sequence of a telomere based on the lit
telomere_forward = 'TTAGGG'

#Get the reverse complement
#.  NOTE: telomere_forward is a plain str, while telomere_reverse is whatever
#.  Seq.reverse_complement() returns (a Biopython Seq object), so the two
#.  variables do not share a type
telomere_reverse = Seq(telomere_forward).reverse_complement()

#Print both orientations as a visual check
print('Forward sequence = ', telomere_forward)
print('Reverse complement of telomeric sequence = ', telomere_reverse)

#### Build array of all possible derivations of the above forward and reverse repeat sequences 

In [None]:
#Build the list of every sequence TRF could report for the telomeric repeat:
#.  each rotation of the forward unit plus the reverse complement of each.
#
#Every entry is stored as a plain str. The TRF parser compares these against
#.  strings read from the .dat file, and a list that mixes str and Seq objects
#.  makes that comparison depend on how the installed Biopython version
#.  implements Seq-vs-str equality.
possible_telomeric_seqs = [str(telomere_forward), str(telomere_reverse)]

#Store variable = length of telomere sequence
seq_length = len(telomere_forward)

#Print out header for out-cells, followed by the canonical pair so that the
#.  printed listing contains every sequence that will be searched for
print('#Possible Telomeric Sequences to Search For:')
print(telomere_forward, telomere_reverse)

#Loop through all remaining rotations of the telomeric repeat
for i in range(1, seq_length):
    #Rotate the unit left by i bases
    seq = telomere_forward[i:] + telomere_forward[:i]

    #Store as a bioseq so that the reverse complement can be computed
    forward = Seq(seq)
    reverse = forward.reverse_complement()

    print(forward, reverse)

    #Append both orientations, as plain strings
    possible_telomeric_seqs.append(str(forward))
    possible_telomeric_seqs.append(str(reverse))
    

## Store the total size of each scaffold in a dict
This way, we can identify the distance from the start/end of each scaffold that the possible telomeric repeat was identified. 

If we assume that the scaffolds were assembled correctly, and extend all the way to a chromosomal tip, then we would not want to consider an identified "telomeric repeat" that was found in the middle of the scaffold sequence. We would only want to look at repeats a given N bp from the end. The N here will be defined by the user.


#### Parse the FASTA index file to store maximum length of each scaffold

In [None]:
#Read the FASTA index (.fai) and store the length of every scaffold.
#.  Input:  genomeFastaIndex (path defined in an earlier cell)
#.  Output: scaffDict, keys == scaffold IDs, values == scaffold length (int)
scaffDict = {}

#Parse the index file; the with-block closes the handle even if a line fails
with open(genomeFastaIndex, 'r') as faiFile:
    for line in faiFile:
        #Parse and strip line
        line = line.rstrip().split('\t')

        #Skip blank or truncated lines instead of failing on a missing column
        if len(line) < 2:
            continue

        #Get scaff ID and max scaff length (first two columns)
        scaffID = line[0]
        scaffLength = int(line[1])

        #Keep the first length seen if an ID is listed more than once
        if scaffID not in scaffDict:
            scaffDict[scaffID] = scaffLength

print('lengths of %i scaffolds stored in scaffDict' % len(scaffDict))

#### Define maximum distance from scaffold end acceptable for "telomere" repeat
Here, we want to see how many scaffolds have telomeres near their termini. For that purpose, we require that possible telomeric repeats lie within 2 kb of either end (start or end) of each scaffold. If you wish to lower or extend this distance, this is the point at which you may do so.

For the purposes of this manuscript, we never saw any repeats this far from a scaffold end; they were all much closer.

In [None]:
#Maximum distance (in bp) from either scaffold end at which a repeat is still
#.  kept as a telomere candidate by the TRF parsing step
maxDistanceFromEnd = 2000

#### Parse the trf results .dat file

This step keeps only repeats that lie within the distance defined above from either end of their scaffold.

In [None]:
#Parse the TRF .dat file and keep telomere-like repeats that sit near a
#.  scaffold end.
#
#Inputs (defined in earlier cells):
#.  trf_outFile             - path to the TRF .dat file
#.  scaffDict               - scaffold ID -> scaffold length
#.  possible_telomeric_seqs - rotations / reverse complements of the repeat
#.  maxDistanceFromEnd      - largest accepted distance (bp) from a scaffold end
#Output:
#.  trfDict - scaffold ID -> list of hits, each hit being
#.    [scaffold, start, end, hit, placement, positionFromEitherEnd,
#.     repeat_block_seq, repeat_length]

#Clear a dictionary to store information in
trfDict = {}

#Compare as plain strings so the match never depends on Seq-vs-str equality
telomeric_strs = [str(candidate) for candidate in possible_telomeric_seqs]
telomeric_set = set(telomeric_strs)

#No scaffold is active until the first 'Sequence:' line has been read. Starting
#.  from None (instead of relying on a fixed number of header lines) means a
#.  stale value from another cell can never be picked up, and header lines are
#.  skipped by the checks below because they are neither 'Sequence:' lines nor
#.  lines that start with a number.
scaffold = None

with open(trf_outFile, 'r') as trfFile:
    for line in trfFile:
        line = line.rstrip().split(' ')

        #Find if new sequence is being detailed in outfile
        if 'Sequence:' in line:
            scaffold = line[1]
            print('\n' '#Parsing scaffold - ', scaffold)
            #Check that we have the scaffold in our length dict, if not, then we skip
            if scaffold not in scaffDict:
                print('ERROR - This scaffold is not in the length dictionary determined by processing the provided FASTA index (.fai) file.')
                print('All TRF results from this scaffold will be ignored...' + '\n')
                continue

            #Create key in dictionary
            trfDict[scaffold] = []
            continue

        #Ignore everything that belongs to no scaffold or to an unknown one
        if scaffold not in scaffDict:
            continue

        #TRF hit lines start with a numeric value (the repeat start coordinate)
        if not line[0].isnumeric():
            continue

        #Get the repeat sequences from the last two columns
        #.  (column meaning assumed from the TRF .dat layout - confirm against
        #.  the TRF documentation for the version used)
        repeat_block_seq = line[-2] #The sequence of the repeated block (consensus unit)
        total_repeat_seq = line[-1] #The full sequence of the repeat region (not a length)

        #FILTER ONE
        #Perfect = the block is exactly one of the telomeric sequences
        #Partial = the block contains one of the telomeric sequences
        if repeat_block_seq in telomeric_set:
            hit = "Perfect"
        elif any(candidate in repeat_block_seq for candidate in telomeric_strs):
            hit = "Partial"
        else:
            continue

        #Only those that are Perfect or Partial hits to the telomeric strings
        #.  are considered going forward
        start, end = int(line[0]), int(line[1]) #start and end of the repeat, as chromosomal coordinates

        #Extract the max scaffold length from the scaffDict
        #  This is used to calculate the distance of the repeat from the end of the scaffold
        scaffold_length = scaffDict[scaffold]

        #FILTER TWO
        #TRF coordinates are 1-based and inclusive, so a repeat that starts on
        #.  the first base is 0 bp from the start (just as one that ends on the
        #.  last base is 0 bp from the end), and its length is end - start + 1
        positionFromStart = start - 1
        positionFromEnd = scaffold_length - end

        #Position from either end is the minimum of the pos from start and pos from end
        positionFromEitherEnd = min(positionFromStart, positionFromEnd)

        #calculate repeat length
        repeat_length = end - start + 1

        #Skip if the hit is not within the stated max distance from either end of scaffold
        if positionFromEitherEnd > maxDistanceFromEnd:
            continue

        #Label the hit with the end it is closest to. This is always defined for
        #.  a hit that passed the filter (including a distance exactly equal to
        #.  maxDistanceFromEnd) and always agrees with the reported distance.
        placement = 'start' if positionFromStart <= positionFromEnd else 'end'

        #Printing out statistics to the screen
        print('(%s match) position from %s = %i bp' % (hit, placement, positionFromEitherEnd))
        print(repeat_block_seq, str(repeat_length) + ' bp')

        #STORE HIT IN THE DICT FOR LATER
        trfDict.setdefault(scaffold, []).append(
            [scaffold, start, end, hit, placement, positionFromEitherEnd, repeat_block_seq, repeat_length]
        )



#### Write hits out

In [None]:
#Write every stored hit in trfDict to a tab-delimited text file, one hit per
#.  line, preceded by a header line.
outfile = '../../../figshare/annotation/telomeres/UTEX2797_v1_TelomereCandidates_ParsedFromTRF.txt'

#Header line (same column order as the hit lists stored in trfDict)
headerLine = ['Scaffold', 'Repeat Start', 'Repeat End', 'Match Type (Partial or Perfect)', 'Position on Scaffold',
             'Distance From Scaffold End (bp)', 'Telomeric Repeat Block Sequence', 'Length of Telomeric Repeat (bp)']

#The with-block closes (and flushes) the file even if a write fails part way
with open(outfile, 'w') as outFile:
    outFile.write('\t'.join(map(str, headerLine)) + '\n')

    #Write out each hit; iterating over the values avoids rebinding the
    #.  `scaffold` variable used by the parsing cell
    for hits in trfDict.values():
        for t in hits:
            outFile.write('\t'.join(map(str, t)) + '\n')

____

## Going forward...

Each of these candidates was assessed manually by looking first at their repeat structure and lengths, then in their genomic contexts within genome browsers. 

_______

# Done!