# Sliding_Window_Analysis 

## Description:

#### Parsing Function (parse_row):
The parse_row function takes a row from the TSV file as input and extracts the query and target virus protein identifiers, the start and end positions of the query and target ranges, and the query and target lengths.

#### Analysis Function (analyze_tsv):
The analyze_tsv function processes the entire TSV file, using the parse_row function for each row.
It organizes the data into a dictionary (virus_protein_clusters) keyed by query virus protein, where each entry stores, per target virus protein, the associated query and target ranges and lengths.
For each target virus protein, the script sorts and merges overlapping or adjacent ranges.

#### Input file:
"filename"_foldseek_reciprocal_hits_noself.tsv 

In [1]:
import csv

# Range from-to
class Range:
    """A from-to span described by a ``start`` and an ``end`` position.

    Both bounds are treated as inclusive by the comparison helpers below.
    """

    def __init__(self, start, end):
        self.start, self.end = start, end

    def does_start_in(self, other_range):
        """Return True when this range's start lies inside ``other_range`` (bounds included)."""
        return other_range.start <= self.start <= other_range.end

    def starts_seamless_after(self, other_range):
        """Return True when this range begins exactly one position after ``other_range`` ends."""
        return other_range.end + 1 == self.start

    def set_and_expand_if_necessary(self, new_start, new_end):
        """Widen this range in place so that it also covers ``new_start``..``new_end``.

        The range only ever grows: a bound is replaced solely when the new
        value lies outside the current span.
        """
        if new_start < self.start:
            self.start = new_start
        if new_end > self.end:
            self.end = new_end

    def __str__(self):
        template = "{} - {}"
        return template.format(self.start, self.end)

# Unique assignment of Q_Virus_Protein to a T_Virus_Protein with the associated Range
class Q_T_Range_Cluster:
    """One query-protein / target-protein pairing together with its two ranges.

    The range objects are expected to expose ``start`` and ``end`` as well as
    the ``does_start_in``, ``starts_seamless_after`` and
    ``set_and_expand_if_necessary`` methods used below.
    """

    def __init__(self, q_name, t_name, q_range, t_range, q_length, t_length):
        self.q_name, self.t_name = q_name, t_name
        self.q_range, self.t_range = q_range, t_range
        self.q_length, self.t_length = q_length, t_length

    def is_in_cluster(self, q_range):
        """Return True when ``q_range`` continues this cluster's query range.

        That is the case when it starts directly after the current query
        range ends, or when it starts somewhere inside it.
        """
        current = self.q_range
        if q_range.starts_seamless_after(current):
            return True
        return q_range.does_start_in(current)

    def belongs_to_target(self, target_name):
        """Return True when this cluster is assigned to ``target_name``."""
        return target_name == self.t_name

    def set_new_ranges(self, new_q_range, new_t_range):
        """Grow the stored query and target ranges so they also cover the given ones."""
        for own, incoming in ((self.q_range, new_q_range), (self.t_range, new_t_range)):
            own.set_and_expand_if_necessary(incoming.start, incoming.end)

    # If you want to change the format of how the result is written, this is the place
    def __str__(self):
        fields = (
            self.q_name, self.q_length, str(self.q_range),
            self.t_name, self.t_length, str(self.t_range),
        )
        return "{} - q-length: {} - q_range: {} - {} - t-length: {} - t_range: {}".format(*fields)
        
# Has a single Q VirusProtein and assigns T VirusProteins in different ranges
# Q_T Range Clusters from the perspective of Q_virus_proteins
# Also, all clusters belonging to a Q_Virus_Protein
class Q_T_Range_Clusters:
    """All hits of a single query virus protein, grouped by target virus protein.

    Hits are collected with ``add_target_in_range`` and later condensed into
    merged ``Q_T_Range_Cluster`` objects by ``get_targets_in_ranges``.
    """

    def __init__(self, q_name):
        self.q_name = q_name
        # target name -> list of dicts with the keys
        # 'q_range', 't_range', 'q_length' and 't_length' (one dict per hit)
        self.targets_with_ranges = {}

    def add_target_in_range(self, target_name, q_range: "Range", t_range: "Range", q_length, t_length):
        """Record one hit of this query protein against ``target_name``."""
        self.targets_with_ranges.setdefault(target_name, []).append({
            'q_range': q_range,
            't_range': t_range,
            'q_length': q_length,
            't_length': t_length
        })

    def get_targets_in_ranges(self):
        """Merge the recorded hits into clusters and return them as a list.

        For every target, the hits are ordered by the start (then end) of
        their query range. A hit whose query range starts inside, or directly
        after, the query range of an existing cluster of the same target
        widens that cluster (query and target range alike); otherwise it opens
        a new cluster. Ordering the hits first makes the result independent of
        the order in which the hits were added.

        The recorded hits are left untouched: each cluster works on its own
        copies of the ranges, so calling this method does not alter
        ``targets_with_ranges`` or the output of ``str(self)``.

        Returns:
            list of Q_T_Range_Cluster, grouped by target in insertion order.
        """
        result_clusters = []
        for target_name, hits in self.targets_with_ranges.items():
            # Clusters are only ever compared within one target, so keep a
            # per-target list instead of scanning the clusters of all targets.
            target_clusters = []
            ordered_hits = sorted(
                hits, key=lambda hit: (hit['q_range'].start, hit['q_range'].end)
            )
            for hit in ordered_hits:
                q_range = hit['q_range']
                t_range = hit['t_range']
                for cluster in target_clusters:
                    if cluster.is_in_cluster(q_range):
                        # Only the first matching cluster is widened.
                        cluster.set_new_ranges(q_range, t_range)
                        break
                else:
                    # No cluster continues with this hit: open a new one.
                    # Copy the ranges because the cluster widens them in place.
                    target_clusters.append(Q_T_Range_Cluster(
                        self.q_name,
                        target_name,
                        Range(q_range.start, q_range.end),
                        Range(t_range.start, t_range.end),
                        hit['q_length'],
                        hit['t_length'],
                    ))
            result_clusters.extend(target_clusters)
        return result_clusters

    def __str__(self):
        """Return one line per recorded (unmerged) hit, each preceded by a newline."""
        lines = []
        for target_name, hits in self.targets_with_ranges.items():
            for hit in hits:
                lines.append("{} - q-length: {} - q_range: {} - {} - t-length: {} - t_range: {}".format(
                    self.q_name, hit['q_length'], str(hit['q_range']),
                    target_name, hit['t_length'], str(hit['t_range'])
                ))
        return "".join('\n' + line for line in lines)

def _split_window_id(identifier):
    """Split one underscore-separated identifier into (name, start, end, length).

    The last four tokens form a fixed suffix: length, one token that is not
    used here, range start and range end. Everything in front of that suffix
    is the virus protein name, however many underscores it contains.
    """
    tokens = identifier.split('_')
    if len(tokens) < 5:
        raise ValueError(
            "Malformed identifier {!r}: expected '<name>_<length>_<x>_<start>_<end>'".format(identifier)
        )
    name = '_'.join(tokens[:-4])
    return name, int(tokens[-2]), int(tokens[-1]), int(tokens[-4])


def parse_row(row):
    """Extract names, ranges and lengths from one row of the TSV file.

    Args:
        row: sequence whose first element is the query identifier and whose
            second element is the target identifier. Further columns are
            ignored.

    Returns:
        Tuple ``(q_virus_protein, t_virus_protein, q_range_start, q_range_end,
        t_range_start, t_range_end, q_length, t_length)``; the names are
        strings, all other values are ints.

    Raises:
        IndexError: if the row has fewer than two columns.
        ValueError: if an identifier has too few tokens or a numeric token
            cannot be converted to int.
    """
    q_virus_protein, q_range_start, q_range_end, q_length = _split_window_id(row[0])
    t_virus_protein, t_range_start, t_range_end, t_length = _split_window_id(row[1])

    return q_virus_protein, t_virus_protein, q_range_start, q_range_end, t_range_start, t_range_end, q_length, t_length

def analyze_tsv(file_path):
    """Read a tab-separated hit file and group its rows by query virus protein.

    Every non-empty row is decoded with ``parse_row`` and stored in the
    ``Q_T_Range_Clusters`` object of its query protein.

    Args:
        file_path: path of the TSV file to read.

    Returns:
        dict mapping each query virus protein name to its
        ``Q_T_Range_Clusters`` object, in order of first appearance.
    """
    virus_protein_clusters = {}

    with open(file_path, 'r', newline='') as tsv_file:
        for row in csv.reader(tsv_file, delimiter='\t'):
            # csv.reader yields an empty list for a blank line (for example a
            # trailing empty line); there is nothing to parse in that case.
            if not row:
                continue

            (q_virus_protein, t_virus_protein,
             q_range_start, q_range_end,
             t_range_start, t_range_end,
             q_length, t_length) = parse_row(row)

            clusters = virus_protein_clusters.get(q_virus_protein)
            if clusters is None:
                clusters = Q_T_Range_Clusters(q_virus_protein)
                virus_protein_clusters[q_virus_protein] = clusters

            clusters.add_target_in_range(
                t_virus_protein,
                Range(q_range_start, q_range_end),
                Range(t_range_start, t_range_end),
                q_length,
                t_length,
            )
    return virus_protein_clusters

def write_tsv(output_file, clusters):
    """Write the given clusters to ``output_file`` as a tab-separated table.

    The first line is a header; every following line holds the name, length
    and range bounds of the query, followed by the same four values of the
    target, for one cluster.
    """
    header = [
        'q-Virus_Protein', 'q-length', 'q-range from', 'q-range to',
        't-Virus_Protein', 't-length', 't-range from', 't-range to'
    ]

    with open(output_file, 'w', newline='') as handle:
        out = csv.writer(handle, delimiter='\t')
        out.writerow(header)
        out.writerows(
            [
                entry.q_name, entry.q_length, entry.q_range.start, entry.q_range.end,
                entry.t_name, entry.t_length, entry.t_range.start, entry.t_range.end,
            ]
            for entry in clusters
        )

# Input file path specified by the user
input_file_path = r'<filename>_foldseek_reciprocal_hits_noself.tsv'
virus_protein_clusters = analyze_tsv(input_file_path)

# Combine the merged clusters of all q_virus_proteins into one flat list.
# The loop variable has its own name so that the virus_protein_clusters
# dictionary is still available after the loop.
combined_clusters = []
for q_clusters in virus_protein_clusters.values():
    combined_clusters.extend(q_clusters.get_targets_in_ranges())

# Output file path specified by the user.
# It must differ from the input path, otherwise the input file is overwritten.
output_file_path = r'<filename>_foldseek_reciprocal_hits_noself_merged.tsv'

# Write the combined clusters to a TSV file
write_tsv(output_file_path, combined_clusters)
