In [3]:
import pandas as pd

In [1]:
# Input table to parse; the path is relative to the notebook's working directory.
# NOTE(review): the extension suggests HMMER `--domtblout` output — confirm.
datafile = "../TbHmmer.domtbl"

In [2]:
# Column names given, in order, to the whitespace-separated fields of every
# data row read from the input table (23 names).
# NOTE(review): the order appears to follow HMMER's per-domain table layout
# (target, query, full-sequence scores, per-domain scores, coordinates) — confirm.
header = [
    "PfamShortName", "target", "tlen",
    "query", "qacc", "qlen",
    "evalue", "bits", "bias",
    "#", "of",
    "c-Evalue", "i-Evalue", "dom_bitscore", "dom_bias",
    "tstart", "tend",
    "qstart", "qend",
    "env_start", "env_end",
    "acc", "PfamDesc",
]

In [4]:
def process_line(line, num_columns):
    """Split ``line`` on runs of whitespace into at most ``num_columns`` fields.

    Only the first ``num_columns - 1`` whitespace runs act as separators, so
    any further whitespace stays inside the last field.

    Parameters
    ----------
    line : str
        One line of text.
    num_columns : int
        Maximum number of fields to produce.

    Returns
    -------
    list of str
        The fields; fewer than ``num_columns`` when the line is short.
    """
    max_splits = num_columns - 1
    fields = line.split(maxsplit=max_splits)
    return fields

In [5]:
def ReadAsDataFrame(filepath, columns=None):
    """Read a whitespace-delimited table into a DataFrame.

    Blank lines and lines whose first non-blank character is ``#`` are
    skipped. Every remaining line is split on whitespace into at most
    ``len(columns)`` fields, so the last column keeps any embedded spaces.
    Columns whose values all parse as numbers are converted to a numeric
    dtype; all other columns are left unchanged.

    Parameters
    ----------
    filepath : str or path-like
        Path of the text file to read.
    columns : sequence of str, optional
        Column names, in field order. Defaults to the notebook-level
        ``header`` list.

    Returns
    -------
    pandas.DataFrame
        One row per data line. Rows with fewer fields than ``columns`` are
        padded with missing values by the DataFrame constructor.
    """
    if columns is None:
        columns = header
    columns = list(columns)
    max_splits = len(columns) - 1

    rows = []
    with open(filepath, "r") as f:
        for raw_line in f:
            line = raw_line.strip()
            # Strip first so that blank lines and indented comments are
            # dropped instead of turning into all-empty rows.
            if not line or line.startswith("#"):
                continue
            rows.append(line.split(None, max_splits))

    df = pd.DataFrame(rows, columns=columns)

    def _numeric_if_possible(column):
        # Explicit replacement for the deprecated
        # ``pd.to_numeric(..., errors='ignore')``: keep the column as-is when
        # any value fails to parse.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    return df.apply(_numeric_if_possible)

In [6]:
# Parse the input table into a DataFrame with one row per data line.
df = ReadAsDataFrame(datafile)

In [7]:
# Replace "query" with the second "-"-separated piece of its original value.
query_pieces = df["query"].str.split("-", expand=True)
df["query"] = query_pieces[1]

# "predpf": the part of "target" that precedes the first ".".
target_pieces = df["target"].str.split(".", expand=True)
df["predpf"] = target_pieces[0]

# "tcov": inclusive span tstart..tend expressed as a fraction of tlen.
target_span = df["tend"] - df["tstart"] + 1
df["tcov"] = target_span / df["tlen"]

In [8]:
def get_non_overlapping_regions(group):
    """Greedily keep rows whose ``qstart``..``qend`` intervals do not overlap.

    Rows are visited in the order they appear in ``group``. A row is kept
    only if its closed interval ``[qstart, qend]`` does not intersect the
    interval of any row kept before it, so earlier rows always win.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to filter; must provide ``qstart`` and ``qend`` columns.

    Returns
    -------
    pandas.DataFrame
        The kept rows, rebuilt from the row Series in their original order.
    """
    kept = []
    for _, row in group.iterrows():
        # Closed intervals intersect when each one starts no later than the
        # other one ends.
        clashes = any(
            row['qstart'] <= prior['qend'] and row['qend'] >= prior['qstart']
            for prior in kept
        )
        if not clashes:
            kept.append(row)
    return pd.DataFrame(kept)

In [9]:
# Filter the hits of each query independently, then stack the survivors into
# one frame with a fresh 0..n-1 index.
# The groups are iterated explicitly instead of using `groupby(...).apply`:
# applying a function to the grouping column is deprecated since pandas 2.2,
# and once that column is withheld from the function, `reset_index(drop=True)`
# would silently remove "query" from the result.
per_query_hits = [
    get_non_overlapping_regions(group) for _, group in df.groupby("query")
]
if per_query_hits:
    result = pd.concat(per_query_hits, ignore_index=True)
else:
    # No groups at all: keep the columns and avoid `pd.concat([])` raising.
    result = df.iloc[0:0].copy()

In [10]:
# Write the filtered hits as a tab-separated file without the row index.
result.to_csv(
    "../greedy_hits/HmmerGreedyHits.tsv",
    sep="\t",
    index=False,
)