In [None]:
# TODO: redo the filtering strategy. Filtering by pairs is wrong, because this way gene pairs may end up isolated.

# e.g.,

# 2955  HF571520.1_1897__HF571520.1_1897_1878235_18794...  HF571520.1_1897  HF571520.1     1897  (1878235, 1879485)    pos
# The above was a hit of 899.1
# 1271  HF571520.1_1898__HF571520.1_1898_1879706_18817...  HF571520.1_1898  HF571520.1     1898  (1879706, 1881700)    neg
# This one was a hit of 171.1 and 170.1


# 899: 1897 (pos)
# 171: 411, 1898
# 170: 410, 1898


# Solutions:

"""
1. Sort hmmer hits by contig name, perhaps use a dictionary. Perhaps add tag to label containing hmm name
2. Search for hits within the same contig that satisfy the structure
"""

In [26]:
import pandas as pd


def mergeHitsByHMMgroup(hits: pd.DataFrame, input_hmm_groups: list[str]) -> pd.DataFrame:
    """
    Relabel hits with the name of the HMM group they match and drop the rest.

    Rows are grouped by the "full" column. For every group of rows, the set of
    values found in the "hmm" column is compared with the set of HMM names of
    each entry of input_hmm_groups (names are separated by "|"). When the two
    sets are equal, all rows of the group get that entry as their "hmm" value;
    otherwise the rows are discarded. Rows that become identical after
    relabelling are collapsed into one.

    Args:
        hits: table with at least the columns "full" and "hmm".
        input_hmm_groups: HMM group strings, e.g. "TIGR00171.1|TIGR02084.1".
            If several entries hold the same set of names, the first one wins.

    Returns:
        A new DataFrame with the relabelled, de-duplicated rows. The input
        frame is left untouched.
    """
    # Lookup from the set of member HMM names to the group string.
    # setdefault keeps the first entry when two entries share the same members.
    group_by_members = {}
    for hmm_group_str in input_hmm_groups:
        group_by_members.setdefault(frozenset(hmm_group_str.split("|")), hmm_group_str)

    # Work on a copy so the caller's table is not overwritten with group
    # names and "discard" labels as a side effect.
    merged = hits.copy()
    for row_labels in merged.groupby("full").groups.values():
        hmm_names = frozenset(merged.loc[row_labels, "hmm"].values)
        merged.loc[row_labels, "hmm"] = group_by_members.get(hmm_names, "discard")

    return merged[merged["hmm"] != "discard"].drop_duplicates()



# Imported in this cell so that it also runs on a fresh kernel; elsewhere in
# this notebook SyntenyParser is only imported by a cell further down.
from pynteny.filter import SyntenyParser

# Table of HMM hits to be merged by HMM group.
hits = pd.read_csv("/home/robaina/Documents/Pynteny/allhits.csv", index_col=False)

# The parenthesised entry lists two alternative HMMs separated by "|".
syn_struct = ">TIGR00899.1 0 <(TIGR00171.1|TIGR02084.1) 0 <TIGR00170.1 1 <TIGR00973.1"
hmm_groups = SyntenyParser.getHMMgroupsInStructure(syn_struct)


# Last expression of the cell: the merged table is displayed as its output.
mergeHitsByHMMgroup(hits, hmm_groups)

Unnamed: 0,full,gene_id,contig,gene_pos,locus_pos,strand,hmm
0,b0070__U00096_70_77620_78799_pos,b0070,U00096,70,"(77620, 78799)",pos,TIGR00899.1
1,b0071__U00096_71_78847_79453_neg,b0071,U00096,71,"(78847, 79453)",neg,TIGR00171.1|TIGR02084.1
3,b0072__U00096_72_79463_80864_neg,b0072,U00096,72,"(79463, 80864)",neg,TIGR00170.1
4,b0074__U00096_74_81957_83529_neg,b0074,U00096,74,"(81957, 83529)",neg,TIGR00973.1
5,b0118__U00096_117_131614_134212_pos,b0118,U00096,117,"(131614, 134212)",pos,TIGR00170.1
6,b0343__U00096_334_361925_363179_neg,b0343,U00096,334,"(361925, 363179)",neg,TIGR00899.1
7,b0352__U00096_343_373867_374881_pos,b0352,U00096,343,"(373867, 374881)",pos,TIGR00973.1
10,b1053__U00096_1019_1114263_1115490_neg,b1053,U00096,1019,"(1114263, 1115490)",neg,TIGR00899.1
14,b1534__U00096_1509_1621331_1622519_pos,b1534,U00096,1509,"(1621331, 1622519)",pos,TIGR00899.1
15,b2170__U00096_2139_2263862_2265044_pos,b2170,U00096,2139,"(2263862, 2265044)",pos,TIGR00899.1


In [21]:
# Show the HMM groups parsed from the synteny structure.
hmm_groups

['TIGR00899.1', 'TIGR00171.1|TIGR02084.1', 'TIGR00170.1', 'TIGR00973.1']

In [10]:
from pynteny.filter import SyntenyParser


# Synteny structure to parse; the parenthesised entry lists two HMMs separated by "|".
syn_struct = ">TIGR00899.1 0 <(TIGR00171.1|TIGR02084.1) 0 <TIGR00170.1 1 <TIGR00973.1"
hmm_groups = SyntenyParser.getHMMgroupsInStructure(syn_struct)

# Give every HMM group its position in hmm_groups as an integer code.
hmm_dict = {hmm_group: position for position, hmm_group in enumerate(hmm_groups)}
hmm_dict

{'TIGR00899.1': 0,
 'TIGR00171.1|TIGR02084.1': 1,
 'TIGR00170.1': 2,
 'TIGR00973.1': 3}

In [13]:
# NOTE: the first assignment is overwritten right away, so only "TIGR02084.1" is looked up.
hmm = "TIGR00171.1"
hmm = "TIGR02084.1"
# Codes of every hmm_dict key that contains hmm as a substring.
[val for key, val in hmm_dict.items() if hmm in key]

def assignCodeToHMM(hmm_name: str, hmm_codes=None):
    """
    Return the code of the HMM group that lists hmm_name as one of its members.

    Args:
        hmm_name: name of a single HMM, e.g. "TIGR02084.1".
        hmm_codes: mapping from HMM group string (HMM names separated by "|")
            to the group's code. When omitted, the notebook-level hmm_dict
            is used.

    Returns:
        The code of the only group that contains hmm_name.

    Raises:
        ValueError: if hmm_name is a member of more than one group.
        IndexError: if hmm_name is not a member of any group (same exception
            type as indexing the empty match list, now with a message).
    """
    if hmm_codes is None:
        hmm_codes = hmm_dict
    code = [
        group_code for hmm_group, group_code in hmm_codes.items()
        # Compare whole HMM names: a substring test on the group string would
        # also accept partial names and names that are prefixes of other names.
        if hmm_name in hmm_group.split("|")
        ]
    if len(code) > 1:
        raise ValueError(
        f"HMM: {hmm_name} found in more than one hmm group in synteny structure"
        )
    if not code:
        raise IndexError(
        f"HMM: {hmm_name} not found in any hmm group in synteny structure"
        )
    return code[0]

[1]

In [6]:
# leuD: TIGR02084.1 | TIGR00171.1

array(['TIGR00170.1', 'TIGR02083.1'], dtype=object)

In [2]:

from pathlib import Path 

# Directory whose entries are listed below.
p = "/home/robaina/Documents/Pynteny/tests/"
# NOTE(review): this name shadows the built-in dir() for the rest of the session.
dir = Path(p)

# Materialise the iterator returned by iterdir() so the entries are displayed.
list(dir.iterdir())

[PosixPath('/home/robaina/Documents/Pynteny/tests/MG1655.fasta'),
 PosixPath('/home/robaina/Documents/Pynteny/tests/MG1655.gb'),
 PosixPath('/home/robaina/Documents/Pynteny/tests/MG1655_results'),
 PosixPath('/home/robaina/Documents/Pynteny/tests/sar11_results')]

In [3]:
# Check whether the path points to an existing directory.
dir.is_dir()

True

In [5]:
# Join a file name onto the base path with the "/" operator.
# NOTE(review): the recorded output is '/home/robaina/merged.fasta', which does not
# match the ".../Pynteny/tests/" value assigned to p in this notebook; p presumably
# held another value when this cell was last run.
infile = Path(p) / "merged.fasta"
str(infile)

'/home/robaina/merged.fasta'

In [7]:
# suffix holds only the last extension: the recorded output is '.gz', not '.tar.gz'.
file = Path("hola/que.tar.gz")
file.suffix

'.gz'

In [4]:
# A Path is rendered as its string form inside an f-string (see the recorded output).
f"-p {infile}"

'-p /home/robaina/merged.fasta'

In [3]:
# Only the last expression of a cell is displayed, so parent is evaluated but not shown.
infile.parent

# File name without its final suffix.
infile.stem

'merged'

In [2]:
# Wrapping an existing Path in Path() again still yields a path object (see the recorded output).
Path(Path(p))

PosixPath('/home/robaina')

In [10]:
from pathlib import Path, PosixPath

# Relative path used to try out a few Path methods.
output = Path("tests/outfile.txt")

# Evaluated but not displayed, because it is not the cell's last expression.
output.absolute().as_posix()

# Exact type comparison (subclasses would not count); the recorded output is True.
type(output) == PosixPath

True

In [None]:
from pynteny.preprocessing import removeCorruptedSequences


# Run pynteny's removeCorruptedSequences on the MAR reference peptide FASTA and
# write the result to a "_clean" file in the same directory.
# NOTE(review): presumably drops corrupted records; the exact meaning of
# is_peptide / keep_stop_codon should be confirmed against pynteny.preprocessing.
removeCorruptedSequences(
    fasta_file="/home/robaina/Databases/MAR_database/marref_V6_positioned.faa",
    output_file="/home/robaina/Databases/MAR_database/marref_V6_positioned_clean.faa",
    is_peptide=True,
    keep_stop_codon=True
)

In [1]:
from pynteny.utils import setDefaultOutputPath


# With only_dirname=True the recorded result is the directory that contains input_path.
name = setDefaultOutputPath(
    input_path="/home/robaina/Databases/MAR_database/marref_assembly_V6.fa",
    only_dirname=True
)

name

'/home/robaina/Databases/MAR_database'

In [1]:
import pandas as pd
from pynteny.utils import readFromPickleFile
from pynteny.filter import LabelParser
from pynteny.filter import SyntenyHMMfilter



# Synteny structure used to build the filter (single HMM per position).
synteny_structure = ">TIGR00899.1 0 <TIGR00171.1 0 <TIGR00170.1 1 <TIGR00973.1"
# NOTE(review): readFromPickleFile presumably unpickles this file; only load
# pickle files that come from a trusted source.
hmm_hits = readFromPickleFile("/home/robaina/Documents/Pynteny/hmm_hits.pickle")


# Build the filter object from the loaded hits and the structure string.
syntenyfilter = SyntenyHMMfilter(hmm_hits, synteny_structure)

In [1]:
import os 

from pynteny.utils import fullPathListDir

# Delete every entry of the prodigal directory whose path contains '.gbk'.
# NOTE(review): this is a substring test on whatever fullPathListDir yields
# (presumably full paths), so it is not limited to names ending in '.gbk'.
# The deletion is irreversible.
for file in fullPathListDir("/home/robaina/Databases/MarRef_1.7/prodigal"):
    if '.gbk' in file:
        os.remove(file)

In [1]:
from Bio import SeqIO, SeqRecord


# NOTE: file is reassigned twice below, so only the last path (BACL149.gbk) is parsed.
file = "/home/robaina/Documents/Pynteny/MG1655.gb"
file = "/home/robaina/Databases/MarRef_1.7/prodigal/marref.gbk"
file = "/home/robaina/Documents/Pynteny/BACL149.gbk"
# Read all records of the GenBank file into a list.
gbk_contigs = list(SeqIO.parse(file, 'genbank'))

In [2]:
# Keep the first record of the parsed GenBank file for inspection.
gbk_contig = gbk_contigs[0]

[SeqRecord(seq=Seq('ATCAACACCCAGTTTTTTCCAAGGTAAGTTTTCTGGATCGCTTTCGTGAAAACA...GCC'), id='1', name='1', description='Genus species strain strain', dbxrefs=[])]