In [188]:
import re

# [Sequences]
# FrontSequence = AGCTAGGCTTAATCG
# BackSequence = ATCGTTGGAGTCA
# CoreUnit = ATCG

# [Repetitions]
# 1 = ATCG
# 2 = TTAA
# 3 = GAGT

In [192]:
# 解析INI配置文件
def parse_ini_file(file_path):
    ini_config = {}  # 初始化配置字典
    with open(file_path, 'r') as file:
        current_section = None
        for line in file:
            line = line.strip()  # 去除行首尾的空白字符
            if line.startswith('[') and line.endswith(']'):  # [Sequences]
                current_section = line[1:-1]  # 解析当前节的名称，Sequences
                ini_config[current_section] = {}  # 在配置字典中创建当前节的空字典
            elif '=' in line:  # FrontSequence = AGCTAGGCTTAATCG
                # 这是一个字符串的方法，用于将字符串按照指定的分隔符进行分割。
                # 在这里，使用等号（'='）作为分隔符。参数1表示最多分割成两部分，即将分割后的结果存储为一个包含两个元素的列表。
                # key, value：这是一个解构赋值的语法，将列表中的两个元素分别赋值给变量key和value。
                # 通过这行代码，可以将包含键值对的字符串按照等号进行分割，并将分割后的键和值分别存储在变量key和value中。这样就可以处理INI配置文件中的键值对信息。
                key, value = line.split('=', 1)  # 解析键值对
                ini_config[current_section][key.strip()] = value.strip()  # 去除行首尾的空白字符,将键值对存储在当前节的字典中
    return ini_config
ini_config = parse_ini_file('example.ini')  # 解析INI配置文件
ini_config

{'Sequences': {'FrontSequence': 'AGCTAGGCTTAATCG',
  'BackSequence': 'ATCGTTGGAGTCA',
  'CoreUnit': 'ATCG,TAGG'},
 'Repetitions': {'1': 'ATCG', '2': 'TTAA', '3': 'GAGT'}}

In [193]:
def front_fuzzy_match(pattern, sequence):
    patternCopy = pattern
    pattern = list(pattern)
    sequence = list(sequence)
    len_pattern = len(pattern)
    len_sequence = len(sequence)
    if len_pattern > len_sequence + 1:
        return [-1, '-1', '未匹配成功', len_pattern]
    if pattern == sequence[:len_pattern]:
        return [''.join(pattern), '0', '无突变', len_pattern]
    # 判断是否为替换
    count = 0
    pos = -1
    c = ''
    for i in range(len_pattern):
        if pattern[i] != sequence[i]:
            count += 1
            pos = i
            c = sequence[i]
            if count >= 2:
                break
    if count <= 1:
        return [''.join(sequence[:len_pattern]),  '='+str(pos)+c, f'替换了索引{pos}的位置为{c}', len_pattern]
    
    else:  
        # 判断是否缺失
        pos = -1
        count = 0
        c = ''
        for i in range(len_pattern-1):
            if pattern[i] != sequence[i]:
                count += 1
                if count >= 2:
                    break
                pos = i
                c = pattern[i]
                del pattern[i]
        if count <= 1:
            return [''.join(sequence[:len_pattern]),  '-'+str(pos)+c, f'缺失了索引{pos}的位置，缺失了{c}', len_pattern-1]
        else: # 判断是否新增
            pattern = patternCopy
            pos = -1
            count = 0
            c = ''
            for i in range(len_pattern):
                if pattern[i] != sequence[i]:
                    
                    count += 1
                    if count >= 2:
                        break
                    pos = i
                    c = sequence[i]
                    del sequence[i]
            if count <= 1:
                return [''.join(sequence[:len_pattern+1]), '+'+str(pos)+c, f'新增了索引{pos}的位置，新增了{c}', len_pattern+1]
    return [-1, '-1', '未匹配成功', len_pattern]
# 正常的示例
pattern =  'AAGC'
sequence = 'AAGCTAGGCTTAATCGATCGATCGATCGATCGTTGGAGTCA'
print(front_fuzzy_match(pattern, sequence))        
pattern =  'AAGCTAGGCTTAATCG'
sequence = 'AAGCTAGGCTTAATCGATCGATCGATCGATCGTTGGAGTCA'
print(front_fuzzy_match(pattern, sequence))

# 替换的示例
pattern =  'AATC'
sequence = 'AAGCTAGGCTTAATCGATCGATCGATCGATCGTTGGAGTCA'
print(front_fuzzy_match(pattern, sequence))        
pattern =  'ABGCTAGGCTTAAT'
sequence = 'AAGCTAGGCTTAATCGATCGATCGATCGATCGTTGGAGTCA'
print(front_fuzzy_match(pattern, sequence))

# 缺失的示例
pattern =  'AAGT'
sequence = 'AATAGGCTTAATCGATCGATCGATCGATCGTTGGAGTCA'
print(front_fuzzy_match(pattern, sequence))
pattern =  'AATAGGCTTAATCG'
sequence = 'AAAGGCTTAATCGATCGATCGATCGATCGTTGGAGTCA'
print(front_fuzzy_match(pattern, sequence))

# 新增的示例
pattern =  'AATAG'
sequence = 'AAATAGGCTTAATCGATCGATCGATCGATCGTTGGAGTCA'
print(front_fuzzy_match(pattern, sequence))
pattern =  'AAAGGCTTAATCGAT'
sequence = 'CTTAATCGATCGATCGATCGTAGGATCGATCGATCGATCGTTGGAGATCGATCGATCGATCGTTGGAGCA'
print(front_fuzzy_match(pattern, sequence))

['AAGC', '0', '无突变', 4]
['AAGCTAGGCTTAATCG', '0', '无突变', 16]
['AAGC', '=2G', '替换了索引2的位置为G', 4]
['AAGCTAGGCTTAAT', '=1A', '替换了索引1的位置为A', 14]
['AATA', '-2G', '缺失了索引2的位置，缺失了G', 3]
['AAAGGCTTAATCGA', '-2T', '缺失了索引2的位置，缺失了T', 13]
['AATAGG', '+2A', '新增了索引2的位置，新增了A', 6]
[-1, '-1', '未匹配成功', 15]


In [181]:
def back_fuzzy_match(pattern, sequence):
    patternCopy = pattern
    sequenceCopy = sequence
    pattern = list(pattern)
    sequence = list(sequence)
    len_pattern = len(pattern)
    len_sequence = len(sequence)
    if len_pattern > len_sequence + 1:
        return [-1, '-1', '未匹配成功', len_pattern]
    if pattern == sequence[-len_pattern:]:
        return [''.join(pattern), '0', '无突变', len_pattern]
    # 判断是否为替换
    count = 0
    pos = -1
    c = ''
    for i in range(len_pattern):
        if pattern[i] != sequence[len_sequence - len_pattern + i]:
            count += 1
            pos = i
            c = sequence[len_sequence - len_pattern + i]
            if count >= 2:
                break
    if count <= 1:
        return [''.join(sequence[-len_pattern:]),  '='+str(pos)+c, f'替换了索引{pos}的位置为{c}', len_pattern]
    
    else:  
        # 判断是否缺失
        pos = -1
        count = 0
        c = ''
        for i in range(len_pattern-1):
            if pattern[i] != sequence[len_sequence - len_pattern + i + 1]:
                count += 1
                if count >= 2:
                    break
                pos = i
                c = pattern[i]
                del pattern[i]
        if count <= 1:
            if pos == -1:
                c = pattern[-1]
                return [''.join(sequence[-len_pattern+1:]),  '-'+str(len_pattern-1)+c, f'缺失了索引{len_pattern-1}的位置，缺失了{c}', len_pattern-1]
            print(pos, i, c, len_pattern, pattern)
            return [''.join(sequence[-len_pattern+1:]),  '-'+str(pos)+c, f'缺失了索引{pos}的位置，缺失了{c}', len_pattern-1]
        else: # 判断是否新增
            pattern = patternCopy
            pos = -1
            count = 0
            c = ''
            for i in range(len_pattern):
                if pattern[i] != sequence[len_sequence - len_pattern + i - 1]:
                    
                    count += 1
                    if count >= 2:
                        break
                    pos = i
                    c = sequence[len_sequence - len_pattern + i - 1]
                    del sequence[len_sequence - len_pattern + i - 1]
            if count <= 1:
                if pos == -1:
                    c = sequenceCopy[-1] # pos
                    return [''.join(sequenceCopy[-len_pattern-1:]), '+'+str(len_pattern)+c, f'新增了索引{len_pattern}的位置，新增了{c}', len_pattern+1]
                return [''.join(sequenceCopy[-len_pattern-1:]), '+'+str(pos)+c, f'新增了索引{pos}的位置，新增了{c}', len_pattern+1]
    return [-1, '-1', '未匹配成功', len_pattern]

# # 正常的示例
# pattern =  'AGTCA'
# sequence = 'AAGCTAGGCTTAATCGATCGATCGATCGATCGTTGGAGTCA'
# print(back_fuzzy_match(pattern, sequence))        
# pattern =  'CGATCGTTGGAGTCA'
# sequence = 'AAGCTAGGCTTAATCGATCGATCGATCGATCGTTGGAGTCA'
# print(back_fuzzy_match(pattern, sequence))

# # 替换的示例
# pattern =  'AGTAA'
# sequence = 'AAGCTAGGCTTAATCGATCGATCGATCGATCGTTGGAGTCA'
# print(back_fuzzy_match(pattern, sequence))        
# pattern =  'TCGATCGTAGGAGTCA'
# sequence = 'AAGCTAGGCTTAATCGATCGATCGATCGATCGTTGGAGTCA'
# print(back_fuzzy_match(pattern, sequence))

# # 缺失的示例
# pattern =  'AGTCA'
# sequence = 'AAGCTAGGCTTAATCGATCGATCGATCGATCGTTGGAGTA'
# print(back_fuzzy_match(pattern, sequence))        
# pattern =  'ATCGATCGTTGGAGTCA'
# sequence = 'AAGCTAGGCTTAATCGATCGATCGATCGATCGTTGGAGCA'
# print(back_fuzzy_match(pattern, sequence))

# 新增的示例
pattern =  'AGTCA'
sequence = 'AAGCTAGGCTTAATCGATCGATCGATCGATCGTTGGAGTCAA'
print(back_fuzzy_match(pattern, sequence))  
pattern =  'AGTCA'
sequence = 'AAGCTAGGCTTAATCGATCGATCGATCGATCGTTGGAGTC'
print(back_fuzzy_match(pattern, sequence))     
pattern =  'AGTCA'
sequence = 'AAGCTAGGCTTAATCGATCGATCGATCGATCGTTGGAGTCCA'
print(back_fuzzy_match(pattern, sequence))           
pattern =  'CGATCGTTGGAGTCA'
sequence = 'AAGCTAGGCTTAATCGATCGATCGATCGATCGTTGGAGTACA'
print(back_fuzzy_match(pattern, sequence))

['AGTCAA', '+5A', '新增了索引5的位置，新增了A', 6]
['AGTC', '-4A', '缺失了索引4的位置，缺失了A', 4]
['AGTCCA', '+4C', '新增了索引4的位置，新增了C', 6]
['CGATCGTTGGAGTACA', '+13A', '新增了索引13的位置，新增了A', 16]


In [191]:

def decode_core(patterns, sequence):
    for i in patterns:
        pattern = '('+i+')+'
    # 使用sub函数替换匹配项为[ATCG]n形式
    # re.sub(pattern, replacement, string): 这是re模块中用于替换的函数。它接受三个参数：
    # pattern: 正则表达式，用于匹配需要替换的文本。
    # replacement: 替换的内容。在这里我们使用了一个lambda表达式来动态计算重复的次数，并拼接成形如[ATCG]n的格式。
    # string: 要搜索和替换的原始字符串。
    # '(ATCG)+': 这是正则表达式的模式，用于匹配连续出现的ATCG序列。+表示匹配前面的表达式（这里是ATCG）一次或多次，因此它会匹配一个或多个连续的ATCG字符串。
    # lambda x: f"[ATCG]{int(len(x.group())/4)}": 这是替换函数，使用lambda表达式定义。当re.sub找到匹配项后，它将调用这个函数来生成替换内容。这个函数的输入x是一个re.Match对象，表示找到的匹配项。
    # x.group(): 这是re.Match对象的方法，用于返回找到的匹配项。在这里，x.group()会返回匹配的连续ATCG序列。
    # len(x.group()): 这将计算匹配项的长度，也就是连续ATCG序列的字符个数。
    # int(len(x.group())/4): 这是计算重复的次数n。因为我们知道每个[ATCG]的长度是4，所以通过将匹配项长度除以4取整即可得到重复的次数。
    # f"[ATCG]{int(len(x.group())/4)}": 这将构建替换内容，使用f-string来将计算得到的重复次数n插入到"[ATCG]"中，形成形如[ATCG]n的格式。
    # 最终，re.sub函数将匹配到的连续ATCG序列替换为了[ATCG]n的形式
        sequence = re.sub(pattern, lambda x: f"[{i}]{int(len(x.group())/len(i))}", sequence)
    return sequence

# 示例
# s = 'ATCGATCGTAGGTAGGTAGGATCGATCGTAGGTAGGTAGGTAGGATCGATCG'
# pattern = ['ATCG', 'TAGG']
# print(decode_core(pattern, s))

# pattern = ['ATCG', 'TAGGTAGG']
# print(decode_core(pattern, s))

[ATCG]2[TAGG]3[ATCG]2[TAGG]4[ATCG]2
[ATCG]2[TAGGTAGG]1TAGG[ATCG]2[TAGGTAGG]2[ATCG]2


In [187]:
def decode_ini(file_path, sequence):
    ini_config = parse_ini_file(file_path)
    FrontSequence = ini_config['Sequences']['FrontSequence']
    BackSequence = ini_config['Sequences']['BackSequence']
    CoreUnit = ini_config['Sequences']['CoreUnit'].split(',')
    frontLi = front_fuzzy_match(FrontSequence, sequence)
    Front = sequence[:frontLi[3]]
    # print(CoreUnit, ini_config, type(CoreUnit))
    # print(sequence[frontLi[3]:])
    # print(Front)
    backLi = back_fuzzy_match(BackSequence, sequence)
    # print(backLi)
    Back = sequence[len(sequence)-backLi[3]:]
    # print(Back)
    core_sequence = sequence[frontLi[3]:len(sequence)-backLi[3]]
    # print(core_sequence)
    # 定义正则表达式，匹配所有ATCG重复序列
    # pattern = r'(ATCG)+'
    core = decode_core(CoreUnit, core_sequence)
    # print(core)
    print("前侧翼序列：", Front, frontLi[1], frontLi[2])
    print("后侧翼序列：", Back, backLi[1], backLi[2])
    print("核心区：", core)


# AGCTAGGCTTAATCG ATCG ATCG ATCG ATCG ATCG ATCG ATCG TTGGAG ATCG ATCG ATCG ATCGTTGGAGTCA
print('测试')
print('\n正常：')
decode_ini('example.ini', 'AGCTAGGCTTAATCGGATCGATVJCGATCGATCGATCGATCGATCGTTGGAGATCGATCGATCGATCGTTGGAGTC')
# print('\n替换：')
# decode_ini('example.ini', 'ACCTAGGCTTAATCGATCGATCGATCGATCGATCGATCGATCGTTGGAGATCGATCGATCGATCGTTGGAGCCA')
# print('\n缺少：')
# decode_ini('example.ini', 'ACTAGGCTTAATCGATCGATCGATCGTAGGATCGATCGATCGATCGTTGGAGATCGATCGATCGATCGTTGGAGCA')
# print('\n新增：')
# decode_ini('example.ini', 'AGGCTAGGCTTAATCGATCGATCGATCGATCGATCGATCGATCGTTGGAGATCGATCGATCGATCGTTGGAGTTCA')
# print('\n示例：')
# # AAGCTAGGCTTAATCG ATCGATCGATCG ATCGTTGGAGTCA
# decode_ini('example.ini', 'AAGCTAGGCTTAATCGATCGATCGATCGATCGTTGGAGTCA')
# print('\n无法匹配：')
# decode_ini('example.ini', 'CTTAATCGATCGATCGATCGTAGGATCGATCGATCGATCGTTGGAGATCGATCGATCGATCGTTGGAGCA')

# print('\nTAGG示例：')
# decode_ini('example.ini', 'ATCGATCGTAGGTAGGTAGGATCGATCGTAGGTAGGTAGGTAGGATCGATCG')

测试

正常：
前侧翼序列： AGCTAGGCTTAATCG 0 无突变
后侧翼序列： ATCGTTGGAGTC -12A 缺失了索引12的位置，缺失了A
核心区： G[ATCG]1ATVJCG[ATCG]5TTGGAG[ATCG]3
