In [19]:
import pandas as pd
import numpy as np
import Bio
from Bio.Seq import Seq
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
import matplotlib.pyplot as plt

from pathlib import Path

In [2]:
# Input files used by this notebook, as paths relative to the working directory:
#   train_test_path  - train/test segment table (TSV)
#   iadh_merged_path - merged results table (TSV)
#   aar_path         - aar transcript annotation (".csv" name, but read as
#                      tab-separated further down)
train_test_path, iadh_merged_path, aar_path = map(
    Path,
    (
        "iadh_out/ath_bol_aar/train_test2.tsv",
        "iadh_out/ath_bol_aar/merged_results.tsv",
        "data/annotation.all_transcripts.aar.csv",
    ),
)


In [3]:
# Load the three tab-separated input tables.
# `train_test` uses its first column as the index; `merged` keeps a default index.
merged = pd.read_csv(iadh_merged_path, sep="\t", header=0)
train_test = pd.read_csv(train_test_path, sep="\t", header=0, index_col=0)

# The annotation file is tab-separated despite its ".csv" name. The first 8 lines
# are skipped before the header row (presumably a metadata preamble - confirm
# against the file). The "#gene_id" header is renamed so the column can be
# referenced as `gene_id` in `DataFrame.query` expressions.
aar = (
    pd.read_csv(aar_path, sep="\t", header=0, skiprows=8)
    .rename(columns={"#gene_id": "gene_id"})
)

In [4]:
# Preview the first five rows of the train/test segment table.
train_test.head(n=5)

Unnamed: 0,segment_id,Similar,genome_x,chr_x,len_profile_x,genome_y,chr_y,len_profile_y,seq_x,seq_y
0,4053.0,True,aar,LG-7,5,ath,Chr1,5,TACTCGTCAATATCCTTTACAGCATCCTGTAAAGCACGACACCAAC...,TACTTTGCATCTCTCTTCTCATCATCATCAATATCCTTTATAGCAT...
1,4071.0,True,aar,sc-179,6,ath,Chr1,5,TAGTTTCGGGACATGGTTGGTTCAAGCAGCGGAGGCGCAGTCGCAA...,TGGCGTGGTCGTCGGAAACGCCGTCGTATTGCGGCTGGAATGAGCG...
2,4073.0,True,bol,C8,5,ath,Chr1,6,CAGGCGCTGTTACTCCTCCGTTTCTTACGGACGTGGTTAGGGTTAT...,CAGGCACTATTACTCCTTCGTTTCTTACGGACATGGTTTGGTTCGT...
3,4084.0,True,bol,C9,6,ath,Chr1,7,TGATATCGTGGCTGATGACTATCAAGGCGGTGCTAATATCTACCGG...,TGCAGATTGAGATTGACCATCTTAACAATGAGCATCTTGAATCTGT...
4,4089.0,True,aar,sc-136,7,ath,Chr1,5,TGGGAGATGAAGATAAATTGCTACCAATAGCAAATGTAGGAAGAAT...,TGGGGGCCAAAAGCAAAAGTTCGAGTACGAGATTTTTTATGTTTTA...


In [24]:
# Pick one segment whose x-side genome is "aar" (the second such row) and pull
# out its segment id, x-side sequence and x-side chromosome/scaffold label.
# NOTE(review): `id` and `chr` shadow Python builtins and `wholse_seq` looks like
# a typo for `whole_seq`; the names are kept because later cells reference them.
aar_segments = train_test.query('genome_x == "aar"')
id, wholse_seq, chr = aar_segments.iloc[1][["segment_id", "seq_x", "chr_x"]]
id = int(id)  # segment_id is stored as a float in train_test (e.g. 4071.0)

# Look up the first and last gene of that segment in the merged results table.
# `.item()` raises unless exactly one value matches.
segment_row = merged.query(f'id == {id}')
f_gene = segment_row["first_x"].item()
l_gene = segment_row["last_x"].item()

# Fetch the annotated sequence of each boundary gene and wrap everything in
# Biopython `Seq` objects so reverse complements can be taken later.
f_ref_seq = Seq(aar.query('gene_id == @f_gene')["seq"].item())
l_ref_seq = Seq(aar.query('gene_id == @l_gene')["seq"].item())
wholse_seq = Seq(wholse_seq)

# NOTE(review): the "[:10]" labels below are misleading - the values printed are
# sequence lengths, not 10-base prefixes. The string is left unchanged so the
# recorded output still matches.
print(f"id: {id}, first gene: {f_gene}, last gene: {l_gene}, chr: {chr}, seq[:10]: {len(wholse_seq)}, f_ref_seq[:10] {len(f_ref_seq)}, l_ref_seq[:10] {len(l_ref_seq)},")

id: 4071, first gene: Aa31sc179G20, last gene: Aa31sc179G70, chr: sc-179, seq[:10]: 46209, f_ref_seq[:10] 468, l_ref_seq[:10] 1905,


In [25]:
# Show the x-side location columns of the selected segment in the merged results
# table: list, begin/end, first/last gene and start/stop.
segment_location_cols = [
    "id",
    "genome_x",
    "list_x",
    "begin_x",
    "end_x",
    "first_x",
    "last_x",
    "start_x",
    "stop_x",
]
merged.query(f'id == {id}')[segment_location_cols]

Unnamed: 0,id,genome_x,list_x,begin_x,end_x,first_x,last_x,start_x,stop_x
4070,4071,aar,sc-179,1,6,Aa31sc179G20,Aa31sc179G70,7403,53612


In [26]:
# Print the reverse complement of the first gene's reference sequence, then the
# whole segment sequence prefixed with that reverse complement's first base
# (presumably so the two lines can be compared by eye - confirm intent).
f_ref_rc = f_ref_seq.reverse_complement()
print(f_ref_rc)
print(f_ref_rc[0] + wholse_seq)

CTAGTTTCGGGACATGGTTGGTTCAAGCAGCGGAGGCGCAGTCGCAATCACAGCAATTCGTGCTCGACGTTGCATATTACCATACCATCCAATAGACAAGTCAAAGGACTGTTGACGAAGCTGTCTATACTCTTGACAAAGAGCACATAGATGACAACAACAATGGACACAACAATCGCAACATGGTCTCTCCTTTAACTCATATTGTCCTCTTAATTTAGTCCGATAAAAGCACGAATAAAGGCTAGTCCCAACACAGCCTGTAGCCATCATTAACATCATGTACATGGCTCCACTCATCCCACATGATGTAGAGCCTCTGTCTACAATCTCAGCGATCCGGCCAAAGGCTACGCAGGGACACCAACACGTCAAGCAACATGAGTGGAGATCAAGACAACATTCACAAAGACCAGTGGACCAATCCATGGATTTGCCTTTAACAATGGAGGTTGTTGGAGGCTCCAT
CTAGTTTCGGGACATGGTTGGTTCAAGCAGCGGAGGCGCAGTCGCAATCACAGCAATTCGTGCTCGACGTTGCATATTACCATACCATCCAATAGACAAGTCAAAGGACTGTTGACGAAGCTGTCTATACTCTTGACAAAGAGCACATAGATGACAACAACAATGGACACAACAATCGCAACATGGTCTCTCCTTTAACTCATATTGTCCTCTTAATTTAGTCCGATAAAAGCACGAATAAAGGCTAGTCCCAACACAGCCTGTAGCCATCATTAACATCATGTACATGGCTCCACTCATCCCACATGATGTAGAGCCTCTGTCTACAATCTCAGCGATCCGGCCAAAGGCTACGCAGGGACACCAACACGTCAAGCAACCTGTTACGTCTCAAAATATACATAAATAAGTAACACACACATATATTATATATAATATGTAGAGTAAATATATCATGACTTACATGAGTGGAGATCAAGACAACATTCACAAAGACCAGTGGACCAATCCATGGATTTGCCTTTAACAATG

In [27]:
# Globally align the whole segment against the reverse complement of the FIRST
# gene's reference sequence.
# globalms scoring arguments, in order: match = 2, mismatch = -1,
# gap open = -0.5, gap extend = -0.1.
first_gene_rc = f_ref_seq.reverse_complement()
align = pairwise2.align.globalms(wholse_seq, first_gene_rc, 2, -1, -.5, -.1)

# Only the first alignment returned is displayed.
first_gene_hit = align[0]
print(format_alignment(*first_gene_hit))

-TAGTTTCGGGACATGGTTGGTTCAAGCAGCGGAGGCGCAGTCGCAATCACAGCAATTCGTGCTCGACGTTGCATATTACCATACCATCCAATAGACAAGTCAAAGGACTGTTGACGAAGCTGTCTATACTCTTGACAAAGAGCACATAGATGACAACAACAATGGACACAACAATCGCAACATGGTCTCTCCTTTAACTCATATTGTCCTCTTAATTTAGTCCGATAAAAGCACGAATAAAGGCTAGTCCCAACACAGCCTGTAGCCATCATTAACATCATGTACATGGCTCCACTCATCCCACATGATGTAGAGCCTCTGTCTACAATCTCAGCGATCCGGCCAAAGGCTACGCAGGGACACCAACACGTCAAGCAACCTGTTACGTCTCAAAATATACATAAATAAGTAACACACACATATATTATATATAATATGTAGAGTAAATATATCATGACTTACATGAGTGGAGATCAAGACAACATTCACAAAGACCAGTGGACCAATCCATGGATTTGCCTTTAACAATGGAGGTTGTTGGAGGCTCCATGTGTCTTTTTTTTATTCTCTTTTGTTTTTTTTTTTTCTTCTCTTGTGGTTATTTTATTAATCGTTAGGTAGTTATATGTAAAGGACGCGTATATTTATGGCACCACTTGATGAACTAACCGTCAGGAACCGCAAGGATGCTTCGTTTGGTGACTCAAAATCTAGACGCCTTTTTAATCATAATTATAAATTTATTAATAATCAAACTGAAATTATTTCATTACAAAGTAAAACTAAGGAGGTGCAGACACGATTAATAGTAGCAGTATATTTATTTAAATAGGCATACAAATTCAGTTATAAATTTATTTACAATTAATTGTGTCAAATTTGACATGTGAATGAATATATCACACAATAAATTCAAAATAATTATTATTGAATCAATTATATATCCCTATAGTATTTTATGATTTGATCATTTTAACATTATGCTTTAAGTATAATATA

In [32]:
# Globally align the whole segment against the reverse complement of the LAST
# gene's reference sequence.
# globalms scoring arguments, in order: match = 2, mismatch = -1,
# gap open = -0.5, gap extend = -0.1.
last_gene_rc = l_ref_seq.reverse_complement()
align = pairwise2.align.globalms(wholse_seq, last_gene_rc, 2, -1, -.5, -.1)

# Only the first alignment returned is displayed.
last_gene_hit = align[0]
print(format_alignment(*last_gene_hit))

TAGTTTCGGGACATGGTTGGTTCAAGCAGCGGAGGCGCAGTCGCAATCACAGCAATTCGTGCTCGACGTTGCATATTACCATACCATCCAATAGACAAGTCAAAGGACTGTTGACGAAGCTGTCTATACTCTTGACAAAGAGCACATAGATGACAACAACAATGGACACAACAATCGCAACATGGTCTCTCCTTTAACTCATATTGTCCTCTTAATTTAGTCCGATAAAAGCACGAATAAAGGCTAGTCCCAACACAGCCTGTAGCCATCATTAACATCATGTACATGGCTCCACTCATCCCACATGATGTAGAGCCTCTGTCTACAATCTCAGCGATCCGGCCAAAGGCTACGCAGGGACACCAACACGTCAAGCAACCTGTTACGTCTCAAAATATACATAAATAAGTAACACACACATATATTATATATAATATGTAGAGTAAATATATCATGACTTACATGAGTGGAGATCAAGACAACATTCACAAAGACCAGTGGACCAATCCATGGATTTGCCTTTAACAATGGAGGTTGTTGGAGGCTCCATGTGTCTTTTTTTTATTCTCTTTTGTTTTTTTTTTTTCTTCTCTTGTGGTTATTTTATTAATCGTTAGGTAGTTATATGTAAAGGACGCGTATATTTATGGCACCACTTGATGAACTAACCGTCAGGAACCGCAAGGATGCTTCGTTTGGTGACTCAAAATCTAGACGCCTTTTTAATCATAATTATAAATTTATTAATAATCAAACTGAAATTATTTCATTACAAAGTAAAACTAAGGAGGTGCAGACACGATTAATAGTAGCAGTATATTTATTTAAATAGGCATACAAATTCAGTTATAAATTTATTTACAATTAATTGTGTCAAATTTGACATGTGAATGAATATATCACACAATAAATTCAAAATAATTATTATTGAATCAATTATATATCCCTATAGTATTTTATGATTTGATCATTTTAACATTATGCTTTAAGTATAATATAC