In [1]:
import pandas as pd
import subprocess
import os

# Consensus vs TP data
In this first comparison we do not apply the filtering methods.

In [2]:
# Read the TP coordinates table, then report its dimensions, column dtypes and first rows
TP_data = pd.read_csv(
    "./data/TP_SIDERs_coor.csv",
    sep=",",
    header=0,
)
print(TP_data.shape, TP_data.dtypes, sep="\n")
TP_data.head()

(1606, 14)
qseqid       object
sseqid       object
pident      float64
length        int64
qstart        int64
qend          int64
sstart        int64
send          int64
evalue      float64
bitscore    float64
qlen          int64
slen          int64
sstrand      object
sseq         object
dtype: object


Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
0,1-112979r,LinJ.01,99.614,259,1,259,114017,113760,5.44e-135,472.0,259,278267,minus,GGGGTGAGGCGGCGGCGCACAGACACACACACACACACACACACGC...
1,1-129494r,LinJ.01,100.0,447,1,447,130577,130131,0.0,826.0,447,278267,minus,TGATGACGGGGCGGCGGCGCACAGACACACACACACACACACACAC...
2,1-136422d,LinJ.01,100.0,461,1,461,137458,137918,0.0,852.0,461,278267,plus,TGACGACGGGGTGGGGCGGCGGCGCACAGACACACACACACACACA...
3,1-204526d,LinJ.01,100.0,432,1,432,205558,205989,0.0,798.0,432,278267,plus,GATGACGGGGTGGGGCGGCGGCGCACACACACACACGCACACACCT...
4,1-24471d,LinJ.01,100.0,660,1,660,24099,24758,0.0,1219.0,660,278267,plus,GGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCATCCCAGG...


In [3]:
# Count the TP rows whose "sstart" is greater than their "send"
int(TP_data["sstart"].gt(TP_data["send"]).sum())

787

In [5]:
# Normalise the coordinates so that "sstart" <= "send" on every row.
# The rows to swap are selected by coordinate order rather than by strand, so re-running this
# cell is a no-op instead of swapping the same rows back to the reversed orientation.
reversed_rows = TP_data["sstart"] > TP_data["send"]
TP_data.loc[reversed_rows, ["sstart", "send"]] = TP_data.loc[reversed_rows, ["send", "sstart"]].to_numpy()

In [6]:
# After the correction no TP row should have "sstart" > "send": the expected count is 0
int(TP_data["sstart"].gt(TP_data["send"]).sum())

0

In [7]:
# Read the consensus sequences table, then report its dimensions, column dtypes and first rows
consensus_data = pd.read_csv(
    "./data_software_output/consensus_data_seqs.csv",
    sep=",",
    header=0,
)
print(consensus_data.shape, consensus_data.dtypes, sep="\n")
consensus_data.head()

(2940, 6)
sseqid     object
length      int64
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,length,sstart,send,sstrand,sseq
0,LinJ.01,1000,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,882,24199,25080,plus,CCCCCTCACCCTCTATCCCTGCCAACGCCGAACCACTTCTGGTGCT...
2,LinJ.01,890,35374,36263,plus,CCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGCAGC...
3,LinJ.01,777,39819,40595,plus,CGCACGCACAGCCACAGCTCACCTGGCACTCTGTTGCACGGCGGCT...
4,LinJ.01,890,54986,55875,plus,CTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGCAGC...


In [11]:
# Count the consensus rows whose "sstart" is greater than their "send"
int(consensus_data["sstart"].gt(consensus_data["send"]).sum())

0

In [12]:
# One group per chromosome (the "sseqid" column) in each data set
TP_data_grouped = TP_data.groupby(by="sseqid")
consensus_data_grouped = consensus_data.groupby(by="sseqid")

In [19]:
# Per-chromosome comparison of the TP regions against the consensus regions.
# For every chromosome both region sets are written as BED files under `path_1/<chromosome>/`
# and `bedops --element-of 1` reports which TP regions overlap a consensus region.
path_1 = "./compare/consensus_vs_TP"
compare_dict_1 = {}
Total = 0

# Index the consensus groups by chromosome name. Pairing the two groupby objects positionally
# (zip) silently compares different chromosomes as soon as one data set lacks a chromosome.
consensus_by_chr = {name: group for name, group in consensus_data_grouped}

for name1, group1 in TP_data_grouped:
    # group1 is the TP data of this chromosome
    path_chr = os.path.join(path_1, name1)
    os.makedirs(path_chr, exist_ok=True)

    # Only the three BED columns are needed, sorted by coordinate for bedops
    group1_total = group1[["sseqid", "sstart", "send"]].sort_values(by=["sstart", "send"])
    path_group1_total = os.path.join(path_chr, "group1_total.bed")
    group1_total.to_csv(path_group1_total, sep="\t", header=False, index=False)  # tab separator because of bed format

    group2 = consensus_by_chr.get(name1)
    if group2 is None:
        # No consensus region on this chromosome: nothing can overlap
        hits = 0
    else:
        group2_total = group2[["sseqid", "sstart", "send"]].sort_values(by=["sstart", "send"])
        path_group2_total = os.path.join(path_chr, "group2_total.bed")
        group2_total.to_csv(path_group2_total, sep="\t", header=False, index=False)  # tab separator because of bed format

        # Argument list instead of a shell string: no quoting problems with the paths
        cmd = ["bedops", "--element-of", "1", path_group1_total, path_group2_total]
        result = subprocess.check_output(cmd, universal_newlines=True)
        # bedops prints one matching TP region per line
        hits = sum(1 for line in result.split("\n") if line)

    compare_dict_1[name1] = [f"{hits}/{group1_total.shape[0]}", f"{hits/group1_total.shape[0]*100:.2f}"]
    Total += hits
print(f"""
There are {Total} TP sequences of {TP_data.shape[0]} in consensus data:
    - That's {Total/TP_data.shape[0]*100:.2f}% of the TP data
    - {TP_data.shape[0]-Total} TP sequences are not in consensus data, which is {100-Total/TP_data.shape[0]*100:.2f}%
"""
)
compare_dict_1


There are 1595 TP sequences of 1606 in consensus data:
    - That's 99.32% of the TP data
    - 11 TP sequences are not in consensus data, which is 0.68%



{'LinJ.01': ['7/7', '100.00'],
 'LinJ.02': ['18/18', '100.00'],
 'LinJ.03': ['6/8', '75.00'],
 'LinJ.04': ['18/18', '100.00'],
 'LinJ.05': ['18/18', '100.00'],
 'LinJ.06': ['22/22', '100.00'],
 'LinJ.07': ['27/27', '100.00'],
 'LinJ.08': ['18/18', '100.00'],
 'LinJ.09': ['21/21', '100.00'],
 'LinJ.10': ['26/26', '100.00'],
 'LinJ.11': ['39/39', '100.00'],
 'LinJ.12': ['36/36', '100.00'],
 'LinJ.13': ['22/22', '100.00'],
 'LinJ.14': ['66/66', '100.00'],
 'LinJ.15': ['25/25', '100.00'],
 'LinJ.16': ['34/34', '100.00'],
 'LinJ.17': ['34/34', '100.00'],
 'LinJ.18': ['30/30', '100.00'],
 'LinJ.19': ['40/40', '100.00'],
 'LinJ.20': ['35/37', '94.59'],
 'LinJ.21': ['38/39', '97.44'],
 'LinJ.22': ['27/27', '100.00'],
 'LinJ.23': ['60/60', '100.00'],
 'LinJ.24': ['55/57', '96.49'],
 'LinJ.25': ['41/42', '97.62'],
 'LinJ.26': ['45/45', '100.00'],
 'LinJ.27': ['63/63', '100.00'],
 'LinJ.28': ['46/46', '100.00'],
 'LinJ.29': ['74/74', '100.00'],
 'LinJ.30': ['71/71', '100.00'],
 'LinJ.31': ['80/80

# Consensus vs 32 chr data

From this point on, the filtering methods are required:

1. Run the `3.1.SIDERs_filter.py` file.
2. Filter the data from the consensus software.
3. Continue with this notebook.

In [26]:
# Load the data from the 32 chr
chr32_positive_data = pd.read_csv(
    "../14.From_all_to_real_SIDERs/data//4.SIDERs_filter_32chr/positives_testing_elements.csv",
    sep=",",
    header=None,  # read without a header row; the column names are assigned just below
)
chr32_positive_data.columns = ["sseqid", "length", "sstart", "send", "sstrand", "sseq"]
print(chr32_positive_data.shape, chr32_positive_data.dtypes, sep="\n")
chr32_positive_data.head()

(2184, 6)
sseqid     object
length      int64
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,length,sstart,send,sstrand,sseq
0,LinJ.01,1000,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,669,24199,24867,plus,CCCCCTCACCCTCTATCCCTGCCAACGCCGAACCACTTCTGGTGCT...
2,LinJ.01,926,35374,36299,plus,CCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGCAGC...
3,LinJ.01,1128,39790,40917,plus,ATTCTACCGCGAGCAAGGCAGCACACAGACGCACGCACAGCCACAG...
4,LinJ.01,926,54986,55911,plus,CTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGCAGC...


In [27]:
# Read the consensus elements kept as positives by the filter, then report shape, dtypes and first rows
consensus_positive_data = pd.read_csv(
    "./data_software_output/positives_testing_elements.csv",
    sep=",",
    header=0,
)
print(consensus_positive_data.shape, consensus_positive_data.dtypes, sep="\n")
consensus_positive_data.head()

(2128, 6)
sseqid     object
length      int64
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,length,sstart,send,sstrand,sseq
0,LinJ.01,1000,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,882,24199,25080,plus,CCCCCTCACCCTCTATCCCTGCCAACGCCGAACCACTTCTGGTGCT...
2,LinJ.01,890,35374,36263,plus,CCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGCAGC...
3,LinJ.01,777,39819,40595,plus,CGCACGCACAGCCACAGCTCACCTGGCACTCTGTTGCACGGCGGCT...
4,LinJ.01,890,54986,55875,plus,CTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGCAGC...


In [28]:
# Group both filtered data sets by their "sseqid" column
chr32_positive_data_grouped = chr32_positive_data.groupby(by="sseqid")
consensus_positive_data_grouped = consensus_positive_data.groupby(by="sseqid")

In [29]:
# Output directory for the "consensus vs 32 chr" comparison (created if it does not exist yet)
path_2 = "./compare/consensus_vs_32chr"
os.makedirs(name=path_2, exist_ok=True)

In [61]:
# Reusable version of the per-chromosome comparison, since it is needed more than once
def compare_sequences(df_group1, df_group2, df_group1_og_data, path):
    """Count, per chromosome, how many regions of the first data set overlap the second one.

    For every chromosome of ``df_group1`` the regions of both data sets are written as
    coordinate-sorted BED files to ``<path>/<chromosome>/group1_total.bed`` and
    ``group2_total.bed``; ``bedops --element-of 1`` is then run on the two files and the
    number of lines it prints is taken as the number of matching regions.

    Parameters
    ----------
    df_group1 : iterable of (name, DataFrame)
        Regions to look up, grouped by chromosome (e.g. ``df.groupby("sseqid")``).
        Each frame needs the columns "sseqid", "sstart" and "send".
    df_group2 : iterable of (name, DataFrame)
        Regions to compare against, grouped the same way.
    df_group1_og_data : DataFrame
        The ungrouped frame behind ``df_group1``; only its row count is used, as the
        denominator of the printed summary.
    path : str
        Directory under which the per-chromosome BED files are written.

    Returns
    -------
    dict
        ``{chromosome: ["<hits>/<regions in group 1>", "<percentage with 2 decimals>"]}``.

    Raises
    ------
    subprocess.CalledProcessError
        If ``bedops`` exits with a non-zero status.
    """
    # Index the second data set by chromosome name so that every group of the first data set
    # is compared against the SAME chromosome. Pairing both groupby objects positionally
    # (zip) misaligns all later chromosomes as soon as one data set lacks a chromosome.
    group2_by_name = {name: group for name, group in df_group2}

    compare_dict = {}
    total = 0
    for name1, group1 in df_group1:
        # Per-chromosome working directory under the caller-supplied `path`
        # (it used to be built from the global `path_1`, ignoring the argument).
        path_chr = os.path.join(path, name1)
        os.makedirs(path_chr, exist_ok=True)

        # Only the three BED columns are needed, sorted by coordinate for bedops
        group1_total = group1[["sseqid", "sstart", "send"]].sort_values(by=["sstart", "send"])
        path_group1_total = os.path.join(path_chr, "group1_total.bed")
        group1_total.to_csv(path_group1_total, sep="\t", header=False, index=False)  # tab separator because of bed format

        group2 = group2_by_name.get(name1)
        if group2 is None:
            # The second data set has no region on this chromosome: nothing can overlap
            hits = 0
        else:
            group2_total = group2[["sseqid", "sstart", "send"]].sort_values(by=["sstart", "send"])
            path_group2_total = os.path.join(path_chr, "group2_total.bed")
            group2_total.to_csv(path_group2_total, sep="\t", header=False, index=False)  # tab separator because of bed format

            # Argument list instead of a shell string: no quoting problems with the paths
            result = subprocess.check_output(
                ["bedops", "--element-of", "1", path_group1_total, path_group2_total],
                universal_newlines=True,
            )
            # bedops prints one matching region of the first file per line
            hits = sum(1 for line in result.split("\n") if line)

        n_group1 = group1_total.shape[0]
        compare_dict[name1] = [f"{hits}/{n_group1}", f"{hits/n_group1*100:.2f}"]
        total += hits

    n_total = df_group1_og_data.shape[0]
    # Guard the percentages against an empty first data set
    found_pct = total / n_total * 100 if n_total else 0.0
    missing_pct = 100 - found_pct if n_total else 0.0
    # NOTE(review): the summary wording says "TP" and "consensus" whatever data sets are
    # passed in; the text is kept unchanged so that it matches the recorded outputs.
    print(f"""
    There are {total} first DF sequences of {n_total} in the second DF:
        - That's {found_pct:.2f}% of the TP data
        - {n_total-total} TP sequences are not in consensus data, which is {missing_pct:.2f}%
    """
    )
    return compare_dict

In [64]:
# Are the filtered 32 chr elements found among the filtered consensus elements?
consensus_vs_32chr = compare_sequences(
    df_group1=chr32_positive_data_grouped,       # elements that are looked up ...
    df_group2=consensus_positive_data_grouped,   # ... inside this data
    df_group1_og_data=chr32_positive_data,       # ungrouped frame behind df_group1
    path=path_2,
)
consensus_vs_32chr


    There are 2167 first DF sequences of 2184 in the second DF:
        - That's 99.22% of the TP data
        - 17 TP sequences are not in consensus data, which is 0.78%
    


{'LinJ.01': ['13/13', '100.00'],
 'LinJ.02': ['14/16', '87.50'],
 'LinJ.03': ['6/8', '75.00'],
 'LinJ.04': ['12/12', '100.00'],
 'LinJ.05': ['21/22', '95.45'],
 'LinJ.06': ['28/28', '100.00'],
 'LinJ.07': ['32/32', '100.00'],
 'LinJ.08': ['38/38', '100.00'],
 'LinJ.09': ['33/33', '100.00'],
 'LinJ.10': ['44/44', '100.00'],
 'LinJ.11': ['42/42', '100.00'],
 'LinJ.12': ['47/47', '100.00'],
 'LinJ.13': ['32/33', '96.97'],
 'LinJ.14': ['43/43', '100.00'],
 'LinJ.15': ['27/27', '100.00'],
 'LinJ.16': ['38/38', '100.00'],
 'LinJ.17': ['50/50', '100.00'],
 'LinJ.18': ['59/59', '100.00'],
 'LinJ.19': ['58/58', '100.00'],
 'LinJ.20': ['49/49', '100.00'],
 'LinJ.21': ['42/42', '100.00'],
 'LinJ.22': ['62/62', '100.00'],
 'LinJ.23': ['75/76', '98.68'],
 'LinJ.24': ['54/54', '100.00'],
 'LinJ.25': ['41/41', '100.00'],
 'LinJ.26': ['52/53', '98.11'],
 'LinJ.27': ['79/79', '100.00'],
 'LinJ.28': ['59/59', '100.00'],
 'LinJ.29': ['123/127', '96.85'],
 'LinJ.30': ['106/106', '100.00'],
 'LinJ.31': ['1

# Consensus + LmSIDERs vs TP data

In [36]:
# Load the data from the LmSIDERs, then report its dimensions, column dtypes and first rows
LmSIDERs_data = pd.read_csv(
    "../14.From_all_to_real_SIDERs/data/2.SIDERs_filter/1.Merged_data.csv",
    sep=",",
    header=0,
)
print(LmSIDERs_data.shape, LmSIDERs_data.dtypes, sep="\n")
LmSIDERs_data.head()

(2951, 6)
sseqid     object
length      int64
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,length,sstart,send,sstrand,sseq
0,LinJ.01,1000,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,784,24093,24876,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
2,LinJ.01,927,35371,36297,plus,ACTCCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...
3,LinJ.01,806,39790,40595,plus,ATTCTACCGCGAGCAAGGCAGCACACAGACGCACGCACAGCCACAG...
4,LinJ.01,927,54983,55909,plus,ACTCTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...


In [37]:
# Count the LmSIDERs rows whose "sstart" is greater than their "send"
int(LmSIDERs_data["sstart"].gt(LmSIDERs_data["send"]).sum())

0

In [38]:
# Output directory for the "consensus + LmSIDERs vs TP" comparison (created if it does not exist yet)
path_3 = "./compare/consensus+LmSIDERs_vs_TP"
os.makedirs(name=path_3, exist_ok=True)

In [43]:
# Row counts of the two data sets that are about to be merged
print(
    f"\nconsensus data is {len(consensus_data)} long\n"
    f"LmSIDERs data is {len(LmSIDERs_data)} long\n"
)


consensus data is 2940 long
LmSIDERs data is 2951 long



In [41]:
# BED files that feed the bedops merge, both stored inside path_3
consensus_bed_path, LmSIDERs_bed_path = (
    os.path.join(path_3, bed_name) for bed_name in ("consensus.bed", "LmSIDERs.bed")
)

In [42]:
# Write the BED files (chromosome, start, end; tab-separated, no header) used for the merge.
# bedops expects sorted input, so the rows are sorted explicitly by chromosome, start and end
# instead of relying on the order in which the CSV files happened to be stored.
consensus_data[["sseqid", "sstart", "send"]].sort_values(by=["sseqid", "sstart", "send"]).to_csv(
    consensus_bed_path, sep="\t", header=False, index=False
)
LmSIDERs_data[["sseqid", "sstart", "send"]].sort_values(by=["sseqid", "sstart", "send"]).to_csv(
    LmSIDERs_bed_path, sep="\t", header=False, index=False
)

In [44]:
# Merge both BED files with `bedops --merge` and load the merged regions into a DataFrame.
# The command is passed as an argument list (no shell), so the paths need no quoting.
cmd = ["bedops", "--merge", consensus_bed_path, LmSIDERs_bed_path]
consensus_LMSIDERs_merged = subprocess.check_output(cmd, universal_newlines=True)
# One merged region per non-empty output line: chromosome, start, end (tab-separated)
consensus_LMSIDERs_merged_df = pd.DataFrame(
    [line.split("\t") for line in consensus_LMSIDERs_merged.split("\n") if line],
    columns=["sseqid", "sstart", "send"],
)
# The coordinates come back as text; convert them to numbers
consensus_LMSIDERs_merged_df[["sstart", "send"]] = consensus_LMSIDERs_merged_df[["sstart", "send"]].apply(pd.to_numeric)
print(consensus_LMSIDERs_merged_df.shape)
print(consensus_LMSIDERs_merged_df.dtypes)
print(f"consensus_LmSIDERs_merged_df is {consensus_LMSIDERs_merged_df.shape[0]} long")
consensus_LMSIDERs_merged_df.head()


(2783, 3)
sseqid    object
sstart     int64
send       int64
dtype: object
consensus_LmSIDERs_merged_df is 2783 long


Unnamed: 0,sseqid,sstart,send
0,LinJ.01,1,1000
1,LinJ.01,24093,25080
2,LinJ.01,35371,36297
3,LinJ.01,39790,40595
4,LinJ.01,54983,55909


In [48]:
# Retrieve the sequence of every region
def get_data_sequence(data, strand, genome_fasta):
    """Fetch the sequence of every region in ``data`` with ``blastdbcmd``.

    Parameters
    ----------
    data : DataFrame
        Regions to extract; the columns "sseqid", "sstart" and "send" are read.
    strand : str
        Value passed to ``blastdbcmd -strand`` and stored in the "sstrand" column.
    genome_fasta : str
        Value passed to ``blastdbcmd -db``.

    Returns
    -------
    DataFrame
        One row per input region with the columns "sseqid", "sstart", "send", "sstrand"
        and "sseq". The columns are present even when ``data`` has no rows.

    Raises
    ------
    subprocess.CalledProcessError
        If ``blastdbcmd`` exits with a non-zero status.
    """
    columns = ["sseqid", "sstart", "send", "sstrand", "sseq"]
    sequences = []
    for sseqid, start, end in zip(data["sseqid"], data["sstart"], data["send"]):
        # Argument list instead of a shell string: ids and paths containing spaces or
        # shell metacharacters are passed through untouched.
        cmd = [
            "blastdbcmd",
            "-db", str(genome_fasta),
            "-entry", str(sseqid),
            "-range", f"{start}-{end}",
            "-strand", str(strand),
            "-outfmt", "%s",
        ]
        # Drop the line breaks so the sequence is stored as one string
        sequence = subprocess.check_output(cmd, universal_newlines=True).replace("\n", "")

        sequences.append({
            "sseqid": sseqid,
            "sstart": start,
            "send": end,
            "sstrand": strand,
            "sseq": sequence
        })

    # Explicit columns keep the schema stable for empty input
    return pd.DataFrame(sequences, columns=columns)

In [49]:
# Fetch the "plus"-strand sequence of every merged region, then report shape, dtypes and first rows
consensus_LMSIDERs_seqs = get_data_sequence(
    data=consensus_LMSIDERs_merged_df,
    strand="plus",
    genome_fasta="./data/L_infantum/TriTrypDB-67_LinfantumJPCM5_Genome.fasta",
)
print(consensus_LMSIDERs_seqs.shape, consensus_LMSIDERs_seqs.dtypes, sep="\n")
consensus_LMSIDERs_seqs.head()

(2783, 5)
sseqid     object
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,sstart,send,sstrand,sseq
0,LinJ.01,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,24093,25080,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
2,LinJ.01,35371,36297,plus,ACTCCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...
3,LinJ.01,39790,40595,plus,ATTCTACCGCGAGCAAGGCAGCACACAGACGCACGCACAGCCACAG...
4,LinJ.01,54983,55909,plus,ACTCTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...


In [55]:
# Store the merged regions and their sequences as a comma-separated file (with header, without index)
path_consensus_LMSIDERs_seqs = os.path.join(path_3, "consensus_LmSIDERs_seqs.csv")
consensus_LMSIDERs_seqs.to_csv(path_consensus_LMSIDERs_seqs, index=False)

In [51]:
# Group the merged regions by their "sseqid" column
consensus_LMSIDERs_seqs_grouped = consensus_LMSIDERs_seqs.groupby(by="sseqid")

In [63]:
# Are the TP elements found among the merged consensus + LmSIDERs regions?
consensus_LMSIDERs_vs_TP = compare_sequences(
    df_group1=TP_data_grouped,                   # elements that are looked up ...
    df_group2=consensus_LMSIDERs_seqs_grouped,   # ... inside this data
    df_group1_og_data=TP_data,                   # ungrouped frame behind df_group1
    path=path_3,
)
consensus_LMSIDERs_vs_TP


    There are 1595 first DF sequences of 1606 in the second DF:
        - That's 99.32% of the TP data
        - 11 TP sequences are not in consensus data, which is 0.68%
    


{'LinJ.01': ['7/7', '100.00'],
 'LinJ.02': ['18/18', '100.00'],
 'LinJ.03': ['6/8', '75.00'],
 'LinJ.04': ['18/18', '100.00'],
 'LinJ.05': ['18/18', '100.00'],
 'LinJ.06': ['22/22', '100.00'],
 'LinJ.07': ['27/27', '100.00'],
 'LinJ.08': ['18/18', '100.00'],
 'LinJ.09': ['21/21', '100.00'],
 'LinJ.10': ['26/26', '100.00'],
 'LinJ.11': ['39/39', '100.00'],
 'LinJ.12': ['36/36', '100.00'],
 'LinJ.13': ['22/22', '100.00'],
 'LinJ.14': ['66/66', '100.00'],
 'LinJ.15': ['25/25', '100.00'],
 'LinJ.16': ['34/34', '100.00'],
 'LinJ.17': ['34/34', '100.00'],
 'LinJ.18': ['30/30', '100.00'],
 'LinJ.19': ['40/40', '100.00'],
 'LinJ.20': ['35/37', '94.59'],
 'LinJ.21': ['38/39', '97.44'],
 'LinJ.22': ['27/27', '100.00'],
 'LinJ.23': ['60/60', '100.00'],
 'LinJ.24': ['55/57', '96.49'],
 'LinJ.25': ['41/42', '97.62'],
 'LinJ.26': ['45/45', '100.00'],
 'LinJ.27': ['63/63', '100.00'],
 'LinJ.28': ['46/46', '100.00'],
 'LinJ.29': ['74/74', '100.00'],
 'LinJ.30': ['71/71', '100.00'],
 'LinJ.31': ['80/80

# Consensus + LmSIDERS vs 32 chr data

In [54]:
# Output directory for the "consensus + LmSIDERs vs 32 chr" comparison (created if it does not exist yet)
path_4 = "./compare/consensus+LmSIDERs_vs_32chr"
os.makedirs(name=path_4, exist_ok=True)

From this point on, the filtering methods are required:

1. Run the `3.1.SIDERs_filter.py` file.
2. Filter the data from consensus + LmSIDERs.
3. Continue with this notebook.

In [56]:
# Read the consensus + LmSIDERs elements kept by the filter, then report shape, dtypes and first rows
consensus_LmSIDERS_positive_data = pd.read_csv(
    "./compare/consensus+LmSIDERs_vs_TP/positives_testing_elements.csv",
    sep=",",
    header=0,
)
print(consensus_LmSIDERS_positive_data.shape, consensus_LmSIDERS_positive_data.dtypes, sep="\n")
consensus_LmSIDERS_positive_data.head()

(2069, 5)
sseqid     object
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,sstart,send,sstrand,sseq
0,LinJ.01,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,24093,25080,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
2,LinJ.01,35371,36297,plus,ACTCCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...
3,LinJ.01,39790,40595,plus,ATTCTACCGCGAGCAAGGCAGCACACAGACGCACGCACAGCCACAG...
4,LinJ.01,54983,55909,plus,ACTCTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...


In [57]:
# Group the filtered consensus + LmSIDERs elements by their "sseqid" column
consensus_LmSIDERS_positive_data_grouped = consensus_LmSIDERS_positive_data.groupby(by="sseqid")

In [62]:
# Are the filtered 32 chr elements found among the filtered consensus + LmSIDERs elements?
consensus_LMSIDERs_vs_32chr = compare_sequences(
    df_group1=chr32_positive_data_grouped,                # elements that are looked up ...
    df_group2=consensus_LmSIDERS_positive_data_grouped,   # ... inside this data
    df_group1_og_data=chr32_positive_data,                # ungrouped frame behind df_group1
    path=path_4,
)
consensus_LMSIDERs_vs_32chr


    There are 2172 first DF sequences of 2184 in the second DF:
        - That's 99.45% of the TP data
        - 12 TP sequences are not in consensus data, which is 0.55%
    


{'LinJ.01': ['13/13', '100.00'],
 'LinJ.02': ['16/16', '100.00'],
 'LinJ.03': ['6/8', '75.00'],
 'LinJ.04': ['12/12', '100.00'],
 'LinJ.05': ['21/22', '95.45'],
 'LinJ.06': ['28/28', '100.00'],
 'LinJ.07': ['32/32', '100.00'],
 'LinJ.08': ['38/38', '100.00'],
 'LinJ.09': ['33/33', '100.00'],
 'LinJ.10': ['44/44', '100.00'],
 'LinJ.11': ['42/42', '100.00'],
 'LinJ.12': ['47/47', '100.00'],
 'LinJ.13': ['32/33', '96.97'],
 'LinJ.14': ['43/43', '100.00'],
 'LinJ.15': ['27/27', '100.00'],
 'LinJ.16': ['38/38', '100.00'],
 'LinJ.17': ['50/50', '100.00'],
 'LinJ.18': ['59/59', '100.00'],
 'LinJ.19': ['58/58', '100.00'],
 'LinJ.20': ['49/49', '100.00'],
 'LinJ.21': ['42/42', '100.00'],
 'LinJ.22': ['62/62', '100.00'],
 'LinJ.23': ['75/76', '98.68'],
 'LinJ.24': ['54/54', '100.00'],
 'LinJ.25': ['41/41', '100.00'],
 'LinJ.26': ['53/53', '100.00'],
 'LinJ.27': ['79/79', '100.00'],
 'LinJ.28': ['59/59', '100.00'],
 'LinJ.29': ['124/127', '97.64'],
 'LinJ.30': ['106/106', '100.00'],
 'LinJ.31': [