In [1]:
##Import
from Bio import SeqIO
import pandas as pd
from collections import Counter
import fuzzysearch
# Bind `plt` to pyplot (the conventional meaning of the alias) rather than to
# the top-level matplotlib package; rcParams is reachable from pyplot as well.
import matplotlib.pyplot as plt
# Font type 42 makes the PDF backend embed TrueType fonts.
plt.rcParams['pdf.fonttype'] = 42

In [2]:
##Load key of conditions and files
# Columns kept from the key workbook: sample metadata, the edit-site
# description, the flanking sequences, and the three replicate columns.
key_columns = [
    "Phage", "Gene", "Plasmid", "Direction", "edit name", "Genome Position",
    "WT nt", "Edited nt",
    "L_inside", "R_inside", "L_outside", "R_outside",
    "Rep1", "Rep2", "Rep3",
]
key_table = pd.read_excel("Edit_site_analysis_demo_key.xls")
samples = key_table[key_columns]

In [3]:
samples

Unnamed: 0,Phage,Gene,Plasmid,Direction,edit name,Genome Position,WT nt,Edited nt,L_inside,R_inside,L_outside,R_outside,Rep1,Rep2,Rep3
0,Lambda,L,pSBK.147,F,L_TAA_TGA,14126,A,G,CTTTCGCAGT,AATCCCATGA,TGTAAGTTCCGCAATAACGT,CGATGTGCGCCAGCGGAGTC,msSBK_34_01,msSBK_34_25,msCF_02_134
1,Lambda,L,pSBK.148,R,L_TAA_TGA,14126,A,G,CTTTCGCAGT,AATCCCATGA,TGTAAGTTCCGCAATAACGT,CGATGTGCGCCAGCGGAGTC,msSBK_34_02,msSBK_34_26,msCF_02_135


In [4]:
##Globals
# Read tallies, filled in later as outcomes_dict[sample index][replicate][outcome].
outcomes_dict = dict()

In [5]:
##Defs
def extract_and_match(read,index,rep):
    """Classify one read by the sequence found at the edit site.

    The read must contain exactly one fuzzy match (Levenshtein distance <= 2)
    to each outside flank and exactly one fuzzy match (distance <= 1) to each
    inside flank of the sample. The sequence between the two inside flanks is
    then compared with the sample's "WT nt" and "Edited nt" values.

    Parameters
    ----------
    read : str
        Read sequence to classify.
    index : label
        Index label of the sample's row in the module-level ``samples`` table,
        i.e. a value taken from ``samples.index``.
    rep : str
        Replicate name. Not used by the function; kept so existing calls
        keep working.

    Returns
    -------
    str
        'wt' or 'edited' when the site matches the corresponding nucleotide,
        'unmatched_edit_nt' when the flanks are found but the site is neither,
        'unmatched_region' when any flank is not found exactly once.
    """
    # Fetch the row once, by label: callers pass values from samples.index,
    # which only coincide with positions for a default RangeIndex.
    sample = samples.loc[index]

    def find_flank(column, max_dist):
        # All approximate occurrences of the flank sequence within the read.
        return fuzzysearch.find_near_matches(sample[column], read, max_l_dist=max_dist)

    left_outside = find_flank("L_outside", 2)
    right_outside = find_flank("R_outside", 2)
    if len(left_outside) != 1 or len(right_outside) != 1:
        return 'unmatched_region'

    left_inside = find_flank("L_inside", 1)
    right_inside = find_flank("R_inside", 1)
    if len(left_inside) != 1 or len(right_inside) != 1:
        return 'unmatched_region'

    # Everything between the end of the left inside flank and the start of
    # the right inside flank is the variable site.
    var_nt = read[left_inside[0].end:right_inside[0].start]
    if var_nt == sample["WT nt"]:
        return 'wt'
    if var_nt == sample["Edited nt"]:
        return 'edited'
    return 'unmatched_edit_nt'

In [6]:
#step through samples
for i in samples.index:
    outcomes_dict[i] = {}
    for rep in ["Rep1","Rep2","Rep3"]:
        # Start every replicate at zero so a missing file still leaves a full tally.
        tally = {'wt':0,'edited':0,'unmatched_region':0,'unmatched_edit_nt':0}
        # Stored under the same key `i` that the summary step reads back
        # (no int() cast, which would fail for non-numeric index labels).
        outcomes_dict[i][rep] = tally
        sample_name = samples.loc[i, rep]
        fastq_reads = "./%s.fastq" % sample_name
        try:
            # Count identical reads straight from the parser: each distinct
            # sequence is classified once, and the full list of reads is
            # never held in memory.
            read_counter = Counter(str(seq_record.seq) for seq_record in SeqIO.parse(fastq_reads, "fastq"))
        except IOError: #this happens when a file is missing
            print("%s missing" % sample_name)
            continue
        for read, n_reads in read_counter.items():
            tally[extract_and_match(read,i,rep)] += n_reads
        print(sample_name)

msSBK_34_01
msSBK_34_25
msCF_02_134
msSBK_34_02
msSBK_34_26
msCF_02_135


In [7]:
##calculate summary data
def _percent(part, whole):
    """Return part/whole as a percentage, or the string "div0" when whole is zero."""
    try:
        return (float(part) / whole)*100
    except ZeroDivisionError:
        return "div0"


def _replicate_percentages(counts):
    """Summarise one replicate's outcome tallies.

    Returns a tuple (edited %, unmatched-region %, unmatched-site %).
    The edited percentage is relative to reads tallied as 'edited' or 'wt';
    the two unmatched percentages are relative to all tallied reads.
    """
    called = counts['edited'] + counts['wt']
    total = called + counts['unmatched_region'] + counts['unmatched_edit_nt']
    return (_percent(counts['edited'], called),
            _percent(counts['unmatched_region'], total),
            _percent(counts['unmatched_edit_nt'], total))


# One list per replicate and metric, filled in samples.index order.
Rep1_Ed_Per, Rep2_Ed_Per, Rep3_Ed_Per = [], [], []
Rep1_No_Region_Per, Rep2_No_Region_Per, Rep3_No_Region_Per = [], [], []
Rep1_No_Site_Per, Rep2_No_Site_Per, Rep3_No_Site_Per = [], [], []

_lists_by_rep = {
    'Rep1': (Rep1_Ed_Per, Rep1_No_Region_Per, Rep1_No_Site_Per),
    'Rep2': (Rep2_Ed_Per, Rep2_No_Region_Per, Rep2_No_Site_Per),
    'Rep3': (Rep3_Ed_Per, Rep3_No_Region_Per, Rep3_No_Site_Per),
}

for i in samples.index:
    for rep, (ed_list, region_list, site_list) in _lists_by_rep.items():
        ed_pct, region_pct, site_pct = _replicate_percentages(outcomes_dict[i][rep])
        ed_list.append(ed_pct)
        region_list.append(region_pct)
        site_list.append(site_pct)

In [8]:
# Attach the per-replicate percentages to a copy of the sample table.
# The leading underscores are part of the output column names.
summary_columns = {
    "Rep1_Edit_Percent": Rep1_Ed_Per,
    "Rep2_Edit_Percent": Rep2_Ed_Per,
    "Rep3_Edit_Percent": Rep3_Ed_Per,
    "_R1_No_Region_Match_Percent": Rep1_No_Region_Per,
    "_R2_No_Region_Match_Percent": Rep2_No_Region_Per,
    "_R3_No_Region_Match_Percent": Rep3_No_Region_Per,
    "__R1_No_Site_Match_Percent": Rep1_No_Site_Per,
    "__R2_No_Site_Match_Percent": Rep2_No_Site_Per,
    "__R3_No_Site_Match_Percent": Rep3_No_Site_Per,
}
samples_updated = samples.assign(**summary_columns)

In [9]:
samples_updated

Unnamed: 0,Phage,Gene,Plasmid,Direction,edit name,Genome Position,WT nt,Edited nt,L_inside,R_inside,...,Rep3,Rep1_Edit_Percent,Rep2_Edit_Percent,Rep3_Edit_Percent,_R1_No_Region_Match_Percent,_R2_No_Region_Match_Percent,_R3_No_Region_Match_Percent,__R1_No_Site_Match_Percent,__R2_No_Site_Match_Percent,__R3_No_Site_Match_Percent
0,Lambda,L,pSBK.147,F,L_TAA_TGA,14126,A,G,CTTTCGCAGT,AATCCCATGA,...,msCF_02_134,27.06272,26.115981,20.278254,1.406785,2.480698,3.034029,0.097863,0.067915,0.157278
1,Lambda,L,pSBK.148,R,L_TAA_TGA,14126,A,G,CTTTCGCAGT,AATCCCATGA,...,msCF_02_135,21.116465,25.431555,15.916106,0.899292,13.225325,1.633095,0.058438,0.047112,0.172252


In [10]:
##output
# Write the annotated sample table to an Excel workbook in the working directory.
output_workbook = "editing_analysis_demo_output.xlsx"
samples_updated.to_excel(output_workbook)