In [1]:
##Import
from Bio import SeqIO
import pandas as pd
from collections import Counter
import fuzzysearch
import matplotlib as plt
# NOTE(review): `plt` is bound to the top-level `matplotlib` package by the import above,
# not to `matplotlib.pyplot`; `rcParams` is reachable either way.
# 42 presumably selects TrueType (Type 42) font embedding for PDF output - confirm against
# matplotlib's rcParams documentation.
plt.rcParams['pdf.fonttype'] = 42

In [2]:
##Load the key table: one row per condition, with its match sequences and replicate file stems
KEY_COLUMNS = [
    "Phage", "Gene", "Plasmid", "Direction", "edit name",
    "Genome Position", "WT nt", "Edited nt",
    "L_inside", "R_inside", "L_outside", "R_outside",
    "Rep1", "Rep2", "Rep3",
]
# Keep only the columns the analysis reads, in this fixed order.
samples = pd.read_excel("Phage_genome_screen_key.xls")[KEY_COLUMNS]

In [3]:
# Display the loaded key table for a visual check of the selected columns.
samples

Unnamed: 0,Phage,Gene,Plasmid,Direction,edit name,Genome Position,WT nt,Edited nt,L_inside,R_inside,L_outside,R_outside,Rep1,Rep2,Rep3
0,T2,rnh,pSBK.149,F,rnh_TGA_TAA,143279,C,T,TTATTCACCT,AAAATTCATT,GCCAAGTCTTCGCCATGGAT,ACAAAATATGAATAAATTTT,msCF_01_01,msCF_01_13,msCF_01_25
1,T2,rnh,pSBK.150,R,rnh_TGA_TAA,143279,C,T,TTATTCACCT,AAAATTCATT,GCCAAGTCTTCGCCATGGAT,ACAAAATATGAATAAATTTT,msCF_01_02,msCF_01_14,msCF_01_26
2,T2,rnh,pSBK.161,R_dRT,rnh_TGA_TAA,143279,C,T,TTATTCACCT,AAAATTCATT,GCCAAGTCTTCGCCATGGAT,ACAAAATATGAATAAATTTT,msCF_01_03,msCF_01_15,msCF_01_27
3,T2,modA,pSBK.175,F,modA_TAA_TGA,10542,T,C,GTCCTTCCAT,ATAGATTAAA,TGTAAACAACTTTATGTAAA,GAACACTTTTTGTACTCATA,msCF_01_04,msCF_01_16,msCF_01_28
4,T2,modA,pSBK.176,R,modA_TAA_TGA,10542,T,C,GTCCTTCCAT,ATAGATTAAA,TGTAAACAACTTTATGTAAA,GAACACTTTTTGTACTCATA,msCF_01_05,msCF_01_17,msCF_01_29
5,T2,DNK,pSBK.177,F,DNK_TAA_TGA,71203,T,C,GTTCAGACAT,ATAGTACCTT,GAATGCGAGATTTCAGAGTT,TAGAACCATCGTTTGTAATT,msCF_01_06,msCF_01_18,msCF_01_30
6,T2,DNK,pSBK.178,R,DNK_TAA_TGA,71203,T,C,GTTCAGACAT,ATAGTACCTT,GAATGCGAGATTTCAGAGTT,TAGAACCATCGTTTGTAATT,msCF_01_07,msCF_01_19,msCF_01_31
7,T2,sigma,pSBK.179,F,sigma_TAA_TGA,37530,T,C,CCGGAGAGGT,AATCGTTAGC,TACATTTTAAGTAAATAGGG,CAACACTTTTCTTTTCAGCC,msCF_01_08,msCF_01_20,msCF_01_32
8,T2,sigma,pSBK.180,R,sigma_TAA_TGA,37530,T,C,CCGGAGAGGT,AATCGTTAGC,TACATTTTAAGTAAATAGGG,CAACACTTTTCTTTTCAGCC,msCF_01_09,msCF_01_21,msCF_01_33
9,T2,uvsW,pSBK.181,F,uvsW_TAA_TGA,108831,A,G,GTTAATTTAT,AGGGCTTCGG,CATTGATCGTATTCAGCGCT,CAATTTCTTTATGAAGCTTC,msCF_01_10,msCF_01_22,msCF_01_34


In [4]:
##Globals
# Read-classification tallies, filled in by the sample loop further down:
#   outcomes_dict[sample index][replicate column name] ->
#       {'wt': n, 'edited': n, 'unmatched_region': n, 'unmatched_edit_nt': n}
outcomes_dict = {}

In [5]:
##Defs
def extract_and_match(read,index,rep):
    """Classify a single read against the edit site of one row of `samples`.

    Parameters
    ----------
    read : str
        Read sequence to classify.
    index : int
        Positional row number in the module-level `samples` table
        (looked up with `.iloc`).
    rep : str
        Replicate column name; accepted for the caller's convenience but
        not used by the classification.

    Returns
    -------
    str
        'unmatched_region'  - either outer flank, or either inner flank, was
                              not found exactly once in the read;
        'wt'                - the sequence between the inner flanks equals "WT nt";
        'edited'            - it equals "Edited nt";
        'unmatched_edit_nt' - it is anything else.
    """
    row = samples.iloc[index]

    # Outer flanks: each must occur exactly once, allowing up to 2 edits.
    outer_hits = [
        fuzzysearch.find_near_matches(row[column], read, max_l_dist=2)
        for column in ("L_outside", "R_outside")
    ]
    if any(len(hits) != 1 for hits in outer_hits):
        return 'unmatched_region'

    # Inner flanks: each must occur exactly once, allowing up to 1 edit.
    inner_left, inner_right = [
        fuzzysearch.find_near_matches(row[column], read, max_l_dist=1)
        for column in ("L_inside", "R_inside")
    ]
    if len(inner_left) != 1 or len(inner_right) != 1:
        return 'unmatched_region'

    # Whatever sits between the end of the left inner flank and the start of
    # the right inner flank is the candidate edit-site sequence.
    site = read[inner_left[0].end:inner_right[0].start]
    if site == row["WT nt"]:
        return 'wt'
    if site == row["Edited nt"]:
        return 'edited'
    return 'unmatched_edit_nt'

In [6]:
#step through samples
# For every row of `samples` and each of its three replicate FASTQ files,
# tally how many reads fall into each category returned by extract_and_match.
# Re-running this cell resets the tallies for every sample before recounting.
# NOTE: extract_and_match looks rows up positionally (.iloc), so passing the
# index label `i` assumes `samples` has the default 0..n-1 index.
for i in samples.index:
    outcomes_dict[i] = {}
    for rep in ["Rep1","Rep2","Rep3"]:
        rep_counts = {'wt':0,'edited':0,'unmatched_region':0,'unmatched_edit_nt':0}
        outcomes_dict[i][rep] = rep_counts
        sample_name = samples[rep][i]
        fastq_reads = "./%s_trimmed.fastq" % sample_name
        try:
            # Stream the records straight into the Counter so the full list of
            # read strings is never held in memory; identical reads collapse
            # to a single key and are classified only once below.
            read_counter = Counter(
                str(seq_record.seq) for seq_record in SeqIO.parse(fastq_reads, "fastq")
            )
        except IOError: #this happens when a file is missing
            # Leave the zeroed tallies in place for this replicate and move on.
            print("%s missing" % sample_name)
            continue
        # Classification happens outside the try block so that an unrelated
        # IOError cannot be misreported as a missing file.
        for read, n_reads in read_counter.items():
            rep_counts[extract_and_match(read,i,rep)] += n_reads
        print(sample_name)

msCF_01_01
msCF_01_13
msCF_01_25
msCF_01_02
msCF_01_14
msCF_01_26
msCF_01_03
msCF_01_15
msCF_01_27
msCF_01_04
msCF_01_16
msCF_01_28
msCF_01_05
msCF_01_17
msCF_01_29
msCF_01_06
msCF_01_18
msCF_01_30
msCF_01_07
msCF_01_19
msCF_01_31
msCF_01_08
msCF_01_20
msCF_01_32
msCF_01_09
msCF_01_21
msCF_01_33
msCF_01_10
msCF_01_22
msCF_01_34
msCF_01_11
msCF_01_23
msCF_01_35
msCF_01_46
msCF_01_58
msCF_01_70
msCF_01_47
msCF_01_59
msCF_01_71
msCF_01_37
msCF_01_49
msCF_01_61
msCF_01_42
msCF_01_54
msCF_01_66
msCF_01_38
msCF_01_50
msCF_01_62
msCF_01_43
msCF_01_55
msCF_01_67
msCF_01_39
msCF_01_51
msCF_01_63
msCF_01_44
msCF_01_56
msCF_01_68
msCF_01_40
msCF_01_52
msCF_01_64
msCF_01_45
msCF_01_57
msCF_01_69
msCF_01_41
msCF_01_53
msCF_01_65
msCF_01_73
msCF_01_85
msCF_01_97
msCF_01_74
msCF_01_86
msCF_01_98
msCF_01_76
msCF_01_88
msCF_01_100
msCF_01_77
msCF_01_89
msCF_01_101
msCF_01_78
msCF_01_90
msCF_01_102
msCF_01_79
msCF_01_91
msCF_01_103
msCF_01_80
msCF_01_92
msCF_01_104
msCF_01_81
msCF_01_93
msCF_01_105
msCF

In [8]:
##calculate summary data
def _percent(numerator, denominator):
    """Return numerator / denominator as a percentage.

    Returns the string "div0" instead of raising when the denominator is zero
    (e.g. a replicate whose FASTQ file was missing, so every tally is 0).
    """
    try:
        return (float(numerator) / denominator) * 100
    except ZeroDivisionError:
        return "div0"


def _summarize_replicate(rep):
    """Compute the three per-sample percentage lists for one replicate.

    Parameters
    ----------
    rep : str
        Replicate key used in `outcomes_dict` ('Rep1', 'Rep2' or 'Rep3').

    Returns
    -------
    tuple of three lists, each with one entry per row of `samples`, in
    `samples.index` order:
        edited_per    - edited / (edited + wt) * 100
        no_region_per - unmatched_region / all reads * 100
        no_site_per   - unmatched_edit_nt / all reads * 100
    An entry is "div0" when its denominator is zero.
    """
    edited_per, no_region_per, no_site_per = [], [], []
    for i in samples.index:
        counts = outcomes_dict[i][rep]
        # Reads whose edit site was readable as either WT or edited.
        classified = counts['edited'] + counts['wt']
        # Every read seen for this replicate, whatever its category.
        total = classified + counts['unmatched_region'] + counts['unmatched_edit_nt']
        edited_per.append(_percent(counts['edited'], classified))
        no_region_per.append(_percent(counts['unmatched_region'], total))
        no_site_per.append(_percent(counts['unmatched_edit_nt'], total))
    return edited_per, no_region_per, no_site_per


# The list names below are read by the cell that assembles `samples_updated`.
Rep1_Ed_Per, Rep1_No_Region_Per, Rep1_No_Site_Per = _summarize_replicate('Rep1')
Rep2_Ed_Per, Rep2_No_Region_Per, Rep2_No_Site_Per = _summarize_replicate('Rep2')
Rep3_Ed_Per, Rep3_No_Region_Per, Rep3_No_Site_Per = _summarize_replicate('Rep3')

In [9]:
# Append the nine per-replicate percentage lists to the key table as new
# columns. The leading underscores in some column names are part of the
# output schema and are kept exactly as they are.
summary_columns = {
    "Rep1_Edit_Percent": Rep1_Ed_Per,
    "Rep2_Edit_Percent": Rep2_Ed_Per,
    "Rep3_Edit_Percent": Rep3_Ed_Per,
    "_R1_No_Region_Match_Percent": Rep1_No_Region_Per,
    "_R2_No_Region_Match_Percent": Rep2_No_Region_Per,
    "_R3_No_Region_Match_Percent": Rep3_No_Region_Per,
    "__R1_No_Site_Match_Percent": Rep1_No_Site_Per,
    "__R2_No_Site_Match_Percent": Rep2_No_Site_Per,
    "__R3_No_Site_Match_Percent": Rep3_No_Site_Per,
}
samples_updated = samples.assign(**summary_columns)

In [10]:
# Display the key table together with the appended percentage columns.
samples_updated

Unnamed: 0,Phage,Gene,Plasmid,Direction,edit name,Genome Position,WT nt,Edited nt,L_inside,R_inside,...,Rep3,Rep1_Edit_Percent,Rep2_Edit_Percent,Rep3_Edit_Percent,_R1_No_Region_Match_Percent,_R2_No_Region_Match_Percent,_R3_No_Region_Match_Percent,__R1_No_Site_Match_Percent,__R2_No_Site_Match_Percent,__R3_No_Site_Match_Percent
0,T2,rnh,pSBK.149,F,rnh_TGA_TAA,143279,C,T,TTATTCACCT,AAAATTCATT,...,msCF_01_25,0.0394089,0.0306861,0.0666844,24.990769,22.006411,24.0753,0.051695,0.043056,0.0455396
1,T2,rnh,pSBK.150,R,rnh_TGA_TAA,143279,C,T,TTATTCACCT,AAAATTCATT,...,msCF_01_26,0.0247693,0.0244648,0.0474965,14.623963,18.674819,23.1709,0.026426,0.054677,0.0638065
2,T2,rnh,pSBK.161,R_dRT,rnh_TGA_TAA,143279,C,T,TTATTCACCT,AAAATTCATT,...,msCF_01_27,0.0475737,0.0132433,0.0240073,14.820385,15.606323,18.8131,0.063632,0.0391,0.0311733
3,T2,modA,pSBK.175,F,modA_TAA_TGA,10542,T,C,GTCCTTCCAT,ATAGATTAAA,...,msCF_01_28,0.00415973,0.020087,0.0304337,9.415132,12.168597,16.2705,0.048959,0.035271,0.0424484
4,T2,modA,pSBK.176,R,modA_TAA_TGA,10542,T,C,GTCCTTCCAT,ATAGATTAAA,...,msCF_01_29,0.00700378,0.00745434,0,13.072055,10.129209,31.9749,0.036514,0.060253,0.0169808
5,T2,DNK,pSBK.177,F,DNK_TAA_TGA,71203,T,C,GTTCAGACAT,ATAGTACCTT,...,msCF_01_30,0.0333667,0.00521241,0.0776699,2.503454,2.488826,2.23848,0.056897,0.06603,0.0650407
6,T2,DNK,pSBK.178,R,DNK_TAA_TGA,71203,T,C,GTTCAGACAT,ATAGTACCTT,...,msCF_01_31,0.0355215,0.0116788,0.0726744,2.458108,2.280632,2.65588,0.076167,0.079822,0.0353593
7,T2,sigma,pSBK.179,F,sigma_TAA_TGA,37530,T,C,CCGGAGAGGT,AATCGTTAGC,...,msCF_01_32,0.0520833,0.0896308,0.0893017,2.120763,1.888864,2.72443,0.083349,0.093345,0.115687
8,T2,sigma,pSBK.180,R,sigma_TAA_TGA,37530,T,C,CCGGAGAGGT,AATCGTTAGC,...,msCF_01_33,0.0845473,0.0521614,0.030861,2.52202,2.662609,2.53894,0.044357,0.107772,0.100155
9,T2,uvsW,pSBK.181,F,uvsW_TAA_TGA,108831,A,G,GTTAATTTAT,AGGGCTTCGG,...,msCF_01_34,0.0347602,0.154087,0.0697483,2.194068,1.849764,1.92684,0.053395,0.05758,0.0466171


In [11]:
##graph it

In [12]:
##output
# Write the key table plus the computed percentage columns to an Excel
# workbook; the path is relative, so it lands in the current working directory.
samples_updated.to_excel("editing_analysis_output.xlsx")