In [134]:
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde
import scipy.stats as stats
from Bio.Seq import Seq
from Bio.SeqUtils import MeltingTemp as mt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from Bio.SeqUtils import GC
import numpy as np
from collections import Counter, OrderedDict
from os import listdir
from itertools import product

In [135]:
# ---- Shared figure styling -------------------------------------------------
# Font sizes and axis line width reused by the plotting cells.
lsize = 14
titlesize = 16
axesthickness = 1.5

# Keyword bundles for box plots (whiskers at the 5th/95th percentile) and bar plots.
flierprops = {'marker': 'o', 'markersize': 1.5}
boxplot_kwargs = dict(
    widths=0.65,
    vert=True,
    patch_artist=True,
    boxprops={'facecolor': '#FEFBF3'},
    whis=[5, 95],
)
bar_kwargs = dict(color='#C5DEFF', edgecolor='#000000', capsize=4, linewidth=1.5)

# Global matplotlib settings: axis thickness, font type 42 for PDF/PS export
# (presumably so text stays editable in vector editors - confirm), and Arial.
plt.rcParams['axes.linewidth'] = axesthickness
for fonttype_key in ('pdf.fonttype', 'ps.fonttype'):
    mpl.rcParams[fonttype_key] = 42
mpl.rcParams['font.family'] = 'Arial'

In [160]:
# Editor class to analyse. Exactly one assignment below should be active: to process
# a different editor class, uncomment its line, comment out the other one and re-run
# the whole notebook (the input paths and the file-name parsing depend on this value).
# editor_class = 'ISAam1'
editor_class = 'ISYmu1'


In [161]:
# Folder with the per-sample count tables (CSV files) of the selected editor class,
# relative to the notebook's working directory. Must end with a slash because file
# names are appended to it by plain string concatenation.
path='./fastq_twist4/'+editor_class+'/AnalysisFiles/'

def list_files1(directory):
    """Return the names of the CSV files located directly inside ``directory``.

    Only entries whose name *ends* with ``.csv`` are returned, so backup or
    temporary files such as ``counts.csv.bak`` are ignored. The names (not full
    paths) are returned in sorted order so the result does not depend on the
    arbitrary order in which the operating system lists the directory.

    Parameters
    ----------
    directory : str
        Path of the folder to scan (not searched recursively).

    Returns
    -------
    list of str
        Sorted file names ending in ``.csv``.
    """
    return sorted(f for f in listdir(directory) if f.endswith('.csv'))

# Sample sheet of the selected editor class, keyed by each sample's short name.
sampleinfodf = pd.read_excel('./sample_files/Sample_Info_'+editor_class+'.xlsx').set_index('short_name')

# One row per count file found in `path`; the metadata columns are filled in below.
filelist = list_files1(path)
sampledf = pd.DataFrame(columns=['filename','Library','Cell_line','Editor','Replicate'])
sampledf['filename'] = filelist

# Recover the sample short name from the file name. The two editor classes use
# different file-naming schemes, hence the two parsing rules.
if editor_class == 'ISYmu1':
    shortnames = [fname.split('_')[1] for fname in filelist]
else:
    shortnames = ['_'.join(fname.split('-')[-1].split('_')[:-1]) for fname in filelist]
sampledf['shortname'] = shortnames
sampledf = sampledf.set_index('shortname')

# Attach the sample-sheet metadata; the assignments align on the short-name index.
sampledf['Library'] = 'twist4'
sampledf['Cell_line'] = sampleinfodf['cell_line']
sampledf['Editor'] = sampleinfodf['editor']
# Samples whose editor is recorded as 'none' are relabelled "CTRL".
sampledf['Editor'] = sampledf['Editor'].mask(sampledf['Editor'] == 'none', 'CTRL')
sampledf['Replicate'] = sampleinfodf['replicate'].astype(str)
sampledf['Control'] = sampleinfodf['control_yes_no']
# Replicate label "<Editor>_<Replicate>"; later cells use it as a column-name prefix.
sampledf['repname'] = [
    editor_name+"_"+replicate_nr
    for editor_name, replicate_nr in zip(sampledf['Editor'], sampledf['Replicate'])
]

sampledf = sampledf.sort_values(by='repname')

# only keep rep 1-3 (too many additional control reps, not needed for analysis)
sampledf = sampledf.loc[sampledf['Replicate'].isin(['1','2','3'])]

In [163]:
# Reference table of the library for the selected editor class. The control table and
# every per-editor analysis table below start out as a copy of it, and the per-sample
# count columns are attached to those copies by row index.
templatedf = pd.read_csv('./python_analysis_scripts/20240405_TnpB_'+editor_class+'_100member_REFERENCE.csv')

In [166]:
# Build the control table: a copy of the reference table with, for every CTRL
# replicate, its read counts and its indel / unedited percentages appended.
# Targets that are poorly covered or show high background indels in ANY control
# replicate are then removed.
CTRL_MIN_READS = 100       # keep targets with more than this many reads per control replicate
CTRL_MAX_INDEL_PCT = 15    # keep targets with less than this % indels per control replicate

ctrllist = sampledf[sampledf['Editor'] == 'CTRL']
ctrldf = templatedf.copy()
for _, ctrl_sample in ctrllist.iterrows():
    controlrepdf = pd.read_csv(path+ctrl_sample.filename)
    # Column-wise arithmetic instead of a row-wise apply: a target with zero reads
    # yields NaN/inf here (and is dropped by the read filter below) rather than
    # aborting the cell with a ZeroDivisionError.
    controlrepdf['percentageindel'] = controlrepdf['indelcount'] / controlrepdf['totalreads'] * 100
    controlrepdf['percentageunedited'] = controlrepdf['uneditedcount'] / controlrepdf['totalreads'] * 100
    # NOTE(review): columns are attached by row index, i.e. this assumes every counts
    # file lists the targets in the same order as the reference table - confirm.
    for metric in ('totalreads', 'percentageunedited', 'percentageindel', 'uneditedcount', 'indelcount'):
        ctrldf[ctrl_sample.repname+'_'+metric] = controlrepdf[metric]

# A target is kept only if it passes both thresholds in every control replicate.
ctrl_keep = pd.Series(True, index=ctrldf.index)
for ctrl_repname in ctrllist['repname']:
    ctrl_keep &= ctrldf[ctrl_repname+'_totalreads'] > CTRL_MIN_READS
    ctrl_keep &= ctrldf[ctrl_repname+'_percentageindel'] < CTRL_MAX_INDEL_PCT
ctrldf = ctrldf[ctrl_keep]

In [126]:
# Build one analysis table per (non-control) editor: a copy of the reference table
# with the counts and percentages of each replicate plus the pooled control
# percentages, restricted to targets with enough reads in every replicate.
editorlist = [x for x in sampledf['Editor'].unique() if 'CTRL' not in x]

lib = "twist4"   # prefix of the pooled-control columns; also read by a later cell
cutoff = 100     # a target needs more than this many reads in every replicate

# The pooled control values do not depend on the editor, so they are computed once.
# ctrldf has already been filtered, so targets removed there receive NaN when these
# Series are aligned onto an editor table below.
ctrluneditedlist = [ctrl_repname+'_percentageunedited' for ctrl_repname in ctrllist['repname']]
ctrlindellist = [ctrl_repname+'_percentageindel' for ctrl_repname in ctrllist['repname']]
ctrl_unedited_mean = ctrldf[ctrluneditedlist].mean(axis=1)
ctrl_indel_mean = ctrldf[ctrlindellist].mean(axis=1)
# Not used in the cells shown here; kept in case a later cell reads it.
controldf = sampledf[(sampledf['Editor'] == 'CTRL')]

experimentdict = {}
for editor in editorlist:
    editor_samples = sampledf[sampledf['Editor'] == editor]
    experimentdf = templatedf.copy()
    for index, row in editor_samples.iterrows():
        dataframe = pd.read_csv(path+row.filename)
        dataframename = row.Editor+"_"+row.Replicate
        # Column-wise arithmetic instead of a row-wise apply: zero-read targets give
        # NaN/inf (removed by the read cutoff below) instead of raising.
        dataframe['percentageindel'] = dataframe['indelcount'] / dataframe['totalreads'] * 100
        dataframe['percentageunedited'] = dataframe['uneditedcount'] / dataframe['totalreads'] * 100
        # NOTE(review): attached by row index - assumes the counts file lists the
        # targets in the same order as the reference table; confirm.
        for metric in ('totalreads', 'percentageunedited', 'percentageindel', 'uneditedcount', 'indelcount'):
            experimentdf[dataframename+'_'+metric] = dataframe[metric]
    replicates = list(editor_samples.repname)

    experimentdf[lib+'_ctr'+'_percentageunedited'] = ctrl_unedited_mean
    experimentdf[lib+'_ctr'+'_percentageindel'] = ctrl_indel_mean

    # Number of targets before the read cutoff (not used in the cells shown here).
    initialtemplen = len(experimentdf)

    for replicate in replicates:
        experimentdf = experimentdf[experimentdf[replicate+'_totalreads']>cutoff]

    experimentdict[editor] = experimentdf

### Calculate editing characteristics for all experimental setups
Per-target editing levels are background-corrected against the pooled control and then clipped to the range 0-100 %.

In [128]:
# Sample table without the control samples (every row whose Editor is not "CTRL").
sampledf_woctr = sampledf.loc[sampledf['Editor'].ne('CTRL')]

In [None]:
# Skeleton of the per-editor summary table: one row per editor (taken from its
# replicate-1 sample), indexed by editor name and holding only the cell line and
# library. The summary statistics are added to it by the following cells.
cols = ['Cell_line', 'Library']
rep1_samples = sampledf_woctr[sampledf_woctr['Replicate'] == "1"].copy()
editordf = rep1_samples.set_index('Editor')[cols]
editordf

In [None]:
# For every editor: background-correct the indel percentage of each replicate against
# the pooled control, average the replicates per target, clip to 0-100, drop targets
# without a value, record summary statistics in editordf and export the table.
for editor in experimentdict:
    print(editor)
    replicatelist = list(sampledf_woctr[sampledf_woctr['Editor'] == editor]['repname'])
    # `lib` is defined by the cell that builds experimentdict.
    ctrname = lib+'_ctr'
    ctrl_indel = experimentdict[editor][ctrname+'_percentageindel']

    for replicate in replicatelist:
        print(replicate)
        repnr = replicate.split('_')[-1]
        corrected_col = replicate+'_percentageindel_corrected_individual'
        # corrected = (observed - control) / ((100 - control) / 100), computed
        # column-wise; targets without a control value (NaN) stay NaN.
        experimentdict[editor][corrected_col] = (
            (experimentdict[editor][replicate+'_percentageindel'] - ctrl_indel)
            / ((100 - ctrl_indel) / 100)
        )
        # Take the mean of the column as a Series so that a plain float is stored.
        # The mean of a one-column DataFrame is a one-element Series, which only fits
        # into a cell of an object column and makes the later replicate arithmetic
        # align on mismatching labels instead of adding numbers.
        # Note: this per-replicate mean uses the unclipped values of all targets.
        editordf.at[editor, repnr+'_meanindel'] = experimentdict[editor][corrected_col].mean()

    # repname is "<Editor>_<Replicate>", so the replicate labels are the column prefixes.
    allreps = list(replicatelist)
    print(allreps)
    experimentdict[editor][editor+'_averageindel'] = experimentdict[editor][[x +'_percentageindel_corrected_individual' for x in allreps]].mean(axis=1)

    # Clip editing rates to be between 0 and 100
    experimentdict[editor][editor+'_averageindel'] = experimentdict[editor][editor+'_averageindel'].clip(0,100)

    # Targets without any corrected value (e.g. removed from the control table) are dropped.
    experimentdict[editor] = experimentdict[editor].dropna(subset=[editor+'_averageindel'])

    editordf.at[editor,'numberofvariants'] = len(experimentdict[editor])
    editordf.at[editor,'meanindel'] = experimentdict[editor][editor+'_averageindel'].mean()
    editordf.at[editor,'standarddeviation'] = experimentdict[editor][editor+'_averageindel'].std()
    experimentdict[editor].to_csv('./fastq_twist4/'+editor_class+'/AnalysisFiles/summary/20240416_'+editor+'_analysis_dataframe.csv')

In [131]:
# Summarise the three per-replicate mean indel rates of every editor: their average
# and their spread. np.std is called with its default ddof=0 (population SD).
replicate_cols = ['1_meanindel', '2_meanindel', '3_meanindel']
for editor_name, summary_row in editordf.iterrows():
    rep_means = [summary_row[col] for col in replicate_cols]
    editordf.at[editor_name,'replicate_meanindel'] = (rep_means[0] + rep_means[1] + rep_means[2]) / 3
    editordf.at[editor_name,'replicate_std'] = np.std(rep_means)

In [None]:
# Order the summary by editor name, record which per-editor analysis file belongs to
# each row, write the summary next to those files and display it.
editordf = editordf.sort_index()
editordf['filename'] = ['20240416_'+str(editor_name)+'_analysis_dataframe.csv' for editor_name in editordf.index]
summary_csv = './fastq_twist4/'+editor_class+'/AnalysisFiles/summary/20240416_'+editor_class+'_100member_nuclease_summary_mean_std.csv'
editordf.to_csv(summary_csv)
editordf