In [1]:
# Input tables written by preprocess_redi.py, one CSV per sample.
files = [
    'NEG.filtered_edits.csv',
    'cLIDAR.filtered_edits.csv',
    'eLIDAR.filtered_edits.csv',
    'gLIDAR.filtered_edits.csv',
    'AD2.filtered_edits.csv',
]
# Alternative sample set (swap in by uncommenting):
#files = ['NG4AVP.filtered_edits.csv','noAVP.filtered_edits.csv','AVP.filtered_edits.csv']

# Number of samples; per-site lists built later have one slot per file.
num_files = len(files)

# Directory prefix prepended to every entry of `files` when it is read.
base_dir = './data/'

In [2]:
import ast
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.stats.multitest as mt

# edit_map[region][position] -> {'Reference': base,
#                                'Frequency': one entry per sample (float, 0, or None),
#                                'FisherM':   one [other_reads, edited_reads] pair per sample}
edit_map = {}
sample = 0
conds=1 #hardcoded # could be int(num_files/3) if using triplicates,etc
min_reads = 20  # a site needs at least this many reads in a sample to be considered

print("# of files:", num_files)

for file in files:
    file = base_dir+file
    print(file)
    df = pd.read_csv(file, sep=",")

    # Walk the four required columns in lockstep; this avoids building a
    # Series per row as DataFrame.iterrows() does.
    rows = zip(df['Position'], df['Region'], df['Reference'], df['BaseCount[A,C,G,T]'])
    for pos, reg, ref, base_count in rows:
        # The column holds the counts as text (it was previously passed to eval()),
        # ordered A, C, G, T per the column name. literal_eval parses literals
        # only, so file contents can never execute code.
        bases = ast.literal_eval(base_count)
        sum_freq = sum(bases)

        region_sites = edit_map.setdefault(reg, {})
        if pos not in region_sites:
            # NOTE(review): a site that never appears in some sample's file keeps
            # the defaults 0 / [0, 0] for that sample rather than None, so code
            # that filters on None will not exclude it -- confirm this is intended.
            region_sites[pos] = {
                'Reference': ref,
                'Frequency': [0]*num_files,
                # Independent inner lists so samples never share one object.
                'FisherM': [[0, 0] for _ in range(num_files)],
            }
        site = region_sites[pos]

        # Mark the sample as unusable (None) when coverage is below min_reads or
        # when no edited base was observed (G for reference A, C for reference T).
        if sum_freq < min_reads or (ref == 'A' and bases[2] < 1) or (ref == 'T' and bases[1] < 1):
            site['Frequency'][sample] = None
            continue

        if ref == 'A':
            # index 0 = A (reference), index 2 = G (edited)
            site['Frequency'][sample] = bases[2]/(bases[0]+bases[2])
            site['FisherM'][sample] = [sum_freq-bases[2], bases[2]]
        elif ref == 'T':
            # index 3 = T (reference), index 1 = C (edited)
            site['Frequency'][sample] = bases[1]/(bases[3]+bases[1])
            site['FisherM'][sample] = [sum_freq-bases[1], bases[1]]
        # Any other reference base leaves this sample's entries untouched.

    sample+=1
    del df  # release the table before loading the next one

print('First step passed\n')

# of files: 5
./data/NEG.filtered_edits.csv
./data/cLIDAR.filtered_edits.csv
./data/eLIDAR.filtered_edits.csv
./data/gLIDAR.filtered_edits.csv
./data/AD2.filtered_edits.csv
First step passed



In [3]:
# means[c] collects, for every retained site, the editing frequency of sample conds*c.
# NOTE(review): with conds > 1 the index conds*c runs past the end of the
# per-site frequency list for the larger c values -- only conds == 1 is safe here.
means = [[] for _ in range(num_files)]
pvaluesx=[]
pvaluesy=[]
pvals=[]
pvals_all=[]   # one p-value per retained site, same order as the entries of means[c]
gene=[]
tables =[]
comparison = 0  # sample index of the first condition in the comparison
comp_2 = 4      # sample index of the second condition in the comparison
high_edit=0
total_pos=0     # number of sites retained for testing
e=0.00000001    # small offset added to counts / denominators

for region, positions in edit_map.items():
    for position, sitedata in positions.items():
        frequencies = sitedata['Frequency']

        if region=='XZ054' and position==3740:
            # Debug trace for one specific site. A frequency may be None, and
            # `None > 0` raises TypeError, so test for None first.
            print(region, position, frequencies[comparison],frequencies[comp_2],
                  frequencies[0] is not None and frequencies[0]>0)

        # Based on Katrekar et al (https://doi.org/10.1038/s41592-019-0323-0) protocol.
        # Only look at sites where no sample was marked unusable (None).
        if None in frequencies:
            continue

        # 2x2 table: rows = the two compared samples, columns = [other reads, edited reads].
        # Adding e makes the counts floats; a count of exactly 5 therefore passes
        # the `> 5` check below.
        table = np.array([sitedata['FisherM'][comparison], sitedata['FisherM'][comp_2]])+e

        if min(table[0])>5 and min(table[1])>5:
            fisher = stats.chi2_contingency(table)
        else:
            fisher = stats.fisher_exact(table)

        # Both tests return the p-value as their second element, whether SciPy
        # hands back a plain tuple or a result object, so index instead of
        # using `.pvalue`. No exception handler is needed any more.
        pvalue = fisher[1]

        # Record frequencies and p-value together so means[c][i] and
        # pvals_all[i] always describe the same site.
        total_pos+=1
        for c in range(len(means)):
            means[c].append(frequencies[conds*c])
        pvals_all.append(pvalue)

# Benjamini-Hochberg FDR correction across all tested sites.
rejected, corrected = mt.multipletests(pvals_all, alpha=0.01, method='fdr_bh')[:2]

print(total_pos)
print(len(pvals_all))

3297977
3297977


In [4]:
# Count the sites flagged as rejected by the multiple-testing correction:
# masking the array with itself keeps only the True entries.
rejected = np.array(rejected)
significant_flags = rejected[rejected]
print(len(significant_flags))

4643


In [5]:
# Collect the frequency pairs of sites that are both significant (rejected[i])
# and differ by more than a factor of 1.1 in either direction between the
# two compared samples. e keeps the denominators non-zero.
pvaluesx = []
pvaluesy = []
for i, (freq_first, freq_second) in enumerate(zip(means[comparison], means[comp_2])):
    increased = freq_second / (freq_first + e) > 1.1
    changed = increased or freq_first / (freq_second + e) > 1.1
    if changed and rejected[i]:
        pvaluesx.append(freq_first)
        pvaluesy.append(freq_second)

print('Statistically significant changes:', len(pvaluesx))

Statistically significant changes: 4641


In [None]:
from matplotlib.colors import ListedColormap
import matplotlib as mpl

# Build a reversed gist_heat colormap whose alpha rises linearly from 0.1 to 0.9
red = plt.cm.gist_heat  # original colormap
fading_red = red(np.arange(red.N)) # extract the colors as RGBA rows
fading_red = fading_red[::-1]
fading_red[:, -1] = np.linspace(0.1, 0.9, red.N) # last column is alpha
fading_red = ListedColormap(fading_red) # convert back to a colormap

# Both layers use the same 75x75 grid over [0, 1] x [0, 1] so that the
# overlay's bins line up exactly with the background's bins.
hist_bins = (75, 75)
hist_range = [[0, 1], [0, 1]]

fig, ax = plt.subplots()

# Background: all retained sites, grey scale, log-scaled counts.
*_, image = ax.hist2d(means[comparison], means[comp_2], bins=hist_bins, range=hist_range,
                      norm=mpl.colors.LogNorm(vmax=1000000), cmap=mpl.cm.binary)
# Overlay: only the significant sites. hist2d is skipped when there are none.
if len(pvaluesx)>0:
    *_, image = ax.hist2d(pvaluesx, pvaluesy, bins=hist_bins, range=hist_range,
                          cmap=fading_red, norm=mpl.colors.LogNorm(vmin=1, vmax=100))
ax.set_xlim(0.0, 1)
ax.set_ylim(0.0, 1)
ax.set_xlabel('Editing frequency: ' + files[comparison])
ax.set_ylabel('Editing frequency: ' + files[comp_2])

# Highlight the XZ054:3740 site when it exists and has a usable frequency in
# both samples (a frequency of None cannot be plotted).
sensor_site = edit_map.get('XZ054', {}).get(3740)
if sensor_site is not None:
    x_sensor = sensor_site['Frequency'][comparison]
    y_sensor = sensor_site['Frequency'][comp_2]
    if x_sensor is not None and y_sensor is not None:
        # marker=4 is matplotlib's left-pointing caret
        ax.plot(x_sensor, y_sensor, marker=4, color='#07f0ff', markersize=6)

# The colorbar describes the layer drawn last (the overlay when present).
fig.colorbar(image, ax=ax, label='Sites per bin')

# NOTE(review): these names match the alternative three-file sample set; with
# comparison=0 and comp_2=4 the index comp_2+comparison-1 is out of range, so
# adjust before re-enabling the savefig line below.
sample_names=['NG-noAVP','NG-AVP', 'noAVP-AVP']

#plt.savefig(base_dir+'1114_'+sample_names[comp_2+comparison-1]+'.svg')
plt.show()