In [6]:
import csv
import shlex
import pandas as pd
import numpy as np
import statsmodels.api as sm
import plotly.express as px
import plotly.graph_objects as go
import fnmatch
import plotly.io as pio
import scipy
from scipy.signal import find_peaks

In [2]:
# Load the tab-separated variant table; the first row supplies the column names.
EMS_TABLE_PATH = '/run/media/eric/analysis/genomics/PyAtBSA/output/A1.ems.table'
df = pd.read_csv(EMS_TABLE_PATH, sep="\t", header=0)

In [3]:
# Reference-allele fraction, ref / (ref + alt), computed separately from the
# wt_* and the mu_* count columns; 'ratio' is the wt fraction minus the mu one.
wt_ref_fraction = df['wt_ref'] / (df['wt_ref'] + df['wt_alt'])
mu_ref_fraction = df['mu_ref'] / (df['mu_ref'] + df['mu_alt'])
df['ratio'] = wt_ref_fraction - mu_ref_fraction

# Mean of 'ratio' over every row of the table.
ratio_mean = df['ratio'].mean()

In [4]:
# Smooth 'ratio' along each chromosome with LOWESS ('yhat'), detect local maxima
# of the smoothed curve that rise above the overall mean ratio, and flag
# ('peak' == 1) every row whose smoothed value lies between the lowest and the
# highest detected peak height of its chromosome.
LOWESS_FRAC = 0.29  # fraction of a chromosome's points used for each local fit

chr_facets = df["chr"].unique()
df_list = []
lowess = sm.nonparametric.lowess

for chrom in chr_facets:
    df_chr = df[df['chr'] == chrom].copy()

    positions = df_chr['pos'].to_numpy()
    ratios = df_chr['ratio'].to_numpy()

    # return_sorted=False gives one fitted value per input row, in row order,
    # so 'yhat' stays aligned with its row even when the rows of a chromosome
    # are not ordered by position (the default output is sorted by position).
    df_chr['yhat'] = lowess(ratios, positions, frac=LOWESS_FRAC, return_sorted=False)
    yhat = df_chr['yhat'].to_numpy()

    # find_peaks compares neighbouring samples, so hand it the smoothed curve
    # in position order; only the peak heights are used afterwards.
    by_position = np.argsort(positions, kind='stable')
    _, peak_props = find_peaks(yhat[by_position], height=ratio_mean)
    peak_heights = peak_props['peak_heights']

    if peak_heights.size:
        # Descriptive names instead of rebinding the builtins max/min.
        lowest_peak = peak_heights.min()
        highest_peak = peak_heights.max()
        # np.isclose keeps rows sitting exactly on a bound despite float noise.
        at_or_above = np.isclose(lowest_peak, yhat) | (lowest_peak < yhat)
        at_or_below = np.isclose(highest_peak, yhat) | (highest_peak > yhat)
        df_chr['peak'] = (at_or_above & at_or_below).astype('int64')
    else:
        # No peak on this chromosome: nothing is flagged.
        df_chr['peak'] = 0

    df_list.append(df_chr)

df = pd.concat(df_list)

# Rows that fall inside a peak region, kept separately for plotting/export.
df_peaks = df.loc[df['peak'] == 1]

df.to_csv('file_name.tsv', sep='\t')

<class 'pandas.core.frame.DataFrame'>
Index: 16236 entries, 0 to 16235
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   chr           16236 non-null  int64  
 1   pos           16236 non-null  int64  
 2   ref           16236 non-null  object 
 3   alt           16236 non-null  object 
 4   gene          16236 non-null  object 
 5   snpEffect     16236 non-null  object 
 6   snpVariant    1711 non-null   object 
 7   snpImpact     16236 non-null  object 
 8   mu:wt_GTpred  16236 non-null  object 
 9   wt_alt        16236 non-null  int64  
 10  wt_ref        16236 non-null  int64  
 11  mu_alt        16236 non-null  int64  
 12  mu_ref        16236 non-null  int64  
 13  ratio         16236 non-null  float64
 14  yhat          16236 non-null  float64
 15  peak          16236 non-null  int64  
dtypes: float64(2), int64(7), object(7)
memory usage: 2.1+ MB


Unnamed: 0,chr,pos,ref,alt,gene,snpEffect,snpVariant,snpImpact,mu:wt_GTpred,wt_alt,wt_ref,mu_alt,mu_ref,ratio,yhat,peak
0,1,18477,G,A,NGA3:DCL1:NGA3:DCL1:AT1G03993:NGA3-DCL1,upstream_gene_variant:upstream_gene_variant:up...,,MODIFIER:MODIFIER:MODIFIER:MODIFIER:MODIFIER:M...,0/1:0/0,34,6,53,4,0.079825,0.108712,0
1,1,62750,A,G,KCS1:CIPK9:CIPK9:CIPK9:AT1G01130,upstream_gene_variant:downstream_gene_variant:...,,MODIFIER:MODIFIER:MODIFIER:MODIFIER:MODIFIER,0/1:0/0,23,4,36,2,0.095517,0.108756,0
2,1,62764,G,A,KCS1:CIPK9:CIPK9:CIPK9:AT1G01130,upstream_gene_variant:downstream_gene_variant:...,,MODIFIER:MODIFIER:MODIFIER:MODIFIER:MODIFIER,0/1:0/0,23,4,34,2,0.092593,0.108756,0
3,1,91955,C,T,FKGP:FKGP:FKGP:FKGP:FKGP:FKGP:FKGP:FKGP:RABA3:...,missense_variant:missense_variant:5_prime_UTR_...,p.Thr69Ile:p.Thr69Ile:NaN,MODERATE:MODERATE:MODIFIER:MODIFIER:MODIFIER:M...,0/1:0/0,28,3,43,3,0.031557,0.108786,0
4,1,91962,C,T,FKGP:FKGP:FKGP:FKGP:FKGP:FKGP:FKGP:FKGP:FKGP:F...,synonymous_variant:synonymous_variant:5_prime_...,p.Ala71Ala:p.Ala71Ala:NaN,LOW:LOW:LOW:LOW:LOW:LOW:LOW:LOW:MODIFIER:MODIF...,0/1:0/0,27,3,44,1,0.077778,0.108786,0


In [7]:
# Linkage map: per-site 'ratio' against 'pos', one facet per chromosome, with a
# LOWESS trendline. For every chromosome that has flagged peak rows, the span of
# those rows is shaded and the position of the highest smoothed value is marked.
chr_facets_p = df_peaks["chr"].unique()

# Pin the facet order explicitly so a chromosome label can be translated into
# its 1-based facet column, instead of using the label itself as the column.
facet_order = df["chr"].unique().tolist()
facet_col_of = {chrom: n for n, chrom in enumerate(facet_order, start=1)}

fig = px.scatter(df, x="pos", y="ratio",
    facet_col="chr",
    category_orders={"chr": facet_order},
    opacity=0.8,
    color_discrete_sequence=['goldenrod'],
    trendline="lowess",
    trendline_options=dict(frac=0.29),
    trendline_color_override="blue")

# Dummy trace: only there to give the blue trendlines a legend entry.
fig.add_trace(go.Scatter(x=[1],
    y=[1],
    mode='lines',
    name='Lowess Fitted Ratio',
    line=dict(color="blue")))

fig.update_layout(
    plot_bgcolor='white',
    title=dict(text="Linkage map", font=dict(color='black')),
)

# Each facet keeps its own position range; per-axis titles are blanked because
# shared 'Position' / 'Ratio' labels are added as annotations below.
fig.update_xaxes(
    matches=None,
    showticklabels=True,
    title_text='',
    showgrid=True,
    gridwidth=0.5,
    gridcolor='lightgrey',
    showline=True,
    linewidth=1,
    linecolor='black',
    rangemode="tozero",
)

fig.update_yaxes(
    title_text='',
    showgrid=True,
    gridwidth=0.5,
    gridcolor='lightgrey',
    showline=True,
    linewidth=1,
    linecolor='black',
    range=[0, 0.8],
)

# Shared x-axis label, centred under the facets.
fig.add_annotation(
    showarrow=False,
    xanchor='center',
    xref='paper',
    x=0.5,
    yref='paper',
    y=-0.12,
    text='Position'
)

# Shared y-axis label, rotated and placed left of the first facet.
fig.add_annotation(
    showarrow=False,
    xanchor='center',
    xref='paper',
    x=-0.065,
    yanchor='middle',
    yref='paper',
    y=0.6,
    textangle=-90,
    text='Ratio'
)

fig.update_traces(marker=dict(size=1))

fig['data'][0]['showlegend'] = True
fig['data'][0]['name'] = 'Polymorphism ratio'

# Positions are chromosome-local coordinates, so the highlighted span and the
# marker line are computed per chromosome rather than pooled over all of them.
# Iterating over chromosomes that have peak rows also makes an empty df_peaks
# a no-op instead of an IndexError.
for chrom in chr_facets_p.tolist():
    peaks_chr = df_peaks[df_peaks['chr'] == chrom]
    peak_positions = peaks_chr['pos'].to_numpy()
    # First row holding the largest smoothed value on this chromosome.
    top_yhat_pos = peak_positions[peaks_chr['yhat'].to_numpy().argmax()]
    facet_col = facet_col_of[chrom]

    fig.add_vrect(x0=peak_positions.min(), x1=peak_positions.max(), col=facet_col,
        fillcolor="red", opacity=0.2, line_width=0)
    fig.add_vline(x=top_yhat_pos,
                  line_dash="dot",
                  col=facet_col,
                  line_width=1)

# Dummy trace: legend entry for the dotted peak marker lines.
fig.add_trace(go.Scatter(x=[0,0],
    y=[0,0],
    mode='lines',
    line=dict(color='black', width=1, dash='dot'),
    name='Identified Peak',))

fig.show()
df_peaks.to_csv('peaks_file_name.tsv', sep='\t')


