### Take two DESeq2 results files and plot log2 fold change vs log2 fold change

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import os
import math

from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import squareform

import plotly.express as px
from sklearn.decomposition import PCA

In [None]:
# Setup
# Apply seaborn's "whitegrid" style to the figures drawn below
sns.set_style("whitegrid")

# DEG thresholds used further down: |log2FoldChange| >= log2(1.5)
# (i.e. at least a 1.5-fold change) and adjusted p-value <= 0.05
abs_l2fc_threshold = np.log2(1.5)
padj_threshold = 0.05
# Directory the figures are written to, and the formats each figure is saved in
outdir = 'l2fc_vs_l2fc_plot_results'
image_formats = ('png', 'svg', 'eps')

# Table with gene_id / gene_name columns used to label genes by name.
# The cells that use it are skipped when this is set to None.
gene_lookup_file = '../pipeline_results_stefan_rna_seq/expression_data_pipeline_format/gene_id_gene_name_lookup_table.tsv.gz'

# DESeq2 results for the two comparisons plotted against each other
# (file1 -> x-axis / "comparison1", file2 -> y-axis / "comparison2")
deseq_data_file1 = '../pipeline_results_stefan_rna_seq/deseq2/comparison1__COMPETENCE1_Sex__Not_competent_vs_Competent/Comparison_Competent_vs_Not_competent/Comparison_Competent_vs_Not_competent.deseq2_results.tsv'
# Alternative input (COMPETENCE2 comparison), currently disabled
#deseq_data_file = '../pipeline_results_stefan_rna_seq/deseq2/comparison2__COMPETENCE2_Sex__Not_competent_vs_Competent/Comparison_Competent_vs_Not_competent/Comparison_Competent_vs_Not_competent.deseq2_results.tsv'
deseq_data_file2 = '../pipeline_results_stefan_rna_seq/deseq2/comparison3__TREATED_Cell_line__Not_treated_vs_Treated/Comparison_Treated_vs_Not_treated/Comparison_Treated_vs_Not_treated.deseq2_results.tsv'

In [None]:
# Load the first DESeq2 results table (tab-separated) and report its row count
print(f'Reading in DESeq2 data file1: {deseq_data_file1}')
deseq_data1 = pd.read_csv(deseq_data_file1, sep='\t')
print(f'{len(deseq_data1)} genes reported')

# Same for the second DESeq2 results table
print(f'Reading in DESeq2 data file2: {deseq_data_file2}')
deseq_data2 = pd.read_csv(deseq_data_file2, sep='\t')
print(f'{len(deseq_data2)} genes reported')

# The gene lookup table is optional; it is only read when a path was configured
if gene_lookup_file is not None:
    print(f'Reading in gene lookups file: {gene_lookup_file}')
    gene_lookups = pd.read_csv(gene_lookup_file, sep='\t')
    print(f'{len(gene_lookups)} gene lookups reported')


In [None]:
# Preview the first two rows of the first DESeq2 table
deseq_data1.head(2)

In [None]:
# Preview the first two rows of the second DESeq2 table
deseq_data2.head(2)

In [None]:
# Format and merge
# Prefix every column except the first one, so the two result tables remain
# distinguishable once their columns sit side by side in a single frame.
for results, prefix in ((deseq_data1, 'comparison1_'), (deseq_data2, 'comparison2_')):
    key_column, *value_columns = results.columns
    results.columns = [key_column] + [prefix + name for name in value_columns]

# Inner join: keep only the regions reported in both comparisons
deseq_data_combined = deseq_data1.merge(deseq_data2, on='region', how='inner')

# Add -log10(adjusted p-value) for each comparison
padj1 = deseq_data_combined['comparison1_padj']
padj2 = deseq_data_combined['comparison2_padj']
deseq_data_combined['comparison1_minus_log10(padj)'] = -np.log10(padj1)
deseq_data_combined['comparison2_minus_log10(padj)'] = -np.log10(padj2)

# Drop every remaining reference to the per-comparison tables
del results, prefix, key_column, value_columns, padj1, padj2
del deseq_data1, deseq_data2

In [None]:
# Identify DEGs
# A region counts as differentially expressed in a comparison when it passes
# both the absolute log2 fold change threshold and the adjusted p-value threshold.
significant_in_1 = (
    (deseq_data_combined['comparison1_log2FoldChange'].abs() >= abs_l2fc_threshold)
    & (deseq_data_combined['comparison1_padj'] <= padj_threshold)
)
significant_in_2 = (
    (deseq_data_combined['comparison2_log2FoldChange'].abs() >= abs_l2fc_threshold)
    & (deseq_data_combined['comparison2_padj'] <= padj_threshold)
)

# Union of both comparisons, each region listed once
degs = pd.concat([
    deseq_data_combined.loc[significant_in_1, 'region'],
    deseq_data_combined.loc[significant_in_2, 'region'],
]).drop_duplicates()

print(f'{len(degs)} DEGs')

del significant_in_1, significant_in_2

In [None]:
# Keep only the rows whose region was identified as a DEG in at least one comparison
filt = deseq_data_combined['region'].isin(degs)
deseq_data_combined = deseq_data_combined.loc[filt]

In [None]:
# The plot only needs the region identifier and the two log2 fold changes
graph_columns = ['region', 'comparison1_log2FoldChange', 'comparison2_log2FoldChange']
graph_data = deseq_data_combined.loc[:, graph_columns]

In [None]:
# Preview the first two rows of the plotting table
graph_data.head(2)

In [None]:
# Identify consistent changes
# 'UP' / 'DOWN' when both comparisons move in the same direction; everything
# else (opposite signs, a zero or a missing value) is 'NOT_CONSISTENT'.
l2fc_1 = graph_data['comparison1_log2FoldChange']
l2fc_2 = graph_data['comparison2_log2FoldChange']

change = pd.Series('NOT_CONSISTENT', index=graph_data.index)
change[(l2fc_1 > 0) & (l2fc_2 > 0)] = 'UP'
change[(l2fc_1 < 0) & (l2fc_2 < 0)] = 'DOWN'
graph_data['Change'] = change

del l2fc_1, l2fc_2, change

In [None]:
# Identify off-scale points and pull them back onto the plot boundary

log2FoldChange_off_scale = 10
minus_log10_padj_off_scale = 10

l2fc_columns = ['comparison1_log2FoldChange', 'comparison2_log2FoldChange']

# A row is off scale when either log2 fold change lies outside
# [-log2FoldChange_off_scale, +log2FoldChange_off_scale]
graph_data['Off_scale'] = (graph_data[l2fc_columns].abs() > log2FoldChange_off_scale).any(axis=1)

# Clamp the off-scale values to the nearest limit so they are drawn on the edge
graph_data[l2fc_columns] = graph_data[l2fc_columns].clip(
    lower=-log2FoldChange_off_scale,
    upper=log2FoldChange_off_scale,
)

# Renumber the rows 0..n-1; the annotation loop later looks rows up by position
graph_data = graph_data.reset_index(drop=True)

In [None]:
# Make output directory
# exist_ok=True creates the directory (and any missing parents) only when it
# is absent, without a separate existence check that could race with another
# process. It also fails immediately if the path exists but is not a directory.
os.makedirs(outdir, exist_ok=True)

In [None]:
# Scatter plot of the two log2 fold changes, coloured by direction of change.

colors = ["blue", "red", 'grey']
sns.set_palette(sns.color_palette(colors))

# Tie every category to a fixed colour. With a bare colour list seaborn hands
# out colours in the order the categories first appear in the data, so the
# colour of 'UP' / 'DOWN' / 'NOT_CONSISTENT' would change between datasets.
change_order = ['UP', 'DOWN', 'NOT_CONSISTENT']
change_palette = dict(zip(change_order, colors))

# A dict works whether the data holds on-scale points only, off-scale points
# only, or both; a list must match the number of style levels exactly.
markers = {False: 'o', True: '*'}

# Draw on a fresh figure so nothing plotted earlier ends up in the saved image
fig, ax = plt.subplots()
sns.scatterplot(data=graph_data,
                x="comparison1_log2FoldChange",
                y="comparison2_log2FoldChange",
                hue="Change",
                hue_order=change_order,
                palette=change_palette,
                style="Off_scale",
                markers=markers,
                s=7,
                edgecolor=None,
                ax=ax
               )
# Place the legend outside the axes, to the right of the plot
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)

ax.set_xlabel('log2fc(Competent vs not competent)')
ax.set_ylabel('log2fc(Treated vs not treated)')

# Save one file per requested image format
outfile = f'{outdir}/lfc_vs_lfc_plot'
for image_format in image_formats:
    fig.savefig(fname=f'{outfile}.{image_format}', bbox_inches='tight', pad_inches=0.5)

In [None]:
# Add annotations to show genes that do not show a consistent change

In [None]:
# Rank the genes by how strongly the two comparisons disagree: rank 1 is the
# largest absolute difference between the two log2 fold changes.
l2fc_gap = (
    graph_data['comparison1_log2FoldChange']
    .sub(graph_data['comparison2_log2FoldChange'])
    .abs()
)
graph_data['graph_weighting'] = l2fc_gap.rank(ascending=False)

In [None]:
# Use the look-up file, if provided, to give every row a 'gene' label
if gene_lookup_file is not None:
    # Align the lookup's column names with graph_data, then attach the names
    gene_lookups = gene_lookups.rename(mapper={'gene_id' : 'region', 'gene_name' : 'gene'}, axis=1)
    graph_data = pd.merge(graph_data, gene_lookups, on='region', how='left')
    # The left join leaves NaN for regions missing from the lookup; fall back
    # to the region identifier so those points are not labelled "nan"
    graph_data['gene'] = graph_data['gene'].fillna(graph_data['region'])
else:
    # No lookup available: use the region identifier itself as the label.
    # axis=1 is required - without it rename() targets the row index and the
    # 'gene' column would never be created.
    graph_data = graph_data.rename(mapper={'region' : 'gene'}, axis=1)

In [None]:
# Same scatter plot as before, with gene labels on the most discordant genes.

# Only genes ranked within the top `number_annotations` by graph_weighting
# are candidates for a label
number_annotations = 35

colors = ["blue", "red", 'grey']
sns.set_palette(sns.color_palette(colors))

# Tie every category to a fixed colour. With a bare colour list seaborn hands
# out colours in the order the categories first appear in the data, so the
# colour of 'UP' / 'DOWN' / 'NOT_CONSISTENT' would change between datasets.
change_order = ['UP', 'DOWN', 'NOT_CONSISTENT']
change_palette = dict(zip(change_order, colors))

# A dict works whether the data holds on-scale points only, off-scale points
# only, or both; a list must match the number of style levels exactly.
markers = {False: 'o', True: '*'}

# Draw on a fresh figure so the previous plot is not overlaid when this code
# runs outside a notebook
fig, ax = plt.subplots()
sns.scatterplot(data=graph_data,
                x="comparison1_log2FoldChange",
                y="comparison2_log2FoldChange",
                hue="Change",
                hue_order=change_order,
                palette=change_palette,
                style="Off_scale",
                markers=markers,
                s=7,
                edgecolor=None,
                ax=ax
               )

# Label the top-ranked genes whose direction of change is not consistent.
# Selecting the rows with a mask avoids depending on the index being 0..n-1.
to_label = graph_data[
    (graph_data['graph_weighting'] <= number_annotations)
    & (graph_data['Change'] == 'NOT_CONSISTENT')
]
for x_pos, y_pos, label in zip(to_label['comparison1_log2FoldChange'],
                               to_label['comparison2_log2FoldChange'],
                               to_label['gene']):
    # Small offset so the text does not sit on top of its marker
    ax.text(x=x_pos + 0.1, y=y_pos + 0.1, s=label, fontsize=6)

# Place the legend outside the axes, to the right of the plot
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)

ax.set_xlabel('log2fc(Competent vs not competent)')
ax.set_ylabel('log2fc(Treated vs not treated)')

# Save one file per requested image format
outfile = f'{outdir}/lfc_vs_lfc_plot_annotated'
for image_format in image_formats:
    fig.savefig(fname=f'{outfile}.{image_format}', bbox_inches='tight', pad_inches=0.5)

In [None]:
# Signal that all preceding cells have finished running
print('Done')

In [None]:
# Show the rows labelled NOT_CONSISTENT, i.e. not UP in both or DOWN in both comparisons
graph_data.query('Change == "NOT_CONSISTENT"')