# Analyse VST results

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import os
import math

from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import squareform

import plotly.express as px
from sklearn.decomposition import PCA

In [None]:
# Setup: input file, colour scheme and output settings
vst_data_file = 'vst_qc/vst_normalised_data.tsv'

# Hex colour codes of the seaborn palette the cell line colours are taken from
palette_to_choose = sns.color_palette('colorblind').as_hex()

# One fixed colour per cell line, handed out in palette order
colorDict = dict(zip(
    ('Burb', 'Fiaj', 'Hehd', 'Sojd', 'H1', 'H9', 'HUES8', 'Kolf'),
    palette_to_choose,
))

ntop = 500  # Number of most variable genes kept for the PCA and related plots

# Separator between the cell line name and the rest of a sample name
cell_line_delimiter = '_'

# Output folder and the image formats every figure is saved in
outdir = 'datset_similarity'
image_formats = ('png', 'svg', 'eps')

In [None]:
# Display the palette as the cell output so the colours can be checked
palette_to_choose

In [None]:
# Load the tab-separated expression table
print(f'Reading in: {vst_data_file}')
vst_data = pd.read_csv(vst_data_file, sep='\t')

# Every column except the first one is counted as a sample
n_regions, n_columns = vst_data.shape
print(f'{n_columns - 1} samples with {n_regions} quantitated regions')



In [None]:
# Display the loaded table as the cell output
vst_data

In [None]:
# Copy the data and rename '_CR_' to '-CHR_' in the column names
vst_reps_combined = vst_data.copy()

# str.replace returns a new Index instead of editing in place, so the result
# has to be assigned for the rename to take effect
column_names = vst_reps_combined.columns.str.replace(pat='_CR_', repl='-CHR_')
vst_reps_combined.columns = column_names
vst_reps_combined

In [None]:
# Rename '_CR_' to '-CHR_' in the column names; after the rename, a split on
# '_' keeps the '-CHR' tag in the first field of the name
column_names = vst_data.columns.str.replace(pat='_CR_', repl='-CHR_')

# Apply the new names to a copy so vst_data itself is left untouched
vst_reps_combined = vst_data.copy()
vst_reps_combined.columns = column_names
vst_reps_combined

In [None]:
# Make a dataset with the replicates combined (first step: a renamed copy)
vst_reps_combined = vst_data.copy()

# Swap '_CR_' for '-CHR_' in the column names so that a split on '_' keeps
# the '-CHR' tag in the first field of the name
vst_reps_combined.columns = vst_reps_combined.columns.str.replace(pat='_CR_', repl='-CHR_')
column_names = vst_reps_combined.columns
vst_reps_combined



In [None]:
# Make a dataset with the replicates combined

# Swap '_CR_' for '-CHR_' in the column names so that the split on '_' below
# keeps the '-CHR' tag in the first field of the name
column_names = vst_data.columns.str.replace(pat='_CR_', repl='-CHR_')
vst_reps_combined = vst_data.copy()
vst_reps_combined.columns = column_names

vst_reps_combined = (
    vst_reps_combined
    # Long format: one row per gene / sample column
    .melt(id_vars='gene_id', var_name='Sample', value_name='Expression')
    # Keep only the text before the first '_' so replicates share one label
    .assign(Sample=lambda long_df: long_df['Sample'].str.split('_').str[0])
    # Mean expression for every gene / label pair
    .groupby(by=['gene_id', 'Sample'])
    .mean()
    .reset_index()
    # Back to wide format: one row per gene, one column per label
    .pivot(index='gene_id', columns='Sample', values='Expression')
    .reset_index()
    # Drop the 'Sample' name that pivot leaves on the column index
    .rename_axis(None, axis='columns')
)

In [None]:
# Display the replicate-combined table as the cell output
vst_reps_combined

## Separate replicates

In [None]:
# Select the most variable genes to use for PCA etc
vst_data = (
    vst_data
    # Row-wise variance over every column except the first
    .assign(variance=lambda table: table.iloc[:, 1:].var(axis=1))
    .sort_values(by='variance', ascending=False, axis=0)
    .head(ntop)
    # The helper column is only needed for the ranking
    .drop(columns='variance')
)

In [None]:
# Create the output directory for the figures. exist_ok=True replaces the
# separate existence check, so there is no gap between checking and creating
# and re-running the cell is harmless.
os.makedirs(outdir, exist_ok=True)

In [None]:
# Correlation heatmap
# Pairwise correlation between all columns except the first
pearson_matrix = vst_data.iloc[:, 1:].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(pearson_matrix, annot=True, ax=ax)
ax.set_title('Correlation heatmap of datasets')

# Save the same figure once per requested image format
outfile = f'{outdir}/seprate_reps_correlation_matrix_plot'
for image_format in image_formats:
    fig.savefig(fname=f'{outfile}.{image_format}', bbox_inches='tight', pad_inches=0.5)

plt.show()

In [None]:
# Make dendrogram
plt.figure(figsize=(10, 10))

# Give the rows the same labels as the columns
pearson_matrix.index = pearson_matrix.columns

# Turn correlation into a distance: 0 when |r| is 1, 1 when r is 0
dissimilarity = 1 - pearson_matrix.abs()

# Average-linkage clustering on the condensed form of the distance matrix
Z = linkage(squareform(dissimilarity), method='average')

dendrogram(
    Z,
    labels=pearson_matrix.index,
    orientation='left',
    color_threshold=0,
    above_threshold_color='black',
)

# Colour every leaf label by the text before the first delimiter,
# which is looked up in colorDict
ax = plt.gca()
y_labels = ax.get_ymajorticklabels()
for tick_label in y_labels:
    line_name = tick_label.get_text().partition(cell_line_delimiter)[0]
    tick_label.set_color(colorDict[line_name])

# Save the same figure once per requested image format
outfile = f'{outdir}/seprate_reps_similarity_tree'
for image_format in image_formats:
    plt.savefig(fname=f'{outfile}.{image_format}', bbox_inches='tight', pad_inches=0.5)

plt.show()

In [None]:
# PCA
# One row per sample column of vst_data (the first column is skipped)
pca_data = vst_data.iloc[:, 1:].transpose()

sample_names = pca_data.index.to_series().reset_index(drop=True)

# Cell line = text before the first delimiter in the sample name
cell_line_labels = sample_names.str.split(cell_line_delimiter, expand=True)[0]

# Treated samples are recognised by the '_CR_' tag, the same test the PCA
# scatter plot of this section uses. The earlier comparison of the second
# name field with 'CHR2' cannot match a '_CR_'-tagged name (that field is
# 'CR'), so every sample was drawn as 'Untreated'.
# NOTE(review): confirm the tag against the real sample names.
is_treated = sample_names.str.contains('_CR_', regex=False, na=False)
treated_labels = pd.Series(np.where(is_treated, 'Treated', 'Untreated'))

# Custom cell line colours, in order of first appearance of each cell line
color_discrete_sequence = [
    colorDict[cell_line] for cell_line in cell_line_labels.drop_duplicates()
]

pca = PCA()
components = pca.fit_transform(pca_data)

# Axis titles: component number plus percentage of variance explained
labels = {
    str(i): f"PC {i+1} ({var:.1f}%)"
    for i, var in enumerate(pca.explained_variance_ratio_ * 100)
}

fig = px.scatter_matrix(
    components,
    labels=labels,
    # First four components, or fewer when the PCA produced fewer than four
    dimensions=range(min(4, components.shape[1])),
    color=cell_line_labels,
    width=800,
    height=800,
    symbol=treated_labels,
    color_discrete_sequence=color_discrete_sequence,
)
fig.update_traces(diagonal_visible=False)
fig.show()

In [None]:
# Scatter plot of two chosen principal components
pcx = 1
pcy = 2
marker_size = 200
markers = ['o', '^']

# Percentage of variance explained by each chosen component (for the axis labels)
pc_variance_x = round(pca.explained_variance_ratio_[pcx - 1] * 100, 1)
pc_variance_y = round(pca.explained_variance_ratio_[pcy - 1] * 100, 1)

# From here on pcx / pcy hold the column names instead of the numbers
pcx, pcy = f'PC{pcx}', f'PC{pcy}'

x_axis_label = f'{pcx} ({pc_variance_x}%)'
y_axis_label = f'{pcy} ({pc_variance_y}%)'

# One row per sample; the component columns are named PC1, PC2, ...
column_names = [f'PC{number}' for number in range(1, components.shape[1] + 1)]
scatter_plot_data = pd.DataFrame(components, columns=column_names)

# Put the sample annotation in front of the component columns
scatter_plot_data.insert(0, 'Sample', pca_data.index)
scatter_plot_data.insert(
    1, 'Cell_Line',
    scatter_plot_data['Sample'].str.split(cell_line_delimiter).str[0],
)
scatter_plot_data.insert(
    2, 'Treated',
    scatter_plot_data['Sample'].str.contains(pat='_CR_'),
)

# Plot graph
# Colours listed in order of first appearance of each cell line
custom_palette = [
    colorDict[line_name]
    for line_name in scatter_plot_data['Cell_Line'].drop_duplicates()
]

sns.scatterplot(
    data=scatter_plot_data, x=pcx, y=pcy,
    hue='Cell_Line', style='Treated',
    s=marker_size, markers=markers,
    edgecolor=None, palette=custom_palette,
)
plt.xlabel(x_axis_label)
plt.ylabel(y_axis_label)
# Legend goes outside the axes, to the right
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)

# Save the same figure once per requested image format
outfile = f'{outdir}/seprate_reps_pca'
for image_format in image_formats:
    plt.savefig(fname=f'{outfile}.{image_format}', bbox_inches='tight', pad_inches=0.5)

plt.show()

In [None]:
# Scree plot: percentage of variance explained by each principal component
PC_values = np.arange(1, pca.n_components_ + 1)

plt.plot(PC_values, 100 * pca.explained_variance_ratio_, 'o-', color='blue', linewidth=2)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Variance Explained (%)')

# Save the same figure once per requested image format
outfile = f'{outdir}/seprate_reps_scree_plot'
for image_format in image_formats:
    plt.savefig(fname=f'{outfile}.{image_format}', bbox_inches='tight', pad_inches=0.5)

plt.show()

## Combined replicates

In [None]:
# Now make the vst data the combined dataset!
# vst_data is rebound to a copy of the replicate-combined table, so the
# cells below, which read vst_data, work on the combined data.
vst_data = vst_reps_combined.copy()

In [None]:
# Select the most variable genes to use for PCA etc
vst_data = (
    vst_data
    # Row-wise variance over every column except the first
    .assign(variance=lambda table: table.iloc[:, 1:].var(axis=1))
    .sort_values(by='variance', ascending=False, axis=0)
    .head(ntop)
    # The helper column is only needed for the ranking
    .drop(columns='variance')
)

In [None]:
# Correlation heatmap
# Pairwise correlation between all columns except the first
pearson_matrix = vst_data.iloc[:, 1:].corr()

plt.figure(figsize=(10, 8))
heatmap_axes = sns.heatmap(pearson_matrix, annot=True)
heatmap_axes.set_title('Correlation heatmap of datasets')

# Save the same figure once per requested image format
outfile = f'{outdir}/combined_reps_correlation_matrix_plot'
for image_format in image_formats:
    heatmap_axes.figure.savefig(fname=f'{outfile}.{image_format}', bbox_inches='tight', pad_inches=0.5)

plt.show()

In [None]:
# Make dendrogram
plt.figure(figsize=(10, 10))

# Give the rows the same labels as the columns
pearson_matrix.index = pearson_matrix.columns

# Turn correlation into a distance: 0 when |r| is 1, 1 when r is 0
dissimilarity = 1 - pearson_matrix.abs()

# Average-linkage clustering on the condensed form of the distance matrix
Z = linkage(squareform(dissimilarity), method='average')

dendrogram(
    Z,
    labels=pearson_matrix.index,
    orientation='left',
    color_threshold=0,
    above_threshold_color='black',
)

# Colour every leaf label by its cell line: the text before the first
# delimiter, with anything from the first '-' onwards removed
ax = plt.gca()
y_labels = ax.get_ymajorticklabels()
for tick_label in y_labels:
    first_field = tick_label.get_text().partition(cell_line_delimiter)[0]
    line_name = first_field.partition('-')[0]
    tick_label.set_color(colorDict[line_name])

# Save the same figure once per requested image format
outfile = f'{outdir}/combined_reps_similarity_tree'
for image_format in image_formats:
    plt.savefig(fname=f'{outfile}.{image_format}', bbox_inches='tight', pad_inches=0.5)

plt.show()

In [None]:
# PCA
# One row per sample column of vst_data (the first column is skipped)
pca_data = vst_data.iloc[:, 1:].transpose()

sample_names = pca_data.index.to_series().reset_index(drop=True)

# Cell line = text before the first '-' in the combined sample label
cell_line_labels = sample_names.str.split('-', expand=True)[0]

# Treated samples are recognised by the '-CHR' tag, the same test the PCA
# scatter plot of this section uses. The earlier code compared the text
# after '-' with 'CHR2'; the combined labels end in '-CHR', so nothing
# matched and every sample was drawn as 'Untreated'. It also raised a
# KeyError when no label contained a '-'.
is_treated = sample_names.str.contains('-CHR', regex=False, na=False)
treated_labels = pd.Series(np.where(is_treated, 'Treated', 'Untreated'))

# Custom cell line colours, in order of first appearance of each cell line
color_discrete_sequence = [
    colorDict[cell_line] for cell_line in cell_line_labels.drop_duplicates()
]

pca = PCA()
components = pca.fit_transform(pca_data)

# Axis titles: component number plus percentage of variance explained
labels = {
    str(i): f"PC {i+1} ({var:.1f}%)"
    for i, var in enumerate(pca.explained_variance_ratio_ * 100)
}

fig = px.scatter_matrix(
    components,
    labels=labels,
    # First four components, or fewer when the PCA produced fewer than four
    dimensions=range(min(4, components.shape[1])),
    color=cell_line_labels,
    width=800,
    height=800,
    symbol=treated_labels,
    color_discrete_sequence=color_discrete_sequence,
)
fig.update_traces(diagonal_visible=False)
fig.show()

In [None]:
# Scatter plot of two chosen principal components
pcx = 1
pcy = 2
marker_size = 200
markers = ['o', '^']

# Percentage of variance explained by each chosen component (for the axis labels)
pc_variance_x = round(pca.explained_variance_ratio_[pcx - 1] * 100, 1)
pc_variance_y = round(pca.explained_variance_ratio_[pcy - 1] * 100, 1)

# From here on pcx / pcy hold the column names instead of the numbers
pcx, pcy = f'PC{pcx}', f'PC{pcy}'

x_axis_label = f'{pcx} ({pc_variance_x}%)'
y_axis_label = f'{pcy} ({pc_variance_y}%)'

# One row per sample; the component columns are named PC1, PC2, ...
column_names = [f'PC{number}' for number in range(1, components.shape[1] + 1)]
scatter_plot_data = pd.DataFrame(components, columns=column_names)

# Put the sample annotation in front of the component columns
scatter_plot_data.insert(0, 'Sample', pca_data.index)
scatter_plot_data.insert(
    1, 'Cell_Line',
    scatter_plot_data['Sample'].str.split('-').str[0],
)
scatter_plot_data.insert(
    2, 'Treated',
    scatter_plot_data['Sample'].str.contains(pat='-CHR'),
)

# Plot graph
# Colours listed in order of first appearance of each cell line
custom_palette = [
    colorDict[line_name]
    for line_name in scatter_plot_data['Cell_Line'].drop_duplicates()
]

sns.scatterplot(
    data=scatter_plot_data, x=pcx, y=pcy,
    hue='Cell_Line', style='Treated',
    s=marker_size, markers=markers,
    edgecolor=None, palette=custom_palette,
)

plt.xlabel(x_axis_label)
plt.ylabel(y_axis_label)
# Legend goes outside the axes, to the right
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)

# Save the same figure once per requested image format
outfile = f'{outdir}/combined_reps_pca'
for image_format in image_formats:
    plt.savefig(fname=f'{outfile}.{image_format}', bbox_inches='tight', pad_inches=0.5)

plt.show()

In [None]:
# Scree plot: percentage of variance explained by each principal component
PC_values = np.arange(1, pca.n_components_ + 1)

plt.plot(PC_values, 100 * pca.explained_variance_ratio_, 'o-', color='blue', linewidth=2)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Variance Explained (%)')

# Save the same figure once per requested image format
outfile = f'{outdir}/combined_reps_scree_plot'
for image_format in image_formats:
    plt.savefig(fname=f'{outfile}.{image_format}', bbox_inches='tight', pad_inches=0.5)

plt.show()

In [None]:
# Final cell: print a completion message
print('Done')