This notebook produces violin plots by feature and timepoint using the AnnData object produced by the adata_conversion notebook.

In [None]:
import pickle
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn import preprocessing
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
import pandas as pd
import phate
import math
import random
import gc
import scprep
from datetime import datetime, time
from matplotlib.animation import ImageMagickWriter
import matplotlib.animation as animation
import zipfile
from urllib.request import urlopen
import scipy.stats as st
from scipy.stats import norm
from scipy.stats import gaussian_kde
from scipy.stats import kde
from scipy.stats import binned_statistic
from scipy.stats import f_oneway
from matplotlib.colors import LogNorm
from mpl_toolkits.axes_grid1 import make_axes_locatable
# Font type used when writing PDF figures (matplotlib documents 42 as TrueType — confirm this is the intent)
plt.rcParams['pdf.fonttype'] = 42
# Record the seaborn version in the notebook output
print(sns.__version__)
from anndata import AnnData
import scanpy as sc
# NOTE(review): parse_input, construct_affinity and laplacian_score are used later
# but never imported by name; presumably they come from this star import — confirm.
from delve import *
import anndata as ad
from sklearn.preprocessing import MinMaxScaler
from kh import sketch
from sklearn.cluster import KMeans
import umap
# Record the scanpy version in the notebook output
print(sc.__version__)
# Timestamp string (month-day-year, then hour-minute) captured when this cell runs
today = datetime.now().strftime("%m%d%Y-%H%M")

In [None]:
# Read back in the subsampled AnnData file.
# Replace the placeholder below with the path to your own .h5ad file.
adata_save_path = r'your/save/path.here.h5ad'
# Use the explicitly imported alias (`import anndata as ad`). The bare name
# `anndata` is never bound by this notebook's own imports, so calling
# `anndata.read_h5ad` only works if a star import happens to leak it.
standard_trimmed_noPSTAT5_adata_sub = ad.read_h5ad(adata_save_path)

In [4]:
def laplacian_score_fs(adata = None,
                    k: int  = None,
                    n_jobs: int  = -1):
    """Rank the features of `adata` by Laplacian score.

    The input is unpacked with `parse_input`, an affinity matrix is built
    with `construct_affinity`, and `laplacian_score` is evaluated on the
    result.

    Parameters
    ----------
    adata
        Object handed to `parse_input`, which yields the data matrix, the
        feature names and the observation names.
    k : int
        Forwarded to `construct_affinity` (presumably the number of nearest
        neighbours — confirm against that helper).
    n_jobs : int
        Forwarded to `construct_affinity`.

    Returns
    -------
    pandas.DataFrame
        One row per feature name with a single `laplacian_score` column,
        sorted in ascending order of score.
    """
    # The observation names are returned by parse_input but not needed here.
    X, feature_names, _ = parse_input(adata)
    affinity = construct_affinity(X = X, k = k, n_jobs = n_jobs)

    ranking = pd.DataFrame(
        laplacian_score(X = X, W = affinity),
        index = feature_names,
        columns = ['laplacian_score'],
    )
    return ranking.sort_values(by = 'laplacian_score', ascending = True)

In [5]:
# Score every feature of the subsampled AnnData; k = 100 is forwarded to construct_affinity
l_score_standard = laplacian_score_fs(standard_trimmed_noPSTAT5_adata_sub, k = 100)

In [None]:
# Number of rows in the score table (one row per feature name)
len(l_score_standard)

In [None]:
# First 46 feature names; the table is sorted by ascending laplacian_score
l_score_standard.index[:46]

In [None]:
# Per-cell sample IDs and treatment labels from the AnnData obs table
sample_ids = standard_trimmed_noPSTAT5_adata_sub.obs['sample_ID']
treatment = standard_trimmed_noPSTAT5_adata_sub.obs['treatment']

# Sample IDs to include; loop-invariant, so defined once instead of per protein
selected_sample_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14]

# First 84 features of the score table (sorted by ascending laplacian_score)
protein_list = l_score_standard.index[:84]
for protein_name in protein_list:

    # Column index of this protein in adata.X
    protein_index = np.where(standard_trimmed_noPSTAT5_adata_sub.var_names == protein_name)[0][0]

    # One array of intensities and one treatment label per plotted sample
    data = []
    treatments = []

    for sample_id in selected_sample_ids:
        # Boolean mask over cells belonging to the current sample
        sample_mask = (sample_ids == sample_id).to_numpy()

        # Skip absent samples *before* looking up their treatment: taking the
        # first element of an empty selection raises instead of being skipped.
        if not sample_mask.any():
            continue

        # NOTE(review): assumes adata.X is a dense array — confirm
        data.append(standard_trimmed_noPSTAT5_adata_sub.X[sample_mask, protein_index])
        # .iloc[0] is an explicit positional lookup; plain [0] on a Series is a
        # label lookup that only works through a deprecated pandas fallback.
        treatments.append(treatment[sample_mask].iloc[0])

    # Create the violin plot only if at least one sample contributed data
    if data:
        # Pool all plotted values once; reused for both y-axis limits
        pooled = np.concatenate(data)

        plt.figure(figsize=(12, 4))  # Set the size of the plot
        plt.violinplot(data, showmeans=False, showmedians=True, showextrema=False)  # Remove vertical lines

        # Axis labels and title
        plt.xlabel('Etoposide Exposure Duration')
        plt.ylabel('Z Normalized Intensity')
        # Positions follow the violins actually drawn (len(data)), so skipped
        # samples cannot cause a tick/label count mismatch. Rotation is applied
        # in the same call that sets the labels.
        plt.xticks(np.arange(1, len(data) + 1), treatments, rotation=45)
        plt.title(protein_name)

        # Clip the y-axis to the 1st-95th percentile of the pooled values
        plt.ylim(bottom=np.percentile(pooled, 1), top=np.percentile(pooled, 95))  # Adjust the percentile as needed

        # Show the plot
        plt.show()


Outputting CSVs for import into GraphPad

In [None]:
import os
import pandas as pd

# Define the folder where you want to save the CSV files
# NOTE(review): machine-specific absolute path — point this at your own folder before running
output_folder = r'C:\Users\gases\Desktop\Representative_Images\Fig2_CSVs'
os.makedirs(output_folder, exist_ok=True)

# Per-cell sample IDs and treatment labels from the AnnData obs table
sample_ids = standard_trimmed_noPSTAT5_adata_sub.obs['sample_ID']
treatment = standard_trimmed_noPSTAT5_adata_sub.obs['treatment']

# Sample IDs to include; loop-invariant, so defined once instead of per protein
selected_sample_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14]

# First 84 features of the score table (sorted by ascending laplacian_score)
protein_list = l_score_standard.index[:84]
for protein_name in protein_list:

    # Column index of this protein in adata.X
    protein_index = np.where(standard_trimmed_noPSTAT5_adata_sub.var_names == protein_name)[0][0]

    # One (initially empty) column per treatment label present in obs; labels
    # that belong only to unselected samples stay empty in the CSV.
    data_dict = {treat: [] for treat in treatment.unique()}

    for sample_id in selected_sample_ids:
        # Boolean mask over cells belonging to the current sample
        sample_mask = (sample_ids == sample_id).to_numpy()

        # Skip absent samples *before* looking up their treatment: taking the
        # first element of an empty selection raises instead of being skipped.
        if not sample_mask.any():
            continue

        # .iloc[0] is an explicit positional lookup; plain [0] on a Series is a
        # label lookup that only works through a deprecated pandas fallback.
        sample_treatment = treatment[sample_mask].iloc[0]

        # Append this sample's values to its treatment column
        # NOTE(review): assumes adata.X is a dense array — confirm
        data_dict[sample_treatment].extend(
            standard_trimmed_noPSTAT5_adata_sub.X[sample_mask, protein_index]
        )

    # Wrapping each column in a Series lets pandas pad the shorter ones with NaN
    df = pd.DataFrame({treat: pd.Series(values) for treat, values in data_dict.items()})

    # Save the DataFrame as a CSV file named after the protein
    csv_file_path = os.path.join(output_folder, f"{protein_name}.csv")
    df.to_csv(csv_file_path, index=False)

    # Optionally, print a message for confirmation
    print(f"Saved {protein_name}.csv to {output_folder}")


Save Violin Plots to a Folder

In [13]:
import os
import numpy as np
import matplotlib.pyplot as plt

# Define the folder where you want to save the PNG files
output_folder = r'your/save/folder/path/here'  # Replace with your desired folder path

# Ensure the folder exists (no error if it is already there)
os.makedirs(output_folder, exist_ok=True)

# Per-cell sample IDs and treatment labels from the AnnData obs table
sample_ids = standard_trimmed_noPSTAT5_adata_sub.obs['sample_ID']
treatment = standard_trimmed_noPSTAT5_adata_sub.obs['treatment']

# Sample IDs to include; loop-invariant, so defined once instead of per protein
selected_sample_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14]

# First 84 features of the score table (sorted by ascending laplacian_score)
protein_list = l_score_standard.index[:84]
for protein_name in protein_list:

    # Column index of this protein in adata.X
    protein_index = np.where(standard_trimmed_noPSTAT5_adata_sub.var_names == protein_name)[0][0]

    # One array of intensities and one treatment label per plotted sample
    data = []
    treatments = []

    for sample_id in selected_sample_ids:
        # Boolean mask over cells belonging to the current sample
        sample_mask = (sample_ids == sample_id).to_numpy()

        # Skip absent samples *before* looking up their treatment: taking the
        # first element of an empty selection raises instead of being skipped.
        if not sample_mask.any():
            continue

        # NOTE(review): assumes adata.X is a dense array — confirm
        data.append(standard_trimmed_noPSTAT5_adata_sub.X[sample_mask, protein_index])
        # .iloc[0] is an explicit positional lookup; plain [0] on a Series is a
        # label lookup that only works through a deprecated pandas fallback.
        treatments.append(treatment[sample_mask].iloc[0])

    # Create the violin plot only if at least one sample contributed data
    if data:
        # Pool all plotted values once; reused for both y-axis limits
        pooled = np.concatenate(data)

        plt.figure(figsize=(12, 4))  # Set the size of the plot
        plt.violinplot(data, showmeans=False, showmedians=True, showextrema=False)  # Remove vertical lines

        # Axis labels and title
        plt.xlabel('Etoposide Exposure Duration')
        plt.ylabel('Z Normalized Intensity')
        # Positions follow the violins actually drawn (len(data)), so skipped
        # samples cannot cause a tick/label count mismatch. Rotation is applied
        # in the same call that sets the labels.
        plt.xticks(np.arange(1, len(data) + 1), treatments, rotation=45)
        plt.title(protein_name)

        # Clip the y-axis to the 1st-95th percentile of the pooled values
        plt.ylim(bottom=np.percentile(pooled, 1), top=np.percentile(pooled, 95))  # Adjust the percentile as needed

        # Save the plot as a PNG file named after the protein
        output_path = os.path.join(output_folder, f'{protein_name}.png')
        plt.savefig(output_path, format='png', dpi=300)  # Save as high-quality PNG

        # Close the plot to free memory
        plt.close()
