This notebook is intended to be run on the output of the 4i pipeline hosted at https://github.com/PurvisLabTeam/4i_pipeline

It produces a single AnnData object, with user-defined metadata, for downstream processing.

In [None]:
# All imports for the notebook live in this cell, together with a few environment diagnostics.
import pickle
import os
import numpy as np
# Print the environment search paths so a misconfigured kernel is easy to spot.
print("PYTHONPATH:", os.environ.get('PYTHONPATH'))
print("PATH:", os.environ.get('PATH'))
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn import preprocessing
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
import pandas as pd
import phate
import math
import random
import gc
import scprep
from datetime import datetime, time
from matplotlib.animation import ImageMagickWriter
import matplotlib.animation as animation
import zipfile
from urllib.request import urlopen
import scipy.stats as st
from scipy.stats import norm
from scipy.stats import gaussian_kde
from scipy.stats import kde
from scipy.stats import binned_statistic
from scipy.stats import f_oneway
from matplotlib.colors import LogNorm
from mpl_toolkits.axes_grid1 import make_axes_locatable
# Use Type 42 (TrueType) font embedding for any figures saved as PDF.
plt.rcParams['pdf.fonttype'] = 42
# Record the seaborn version used for this run.
print(sns.__version__)
from anndata import AnnData
import scanpy as sc
# NOTE(review): wildcard import - any name exported by delve can shadow the names imported above.
from delve import *
import anndata as ad
from sklearn.preprocessing import MinMaxScaler
from kh import sketch
from sklearn.cluster import KMeans
import umap
# Record the scanpy version used for this run.
print(sc.__version__)
# Timestamp string (MMDDYYYY-HHMM) captured when this cell is executed.
today = datetime.now().strftime("%m%d%Y-%H%M")

In [2]:
### full_dir must be the DIRECTORY that contains the output of your "calculate cell properties"
### notebook (called cell_data by default). It is joined with 'cell_data_<well>_df.csv' when the
### wells are loaded, so it must point at a folder, not at a single .csv file.
### well_list should list every well in the dataset that you intend to combine into a single
### adata object for analysis.

full_dir = r'your/path/here'

well_list = ['well_designation']

In [3]:
# Definition to normalize the dataframe by z-score

def standardizeColumns(df):
    """Return a z-scored copy of ``df`` (each column: subtract its mean, divide by its std).

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature table, one column per feature. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New float dataframe with the same index and columns as ``df``.

    Notes
    -----
    * The result is built as a new frame instead of being written back through
      ``df.iloc[:, :] = ...``; writing floats into existing integer columns goes through
      pandas' incompatible-dtype assignment path.
    * The standard deviation is the pandas default (sample std, ``ddof=1``).
    * A column with zero standard deviation (a constant feature) would produce 0/0 = NaN;
      its divisor is replaced by 1 so the column becomes all zeros instead.
    """
    col_mean = df.mean(axis=0)
    col_std = df.std(axis=0)
    # Only exact zeros are replaced; a NaN std (e.g. a single-row frame) still propagates as NaN.
    safe_std = col_std.mask(col_std == 0, 1.0)
    return (df - col_mean) / safe_std

In [None]:
import os
import re

import pandas as pd

# --- User-defined plate layout: edit these tables to match your experiment ---------------
# Plate row letter -> treatment applied to every well in that row.
TREATMENT_BY_ROW = {'B': 'etop', 'C': 'etop', 'D': 'paracrine', 'E': 'paracrine'}
# Plate column number -> experimental group.
GROUP_BY_COLUMN = {
    2: 'control',
    3: 'timepoint 1',
    4: 'timepoint 2',
    5: 'timepoint 3',
    6: 'timepoint 4',
    7: 'timepoint 5',
    8: 'timepoint 6',
    9: 'timepoint 7',
}
# sample_id given to the first plate column of each treatment; later columns count up from it
# (etop: 1-8, paracrine: 9-16).
FIRST_SAMPLE_ID = {'etop': 1, 'paracrine': 9}
FIRST_COLUMN = min(GROUP_BY_COLUMN)


def well_metadata(well):
    """Look up the metadata for one well.

    The well designation is expected to end in a row letter followed by a column number,
    e.g. 'B2' or 'B02' (assumption - adjust the pattern if your wells are named differently).

    Returns a dict with the keys 'treatment', 'Group' and 'sample_id', or None when the
    designation cannot be parsed or the row/column is not part of the layout tables above.
    """
    # The row letter and the column number are compared separately. The earlier test,
    # `("B" or "C") and "2" in well`, reduces to `"2" in well`: it ignored the row letter
    # and also matched wells such as 'B12'.
    match = re.search(r'([A-Za-z])(\d+)$', well.strip())
    if match is None:
        return None
    row_letter = match.group(1).upper()
    column_number = int(match.group(2))
    treatment = TREATMENT_BY_ROW.get(row_letter)
    group = GROUP_BY_COLUMN.get(column_number)
    if treatment is None or group is None:
        return None
    return {
        'treatment': treatment,
        'Group': group,
        'sample_id': FIRST_SAMPLE_ID[treatment] + (column_number - FIRST_COLUMN),
    }


# Dictionary to store the DataFrame of each well
well_dataframes = {}

# Per-well frames are collected here and concatenated once after the loop
# (concatenating inside the loop re-copies all previous rows on every iteration).
frames = []
total_cells = 0

for well in well_list:
    print(f'starting Well {well}')
    full_df = pd.read_csv(os.path.join(full_dir, f'cell_data_{well}_df.csv'), sep=',')

    # Add important metadata information
    meta = well_metadata(well)
    if meta is None:
        # Keep going, but make the gap visible instead of silently mislabelling the well.
        print(f'WARNING: well {well!r} is not in the plate layout; its metadata is left empty')
        meta = {'treatment': pd.NA, 'Group': pd.NA, 'sample_id': pd.NA}
    for column_name, value in meta.items():
        full_df[column_name] = value

    # Store the DataFrame in the dictionary; the combined frame built below is an independent copy
    well_dataframes[well] = full_df
    frames.append(full_df)
    total_cells += len(full_df)
    print(len(full_df))
    print(total_cells)

# Combined table with one row per cell across all wells
fullest_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Output the length of the DataFrame for each well
for well, df in well_dataframes.items():
    print(f'Well {well} has {len(df)} cells')

# The column is named 'sample_id' (lowercase); indexing it as 'sample_ID' raised a KeyError.
if 'sample_id' in fullest_df.columns:
    fullest_df['sample_id'] = fullest_df['sample_id'].astype('category')


In [6]:
### Checkpoint: write the combined dataframe to disk - saving at key steps like this one is good practice
fullest_df.to_csv(path_or_buf=r'your/save/path/here.csv')

In [None]:
# Display the column names of the combined dataframe (useful for choosing which columns to drop
# and which to keep as metadata in the next step).
fullest_df.columns

In [5]:
# Preparation for conversion to an AnnData object: discard unneeded columns, then split the
# per-cell metadata away from the measured features.
unused_columns = [
    "Unnamed: 0",
    "bbox-0", "bbox-1", "bbox-2", "bbox-3",
    "orientation",
    "nuc_mask", "ring_mask",
]
metadata_columns = ["label", "well", "treatment", "Group", "sample_id"]

# 1) remove the columns that are not needed at all
fullest_df = fullest_df.drop(columns=unused_columns)
# 2) keep the metadata in its own dataframe
metadata = fullest_df[metadata_columns]
# 3) leave only the feature columns in the main dataframe
fullest_df = fullest_df.drop(columns=metadata_columns)

In [6]:
# Z-score every feature column of the feature table (see standardizeColumns)
standard_df = standardizeColumns(df=fullest_df)

In [7]:
# Min-max normalization of the feature table - not used in this example, provided as an alternative
scaler = MinMaxScaler().fit(fullest_df)
normalized_data = pd.DataFrame(
    scaler.transform(fullest_df),
    index=fullest_df.index,
    columns=fullest_df.columns,
)

In [None]:
# Destination of the un-normalized AnnData file
adata_save_path = r'your/save/path/here.h5ad'

# Build the AnnData object from the (un-normalized) feature dataframe
fullest_adata = ad.AnnData(fullest_df)
# Re-attach the per-cell metadata as the observation annotations
fullest_adata.obs = metadata.copy()
# Prefix every observation name with "c_"
fullest_adata.obs_names = ['c_' + str(name) for name in fullest_adata.obs_names]

# Write the complete (non-subsampled) object to disk
fullest_adata.write_h5ad(adata_save_path)

In [None]:
# Convert the z-score normalized dataframe to an anndata object
standard_adata = ad.AnnData(standard_df)
# Add metadata back to the anndata object
standard_adata.obs = metadata.copy()
# Prefix every observation name with "c_"
standard_adata.obs_names = [f'c_{i}' for i in standard_adata.obs_names]

# Save the entire z-score normalized adata file.
# The placeholder previously ended in a doubled '.h5ad.h5ad' extension; it now has a single
# extension and a name that cannot be confused with the un-normalized file.
adata_save_path = r'your/save/path/here_standardized.h5ad'
standard_adata.write_h5ad(adata_save_path)

In [None]:
### Sketching lets you subsample your data accurately.
### This example groups cells for subsampling based on the sample_id metadata column.

# sample_set_key must match the obs column name exactly: the metadata column is 'sample_id'
# (lowercase), so the previous 'sample_ID' did not refer to any existing column.
idx, standard_adata_sub = sketch(standard_adata, num_subsamples = 1200, frequency_seed = 42, sample_set_key = 'sample_id')
# Save the subsampled adata file
adata_save_path = r'my\save\path\standard_adata_sub_sub.h5ad'
standard_adata_sub.write_h5ad(adata_save_path)

Subsequent notebooks will load the saved AnnData file of your choice: either the full dataset or the subsampled dataset.
Additionally, you can produce any desired normalized or non-normalized AnnData file here. Z-score normalization is the provided example, but other methods (e.g., min-max normalization, using the code provided above) can be used with minor modifications.