In [None]:
import pandas as pd
import pickle
import tiledbsoma as soma
import tiledb 
from dotenv import load_dotenv
import os

# Load Data

In [None]:
# Read the tab-separated per-cell embedding predictions exported for this checkpoint.
emb_df = pd.read_csv(
    '/home/ubuntu/bascvi/data/embeddings/v6.0.5/1000_disc_1.0_kl/scvi-vae-epoch=29-elbo_val=0.00.ckpt_predictions.tsv',
    sep='\t',
)

# Draw a fixed-seed random subset of 500,000 rows and cast soma_joinid to int,
# since later cells use it as a read coordinate and as a merge key.
downsampled_embs_df = (
    emb_df
    .sample(n=500000, random_state=42)
    .astype({'soma_joinid': int})
)

In [None]:
# Pull AWS credentials from a local dotenv file into the process environment.
load_dotenv("/home/ubuntu/.aws.env")

ACCESS_KEY = os.getenv("ACCESS_KEY")
SECRET_KEY = os.getenv("SECRET_KEY")

# Fail fast with a readable message: os.getenv returns None for an unset
# variable, and passing None on to the TileDB config only surfaces later as an
# obscure configuration / S3 authentication error.
_missing_credentials = [
    name
    for name, value in (("ACCESS_KEY", ACCESS_KEY), ("SECRET_KEY", SECRET_KEY))
    if not value
]
if _missing_credentials:
    raise RuntimeError(
        "Missing AWS credential(s) "
        + ", ".join(_missing_credentials)
        + "; expected them in the environment or in /home/ubuntu/.aws.env"
    )

# Open the v6.0.5 SOMA experiment on S3 using an explicitly configured TileDB context.
soma_experiment = soma.Experiment.open(
    "s3://pai-scrnaseq/sctx_gui/corpora/human_v6.0.5/",
    context=soma.SOMATileDBContext(
        tiledb_ctx=tiledb.Ctx(
            {
                "vfs.s3.aws_access_key_id": ACCESS_KEY,
                "vfs.s3.aws_secret_access_key": SECRET_KEY,
                "vfs.s3.region": "us-east-2",
            }
        )
    ),
)

In [None]:


# Fetch the obs rows whose soma_joinid is in the downsampled embedding set.
sampled_joinids = downsampled_embs_df["soma_joinid"].values.tolist()
obs_df_6_0_5 = (
    soma_experiment.obs
    .read(coords=(sampled_joinids,))
    .concat()
    .to_pandas()
)

# Barcodes of the fetched cells (only referenced by the commented-out v6.0.4 code below).
barcodes = obs_df_6_0_5["barcode"].values.tolist()

# Attach the obs columns to the embeddings, keyed on soma_joinid.
# NOTE(review): this overwrites downsampled_embs_df, so re-running the cell
# merges a second time and yields suffixed duplicate columns.
downsampled_embs_df = pd.merge(downsampled_embs_df, obs_df_6_0_5, on="soma_joinid")

downsampled_embs_df


# soma_experiment = soma.Experiment.open("s3://pai-scrnaseq/sctx_gui/corpora/human_v6.0.4/", context=soma.SOMATileDBContext(tiledb_ctx=tiledb.Ctx({
#         "vfs.s3.aws_access_key_id": ACCESS_KEY,
#         "vfs.s3.aws_secret_access_key": SECRET_KEY,
#         "vfs.s3.region": "us-east-2"
#     })))

# obs_df_6_0_4 = soma_experiment.obs.read(column_names=("soma_joinid", "barcode", )).concat().to_pandas() 


# obs_df_6_0_4 = soma_experiment.obs.read(coords=(obs_df_6_0_4[obs_df_6_0_4.barcode.isin(barcodes)].soma_joinid.values.tolist(), ), column_names=("soma_joinid", "standard_true_celltype", "cells_or_nuclei", "study_name", "disease_name", "barcode", "biopsy_site", )).concat().to_pandas() 
# obs_df_6_0_4

# merged_df = obs_df_6_0_4.merge(downsampled_embs_df, on='barcode')
# merged_df

In [None]:
import anndata as ad

# NOTE(review): `umap_df` is not created by any cell above this one; it is only
# defined in later cells (CSV load / UMAP frame construction), so this cell
# fails on a fresh "Restart & Run All" in file order.
# NOTE(review): the obs coordinates are taken from umap_df.index. That is only
# right if the index holds soma_joinid values; umap_df also carries a
# `soma_joinid` column when built from downsampled_embs_df -- confirm which is intended.
obs_coords = umap_df.index.to_list()

# Pull the "row_raw" X layer of the RNA measurement for those obs rows into an AnnData,
# keeping only soma_joinid on obs and soma_joinid + gene on var.
with soma_experiment.axis_query(
    measurement_name="RNA",
    obs_query=soma.AxisQuery(coords=(obs_coords,)),
) as query:
    adata: ad.AnnData = query.to_anndata(
        X_name="row_raw",
        column_names={"obs": ["soma_joinid"], "var": ["soma_joinid", "gene"]},
    )
adata

In [None]:
# NOTE(review): this cell used to contain only a bare `o`, a name that is never
# defined in this notebook, so it raised NameError on "Restart & Run All".
# The stray expression has been removed; the cell is intentionally empty.

In [None]:
# Row sums of adata.X (one total per obs row), shown as a quick sanity check.
row_totals = adata.X.sum(axis=1)
row_totals

In [None]:
# Show the gene column of the var table.
adata.var["gene"]

In [None]:
import scanpy as sc

# Flag genes whose name starts with "MT-" so they can be used as a QC variable.
# na=False makes genes with a missing name count as non-mito instead of
# producing NaN, which would leave the mask non-boolean.
adata.var["mito"] = adata.var["gene"].str.startswith("MT-", na=False)

# calculate_qc_metrics is called with use_raw=True, so a .raw attribute must exist.
adata.raw = adata

# Compute QC metrics in place (written onto adata.obs / adata.var),
# including the metrics derived from the "mito" flag.
sc.pp.calculate_qc_metrics(adata, qc_vars=["mito"], inplace=True, use_raw=True)


In [None]:
# Work on a copy of the obs table: a later cell re-indexes qc_df in place, and
# without the copy that would silently rewrite adata.obs as well.
qc_df = adata.obs.copy()

In [None]:
# Display the per-cell QC table taken from adata.obs.
qc_df

In [None]:
# Index the QC table by soma_joinid so it can be joined on index later.
# Rebinding (instead of inplace=True) avoids mutating the frame qc_df was taken
# from, and the guard lets the cell be re-run without a KeyError.
if "soma_joinid" in qc_df.columns:
    qc_df = qc_df.set_index("soma_joinid")
elif qc_df.index.name != "soma_joinid":
    raise KeyError("soma_joinid")

In [None]:
# Attach the QC columns to the UMAP frame by matching index labels on both sides.
# NOTE(review): qc_df is indexed by soma_joinid, so this assumes umap_df.index
# also holds soma_joinid values -- confirm.
# NOTE(review): this overwrites umap_df; running the cell twice merges again and
# produces suffixed duplicate columns.
umap_df = pd.merge(umap_df, qc_df, left_index=True, right_index=True)

# UMAP

In [None]:
import umap

# Extract the 10 embedding columns (embedding_0 .. embedding_9) as a 2-D array.
embedding_dimensions = downsampled_embs_df.loc[:, ['embedding_' + str(i) for i in range(10)]].values

# Run UMAP on the embedding dimensions.
# A fixed random_state makes the layout reproducible across runs (UMAP is
# stochastic), matching the fixed seed used for the row sampling. Per the
# umap-learn docs this disables some parallelism, so the fit is slower.
umap_result = umap.UMAP(random_state=42).fit_transform(embedding_dimensions)

# Print the UMAP result
print(umap_result)

In [None]:
# Reload the previously saved UMAP frame and use its 'Unnamed: 0' column
# (the unnamed first CSV column) as the index.
umap_df = (
    pd.read_csv("/home/ubuntu/bascvi/data/embeddings/v6.0.5/1000_disc_1.0_kl/umap_df.csv")
    .set_index('Unnamed: 0')
)

In [None]:
# Display the current UMAP frame.
umap_df

In [None]:
# List the column names available on umap_df.
umap_df.columns

In [None]:
# Histogram of the log1p_total_counts column.
umap_df['log1p_total_counts'].plot.hist()

In [None]:
# Persist the UMAP frame (index included) next to the embeddings.
umap_df.to_csv(
    path_or_buf="/home/ubuntu/bascvi/data/embeddings/v6.0.5/1000_disc_1.0_kl/umap_df_qc.csv",
)

In [None]:
# Overlay one pct_counts_mito histogram per scrnaseq_protocol group.
umap_df.groupby('scrnaseq_protocol')['pct_counts_mito'].hist(
    bins=100,
    alpha=0.5,
    legend=True,
)

In [None]:
import plotly.express as px

# Interactive UMAP scatter coloured by authors_celltype; hovering a point shows
# its barcode, standard_true_celltype and disease_name.
fig = px.scatter(
    umap_df,
    x='UMAP1',
    y='UMAP2',
    color='authors_celltype',
    hover_data=['barcode', 'standard_true_celltype', 'disease_name'],
)
fig.show()

In [None]:
import plotly.express as px

# Wrap the two UMAP coordinates in a frame with named columns.
umap_df = pd.DataFrame(umap_result, columns=['UMAP1', 'UMAP2'])

# Replace empty-string cell-type labels with the literal string "None"
# (this edits downsampled_embs_df in place).
unlabelled = downsampled_embs_df['standard_true_celltype'] == ''
downsampled_embs_df.loc[unlabelled, 'standard_true_celltype'] = "None"

# Attach the embedding/obs columns by matching index labels on both frames.
umap_df = pd.merge(umap_df, downsampled_embs_df, left_index=True, right_index=True)

# Keep only rows annotated as 'cells' in cells_or_nuclei.
umap_df = umap_df.loc[umap_df['cells_or_nuclei'] == 'cells']

print(umap_df['standard_true_celltype'].value_counts())

# Interactive UMAP scatter coloured by enrichment_bool.
fig = px.scatter(
    umap_df,
    x='UMAP1',
    y='UMAP2',
    color='enrichment_bool',
    hover_data=['barcode', 'standard_true_celltype', 'disease_name'],
)
fig.show()

In [None]:
# Save the UMAP frame as CSV, without the index column.
# NOTE(review): the cell that reloads this same path indexes on an
# 'Unnamed: 0' column, which a file written with index=False does not
# contain -- confirm which side is intended.
umap_df.to_csv(
    path_or_buf='/home/ubuntu/bascvi/data/embeddings/v6.0.5/1000_disc_1.0_kl/umap_df.csv',
    index=False,
)

In [None]:
import plotly.io as pio


def _umap_scatter(frame, color_column):
    """Return a UMAP1-vs-UMAP2 scatter of `frame` coloured by `color_column`."""
    return px.scatter(
        frame,
        x='UMAP1',
        y='UMAP2',
        color=color_column,
        hover_data=['barcode', 'standard_true_celltype', 'disease_name', 'biopsy_site', 'study_name'],
        opacity=0.8,
    )


# One figure per metadata column; the cell-type view leaves out rows labelled 'None'.
fig1 = _umap_scatter(umap_df, 'study_name')
fig2 = _umap_scatter(umap_df[umap_df.standard_true_celltype != 'None'], 'standard_true_celltype')
fig3 = _umap_scatter(umap_df, 'disease_name')
fig4 = _umap_scatter(umap_df, 'biopsy_site')

# Write each figure to a standalone HTML file in the working directory.
for _figure, _html_name in (
    (fig1, 'v6_1000_500k_study_name.html'),
    (fig2, 'v6_1000_500k_standard_true_celltype.html'),
    (fig3, 'v6_1000_500k_disease_name.html'),
    (fig4, 'v6_1000_500k_biopsy_site.html'),
):
    pio.write_html(_figure, _html_name)
