In [1]:
import scanpy as sc
import anndata as ad
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import mygene

In [2]:
sc.settings.verbosity = 3
fig_res = 150
sc.set_figure_params(dpi = fig_res, frameon = 0)

In [3]:
ext_obj = sc.read('S:/data cache/code_in_out/external_datasets/gim and eim unified umap of all file.h5ad')

In [4]:
str(ext_obj)

"AnnData object with n_obs × n_vars = 146583 × 33145\n    obs: 'Sample', 'Tissue_in_paper', 'Batch', 'Sample_Barcode', 'sum', 'detected', 'Study', 'Patient_type', 'Patient_status', 'MT.prop', 'sizeFactor', 'Global_cluster_selected', 'Celltypes_global', 'Tissuetypes_global', 'Detailed_Cell_Type', 'cell_type_ontology_term_id', 'assay_ontology_term_id', 'tissue_ontology_term_id', 'disease_ontology_term_id', 'sex_ontology_term_id', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'donor_id', 'suspension_type', 'development_stage_ontology_term_id', 'is_primary_data', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'\n    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length'\n    uns: 'X_name', 'citation', 'schema_reference', 'schema_version', 'title'\n    obsm: 'X_umap_MinDist_0.01_N_Neighbors_15', 'X_umap_MinDist_0.05_N_Neigh

In [5]:
# Remove version numbers from Ensembl IDs
ext_obj.var['ensembl_id'] = ext_obj.var_names.str.replace(r'/./d+$', '', regex=True)

In [6]:
# Map Ensembl IDs to gene symbols
mg = mygene.MyGeneInfo()
gene_info = mg.querymany(
    ext_obj.var['ensembl_id'],
    scopes='ensembl.gene',
    fields='symbol',
    species='human'
)

4 input query terms found dup hits:	[('ENSG00000249738', 2), ('ENSG00000278903', 3), ('ENSG00000188660', 2), ('ENSG00000268674', 3)]
104 input query terms found no hit:	['ENSG00000230699', 'ENSG00000241180', 'ENSG00000226849', 'ENSG00000272482', 'ENSG00000264443', 'ENS


In [7]:
# Convert query results to DataFrame and handle duplicates
gene_info_df = pd.DataFrame(gene_info)
gene_info_df.drop_duplicates(subset='query', inplace=True)

In [8]:
# Create a mapping dictionary from Ensembl IDs to gene symbols
ens_to_symbol = pd.Series(
    gene_info_df['symbol'].values,
    index=gene_info_df['query']
).to_dict()

In [9]:
# Map Ensembl IDs in fibros_copy to gene symbols
ext_obj.var['gene_symbol'] = ext_obj.var['ensembl_id'].map(ens_to_symbol)

In [10]:
# Replace missing symbols with Ensembl IDs
ext_obj.var['gene_symbol'].fillna(ext_obj.var['ensembl_id'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ext_obj.var['gene_symbol'].fillna(ext_obj.var['ensembl_id'], inplace=True)


In [11]:
# Ensure gene symbols are strings
ext_obj.var['gene_symbol'] = ext_obj.var['gene_symbol'].astype(str)

In [12]:
# Update var_names with gene symbols but keep var.index as Ensembl IDs
ext_obj.var_names = ext_obj.var['gene_symbol']
ext_obj.var_names_make_unique()

In [13]:
# Create a mapping from Ensembl IDs to gene symbols
ensembl_to_symbol = ext_obj.var.set_index('ensembl_id')['gene_symbol'].to_dict()

In [14]:
sc.pp.pca(ext_obj, n_comps = 50)

computing PCA
    with n_comps=50
    finished (0:01:07)


In [15]:
sc.external.pp.harmony_integrate(ext_obj, key='Sample')

2024-10-21 14:15:56,173 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2024-10-21 14:16:07,734 - harmonypy - INFO - sklearn.KMeans initialization complete.
2024-10-21 14:16:08,444 - harmonypy - INFO - Iteration 1 of 10
2024-10-21 14:17:18,654 - harmonypy - INFO - Iteration 2 of 10
2024-10-21 14:18:22,625 - harmonypy - INFO - Iteration 3 of 10
2024-10-21 14:19:29,400 - harmonypy - INFO - Iteration 4 of 10
2024-10-21 14:20:43,892 - harmonypy - INFO - Iteration 5 of 10


KeyboardInterrupt: 

In [None]:
sc.pp.normalize_total(ext_obj, target_sum=1e4)

In [None]:
sc.pp.log1p(ext_obj)

In [None]:
sc.pp.scale(ext_obj, max_value=10)