In [14]:
!pip install -r requirements.txt

Collecting scanpy (from -r requirements.txt (line 3))
  Using cached scanpy-1.11.4-py3-none-any.whl.metadata (9.2 kB)
Collecting anndata>=0.8 (from scanpy->-r requirements.txt (line 3))
  Using cached anndata-0.12.1-py3-none-any.whl.metadata (9.6 kB)
Collecting h5py>=3.7.0 (from scanpy->-r requirements.txt (line 3))
  Using cached h5py-3.14.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (2.7 kB)
Collecting joblib (from scanpy->-r requirements.txt (line 3))
  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting legacy-api-wrap>=1.4.1 (from scanpy->-r requirements.txt (line 3))
  Using cached legacy_api_wrap-1.4.1-py3-none-any.whl.metadata (2.1 kB)
Collecting natsort (from scanpy->-r requirements.txt (line 3))
  Using cached natsort-8.4.0-py3-none-any.whl.metadata (21 kB)
Collecting networkx>=2.7.1 (from scanpy->-r requirements.txt (line 3))
  Using cached networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting numba>=0.57.1 (from scanpy->-r requirements

In [13]:
import requests
from tqdm import tqdm
import os

# File URL and save location
url = "https://datasets.cellxgene.cziscience.com/d25acbe9-9804-48ba-9e40-10beee03eb25.h5ad"
save_dir = "data/"
filename = "dataset.h5ad"

# Make sure the directory exists
os.makedirs(save_dir, exist_ok=True)
filepath = os.path.join(save_dir, filename)

# Stream into a temporary ".part" file and rename it only after the transfer
# finishes, so an interrupted run never leaves a truncated file at `filepath`.
partial_path = filepath + ".part"

# 1 MiB chunks: far fewer Python-level iterations than 1 KB for a multi-GB file.
block_size = 1024 * 1024

if os.path.exists(filepath):
    # The file is several GB; do not fetch it again on every "Run All".
    print(f"Already downloaded: {filepath}")
else:
    # timeout=(connect, read) in seconds so a stalled connection raises instead
    # of hanging; the `with` block releases the connection when done.
    with requests.get(url, stream=True, timeout=(10, 60)) as response:
        response.raise_for_status()

        # Total size in bytes; 0 means the server did not send Content-Length.
        total_size = int(response.headers.get("Content-Length", 0))

        # Progress bar (total=None lets tqdm show a plain counter when the
        # size is unknown, instead of a bar with total 0).
        with open(partial_path, "wb") as file, tqdm(
            total=total_size or None, unit="B", unit_scale=True, desc=filename
        ) as progress_bar:
            for data in response.iter_content(block_size):
                file.write(data)
                progress_bar.update(len(data))

    # Move the completed download into place.
    os.replace(partial_path, filepath)
    print(f"Download complete: {filepath}")

dataset.h5ad: 100%|██████████| 6.51G/6.51G [06:08<00:00, 17.6MB/s]  


Download complete: data/dataset.h5ad


In [16]:
import scanpy as sc

# Read the .h5ad file from disk into `adata`.
h5ad_path = "data/dataset.h5ad"
adata = sc.read_h5ad(h5ad_path)

In [56]:
# Split the per-cell metadata table on the "disease" column.
# Note: `normal` is every row whose label is NOT "Alzheimer disease", so it
# also contains any other labels (or missing values) present in that column.
disease_labels = adata.obs["disease"]
diseased = adata.obs.loc[disease_labels == "Alzheimer disease"]
normal = adata.obs.loc[disease_labels != "Alzheimer disease"]
len(diseased), len(normal)

(310773, 113755)

In [66]:
# Within each group, keep only the rows whose cell_type is "microglial cell".
microglia_label = "microglial cell"
diseased_microglial = diseased.loc[diseased["cell_type"] == microglia_label]
normal_microglial = normal.loc[normal["cell_type"] == microglia_label]

len(diseased_microglial), len(normal_microglial)

(4282, 789)

In [97]:
# Row labels of the selected cells, as plain Python lists.
diseased_names = diseased_microglial.index.tolist()
normal_names = normal_microglial.index.tolist()

In [98]:
# Translate the cell labels into integer positions within adata.obs_names.
# `ids` holds the diseased group, `normal_ids` the comparison group.
obs_index = adata.obs_names
ids = obs_index.get_indexer(diseased_names)
normal_ids = obs_index.get_indexer(normal_names)

In [102]:
import pandas as pd
# Build one dense cells-by-genes table per group, with columns labelled by
# the feature_name entries of adata.var.
gene_labels = adata.var["feature_name"]
diseased_matrix = adata.X[ids].toarray()
normal_matrix = adata.X[normal_ids].toarray()
diseased_df = pd.DataFrame(diseased_matrix, columns=gene_labels)
normal_df = pd.DataFrame(normal_matrix, columns=gene_labels)

In [104]:
# Column-wise mean: one average value per gene, taken over the rows (cells).
genes_mean_expression = diseased_df.mean(axis="index")
normal_genes_mean_expression = normal_df.mean(axis="index")

In [105]:
# Invert each Series into a {mean value: gene label} dict.
# Dict keys are unique, so when two genes share exactly the same mean only
# the one that appears later in the Series is kept.
expression_to_gene = dict(
    zip(genes_mean_expression, genes_mean_expression.index)
)
normal_expression_to_gene = dict(
    zip(normal_genes_mean_expression, normal_genes_mean_expression.index)
)

In [113]:
# Rank genes directly on the Series of per-gene means. Looking genes up through
# the inverted {mean: gene} dict silently drops any gene whose mean ties with
# another gene's, and indexing [0..9] fails when fewer than 10 distinct means exist.
# kind="stable" keeps tied genes in their original column order.
ranked_diseased_means = genes_mean_expression.sort_values(ascending=False, kind="stable")

# Descending list of mean values, kept under the original name for any cell
# that still reads it.
sorted_expression = ranked_diseased_means.tolist()

# Print the ten genes with the highest mean (rank is 0-based, as before).
for i, gene in enumerate(ranked_diseased_means.index[:10]):
    print(f"{i} most expressed gene:{gene}")

0 most expressed gene:MALAT1
1 most expressed gene:PLXDC2
2 most expressed gene:DOCK4
3 most expressed gene:LRMDA
4 most expressed gene:FRMD4A
5 most expressed gene:NEAT1
6 most expressed gene:ARHGAP24
7 most expressed gene:MEF2A
8 most expressed gene:ELMO1
9 most expressed gene:ST6GAL1


In [114]:
# Rank genes directly on the Series of per-gene means. Looking genes up through
# the inverted {mean: gene} dict silently drops any gene whose mean ties with
# another gene's, and indexing [0..9] fails when fewer than 10 distinct means exist.
# kind="stable" keeps tied genes in their original column order.
ranked_normal_means = normal_genes_mean_expression.sort_values(ascending=False, kind="stable")

# Descending list of mean values, kept under the original name for any cell
# that still reads it.
normal_sorted_expression = ranked_normal_means.tolist()

# Print the ten genes with the highest mean (rank is 0-based, as before).
for i, gene in enumerate(ranked_normal_means.index[:10]):
    print(f"{i} most expressed gene:{gene}")

0 most expressed gene:MALAT1
1 most expressed gene:PLXDC2
2 most expressed gene:DOCK4
3 most expressed gene:FRMD4A
4 most expressed gene:LRMDA
5 most expressed gene:NEAT1
6 most expressed gene:ARHGAP24
7 most expressed gene:SRGAP2
8 most expressed gene:MEF2A
9 most expressed gene:CHST11
