<a href="https://colab.research.google.com/github/StarryNight7210/Bioinformatics---scRNA-Sequencing-Analysis/blob/main/Alzheimer's_Project_data_combiner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import necessary modules

In [None]:
%pip install scanpy python-igraph leidenalg

# import packages
import numpy as np
import pandas as pd
import scanpy as sc
from scipy import sparse
import os
import gc

sc.settings.verbosity = 3   # verbosity: errors (0), warnings (1), info (2), hints (3)

# customize resolution and color of your figures
sc.settings.set_figure_params(dpi=80, figsize=(4,4))



Collecting scanpy
  Downloading scanpy-1.11.4-py3-none-any.whl.metadata (9.2 kB)
Collecting python-igraph
  Downloading python_igraph-0.11.9-py3-none-any.whl.metadata (3.1 kB)
Collecting leidenalg
  Downloading leidenalg-0.10.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting anndata>=0.8 (from scanpy)
  Downloading anndata-0.12.2-py3-none-any.whl.metadata (9.6 kB)
Collecting legacy-api-wrap>=1.4.1 (from scanpy)
  Downloading legacy_api_wrap-1.4.1-py3-none-any.whl.metadata (2.1 kB)
Collecting session-info2 (from scanpy)
  Downloading session_info2-0.2.3-py3-none-any.whl.metadata (3.4 kB)
Collecting igraph==0.11.9 (from python-igraph)
  Downloading igraph-0.11.9-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting texttable>=1.6.2 (from igraph==0.11.9->python-igraph)
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting array-api-compat>=1.7.1 (from anndata>=0.8->scanpy)
  Downloading array_ap

Create a folder for the raw data and download the count archive

In [None]:
# (!) Shell commands for creating folder and storing data

# Create a folder ()
!mkdir my_data
# Download count file from web (wget) and save to file named my_counts
!wget -O my_data/my_counts.tar "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE175814&format=file"

--2025-10-15 23:42:39--  https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE175814&format=file
Resolving www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)... 130.14.29.110, 2607:f220:41e:4290::110
Connecting to www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)|130.14.29.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 284180480 (271M) [application/x-tar]
Saving to: ‘my_data/my_counts.tar’


2025-10-15 23:42:47 (37.8 MB/s) - ‘my_data/my_counts.tar’ saved [284180480/284180480]



In [None]:
import subprocess

# Unpack the downloaded archive into the current working directory.
# check=True raises CalledProcessError when tar exits with a non-zero status
# (e.g. missing or truncated archive) instead of failing silently and letting
# a later cell trip over missing files.
# Output is captured rather than printed: result.stdout holds the list of
# extracted files (-v) and result.stderr any tar diagnostics.
result = subprocess.run(
    ["tar", "-xvf", "my_data/my_counts.tar"],
    capture_output=True,
    text=True,
    check=True,
)


Read each patient's extracted matrix, features, and barcodes files and compile them into data frames, one for each patient (rows = cells, columns = genes)

In [None]:
import pandas as pd
from scipy.io import mmread

# Directory the archive was extracted into.
DATA_DIR = "/content"


def load_sample_counts(prefix, data_dir=DATA_DIR):
    """Load one sample's counts as a sparse DataFrame (rows = cells, columns = genes).

    Reads three files from ``data_dir``:
    ``<prefix>_matrix.mtx.gz``, ``<prefix>_features.tsv.gz`` and
    ``<prefix>_barcodes.tsv.gz``.

    Parameters
    ----------
    prefix : str
        File-name prefix shared by the sample's three files,
        e.g. ``"GSM5348374_A1"``.
    data_dir : str
        Directory that contains the files.

    Returns
    -------
    pandas.DataFrame
        Sparse-backed frame indexed by the first column of the barcodes file,
        with column labels taken from the second column of the features file.

    Raises
    ------
    ValueError
        If the number of barcodes or features does not match the matrix shape.
    """
    # The matrix is stored features x barcodes (genes x cells).
    matrix = mmread(f"{data_dir}/{prefix}_matrix.mtx.gz")

    features = pd.read_csv(f"{data_dir}/{prefix}_features.tsv.gz", header=None, sep="\t")
    barcodes = pd.read_csv(f"{data_dir}/{prefix}_barcodes.tsv.gz", header=None, sep="\t")

    # Second features column usually has gene symbols.
    # NOTE(review): symbols are presumably not guaranteed to be unique, so
    # column labels may repeat -- confirm before selecting columns by name.
    genes = features[1].values
    cells = barcodes[0].values

    # Transpose the scipy matrix (cheap, stays sparse) and label it in one
    # step. Transposing the sparse DataFrame instead would have pandas build a
    # dense intermediate array, which is costly for a matrix of this size.
    return pd.DataFrame.sparse.from_spmatrix(
        matrix.T.tocsc(), index=cells, columns=genes
    )


# One frame per patient sample. The raw matrix / feature / barcode tables stay
# local to the helper, so they are released as soon as each frame is built.
counts_df1 = load_sample_counts("GSM5348374_A1")
counts_df2 = load_sample_counts("GSM5348375_A2")
counts_df3 = load_sample_counts("GSM5348376_A3")
counts_df4 = load_sample_counts("GSM5348377_A4")

Assign sources and make combined data frames for patients 1 and 2 and patients 3 and 4

In [None]:
# Tag every row with the patient it came from *before* stacking, so the origin
# of each cell can still be read off the combined frames.
patient_frames = [counts_df1, counts_df2, counts_df3, counts_df4]
for patient_number, patient_frame in enumerate(patient_frames, start=1):
    patient_frame["Source"] = patient_number

# Stack patients 1 + 2 into frame A and patients 3 + 4 into frame B.
# The existing row index is kept (no renumbering).
counts_df_A = pd.concat([counts_df1, counts_df2])
counts_df_B = pd.concat([counts_df3, counts_df4])

Randomly sample half of each of the combined data frames

In [None]:
# Fraction of rows to keep from each combined frame
proportion = 0.5

# Number of rows to draw from each frame (fractional results are truncated)
rows_to_keep_A = int(proportion * len(counts_df_A))
rows_to_keep_B = int(proportion * len(counts_df_B))

# Fixed random_state so the same rows are drawn on every run
sampled_df_A = counts_df_A.sample(n=rows_to_keep_A, random_state=42)
sampled_df_B = counts_df_B.sample(n=rows_to_keep_B, random_state=42)

(6242, 33539)


Combine the two data frames so that we have all 4 patients in one data frame but only half the data, so we don't exceed RAM limits

In [None]:
# Stack the two sampled halves row-wise; the existing row index is kept as-is.
sampled_halves = [sampled_df_A, sampled_df_B]
counts_df = pd.concat(sampled_halves)

Export to Google Drive

In [None]:
from google.colab import drive

# Mount Google Drive into the runtime's file system
drive.mount('/content/drive')

# Save the combined, subsampled frame as a pickle under MyDrive
pickle_path = '/content/drive/MyDrive/counts_df.pkl'
counts_df.to_pickle(pickle_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
