In [None]:
# Step 1: download the data files
import os
from urllib import request
from pathlib import Path
import tarfile
                                                                                                            
# Prepare folder
os.makedirs("Data", exist_ok=True)
tar_path = Path("Data/GSE269269_RAW.tar")

# Use HTTPS instead of FTP
url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE269nnn/GSE269269/suppl/GSE269269_RAW.tar"

# Download
print(f"Downloading {url} ...")
request.urlretrieve(url, tar_path)
print("Download finished:", tar_path, f"({tar_path.stat().st_size} bytes)")

# Extract
extract_folder = Path("Data/GSE269269_RAW")
extract_folder.mkdir(parents=True, exist_ok=True)

with tarfile.open(tar_path, "r") as tar:
    tar.extractall(path=extract_folder)

print("✅ Extracted to", extract_folder)
print("Sample files:", sorted(os.listdir(extract_folder))[:6])


Downloading https://ftp.ncbi.nlm.nih.gov/geo/series/GSE269nnn/GSE269269/suppl/GSE269269_RAW.tar ...
Download finished: Data\GSE269269_RAW.tar (585830400 bytes)
✅ Extracted to Data\GSE269269_RAW
Sample files: ['GSM8311198_PR1_barcodes.tsv', 'GSM8311198_PR1_barcodes.tsv.gz', 'GSM8311198_PR1_features.tsv', 'GSM8311198_PR1_features.tsv.gz', 'GSM8311198_PR1_matrix.mtx', 'GSM8311198_PR1_matrix.mtx.gz']


In [57]:
#extracting the .tar file
# Step 2: extract tarball into Data/GSE269269_RAW
import tarfile
from pathlib import Path

tar_path = "Data/GSE269269_RAW.tar"
extract_folder = Path("Data/GSE269269_RAW")
extract_folder.mkdir(parents=True, exist_ok=True)

with tarfile.open(tar_path, "r") as tar:
    tar.extractall(path=extract_folder)

print("✅ Extracted to", extract_folder)
print("Example files:", sorted([p.name for p in extract_folder.glob("*")])[:12])


✅ Extracted to Data\GSE269269_RAW
Example files: ['GSM8311198_PR1_barcodes.tsv', 'GSM8311198_PR1_barcodes.tsv.gz', 'GSM8311198_PR1_features.tsv', 'GSM8311198_PR1_features.tsv.gz', 'GSM8311198_PR1_matrix.mtx', 'GSM8311198_PR1_matrix.mtx.gz', 'GSM8311199_PR2_barcodes.tsv', 'GSM8311199_PR2_barcodes.tsv.gz', 'GSM8311199_PR2_features.tsv', 'GSM8311199_PR2_features.tsv.gz', 'GSM8311199_PR2_matrix.mtx', 'GSM8311199_PR2_matrix.mtx.gz']


In [1]:
# creating adata per samples 
# Step 3: helper functions and loader (do not run before Step 2)
import os
import gzip
import shutil
from pathlib import Path
from collections import defaultdict
import pandas as pd
import scanpy as sc

def gunzip_file(file_path):
    """
    Unzip a .gz file if needed and return path to uncompressed file.
    If already unzipped, returns the same path.
    Accepts Path or str.
    """
    file_path = str(file_path)
    if file_path.endswith(".gz"):
        output_path = file_path[:-3]   # strips ".gz"
        if not os.path.exists(output_path):
            with gzip.open(file_path, "rb") as f_in, open(output_path, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
        return output_path
    return file_path

def create_adata_from_geo_flat(folder, save_per_sample=False, per_sample_dir="Data/per_sample_h5ad"):
    """
    Read all *_barcodes.tsv.gz, *_features.tsv.gz, *_matrix.mtx.gz in `folder`,
    return list of AnnData objects. Each ad.obs['sample'] contains GSM ID (e.g. GSM8311198).
    If save_per_sample=True, saves each sample as Data/per_sample_h5ad/GSMxxxx.h5ad.
    """
    folder = Path(folder)
    files = list(folder.glob("*.gz"))
    if not files:
        raise FileNotFoundError(f"No .gz files found in {folder}")

    # group by prefix "GSMxxxx_short"
    groups = defaultdict(dict)
    for f in files:
        name = f.name
        prefix = "_".join(name.split("_")[:2])  # GSM8311198_PR1
        lname = name.lower()
        if "matrix" in lname and name.endswith(".mtx.gz"):
            groups[prefix]["matrix"] = f
        elif "barcodes" in lname:
            groups[prefix]["barcodes"] = f
        elif "features" in lname or "genes" in lname:
            groups[prefix]["features"] = f

    if save_per_sample:
        Path(per_sample_dir).mkdir(parents=True, exist_ok=True)

    adata_list = []
    for prefix in sorted(groups.keys()):
        paths = groups[prefix]
        if not all(k in paths for k in ("matrix", "barcodes", "features")):
            print(f"Skipping incomplete group: {prefix}")
            continue

        # gunzip files (idempotent)
        mtx_path = gunzip_file(paths["matrix"])
        bar_path = gunzip_file(paths["barcodes"])
        feat_path = gunzip_file(paths["features"])

        # load into AnnData (matrix is genes x cells -> transpose)
        ad = sc.read_mtx(mtx_path).T
        cells = pd.read_csv(bar_path, header=None, sep="\t")
        genes = pd.read_csv(feat_path, header=None, sep="\t")

        # assign names
        ad.obs_names = cells[0].astype(str).tolist()
        ad.var_names = genes[0].astype(str).tolist()
        ad.var["gene_ids"] = genes[0].astype(str).tolist()
        ad.var_names_make_unique()

        # keep GSM id in sample column (e.g. GSM8311198)
        gsm_id = prefix.split("_")[0]
        ad.obs["sample"] = gsm_id

        # optional save per-sample
        if save_per_sample:
            out = Path(per_sample_dir) / f"{gsm_id}.h5ad"
            ad.write(out)
            print("Saved individual sample to", out)

        adata_list.append(ad)
        print(f"Loaded {prefix}: cells={ad.n_obs}, genes={ad.n_vars}")

    return adata_list

# Example usage (do not run this block yet if you haven't extracted):
# adata_list = create_adata_from_geo_flat("Data/GSE269269_RAW", save_per_sample=False)


In [2]:
#merge all samples 
# Step 4: run loader and merge (this keeps GSM IDs in adata.obs['sample'])
extract_folder = "Data/GSE269269_RAW"   # folder created in Step 2
adata_list = create_adata_from_geo_flat(extract_folder, save_per_sample=False)

if not adata_list:
    raise SystemExit("No valid AnnData objects were created. Check files and grouping logic.")

# Preserve GSM IDs (order must match adata_list)
gsm_ids = [a.obs["sample"].iat[0] for a in adata_list]

adata = adata_list[0].concatenate(
    adata_list[1:],
    batch_key="sample",
    batch_categories=gsm_ids,  # <-- ensures adata.obs['sample'] contains GSM IDs (not 0/1/2)
    index_unique= None
)

out_path = "Data/combined_raw.h5ad"
adata.write(out_path)
print("✅ Saved merged AnnData to", out_path)
print("Merged shape:", adata.shape)


Loaded GSM8311198_PR1: cells=7233, genes=33694
Loaded GSM8311199_PR2: cells=9942, genes=33694
Loaded GSM8311200_PR3: cells=13500, genes=33694
Loaded GSM8311201_PR4: cells=12630, genes=33694
Loaded GSM8311202_PR5: cells=11825, genes=33694
Loaded GSM8311203_NPR1: cells=10355, genes=33694
Loaded GSM8311204_NPR2: cells=7611, genes=33694
Loaded GSM8311205_NPR3: cells=3433, genes=33694
Loaded GSM8311206_NPR4: cells=14504, genes=33694
Loaded GSM8311207_NPR5: cells=4588, genes=33694


  adata = adata_list[0].concatenate(
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


✅ Saved merged AnnData to Data/combined_raw.h5ad
Merged shape: (95621, 33694)


In [18]:
adata=sc.read_h5ad('Data/combined_raw.h5ad') 

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


In [19]:
adata


AnnData object with n_obs × n_vars = 95621 × 33694
    obs: 'sample', 'title', 'status', 'submission_date', 'last_update_date', 'type', 'channel_count', 'source_name_ch1', 'organism_ch1', 'taxid_ch1', 'characteristics_ch1.0.tissue', 'characteristics_ch1.1.subject status', 'molecule_ch1', 'extract_protocol_ch1', 'description', 'data_processing', 'platform_id', 'contact_email', 'contact_phone', 'contact_department', 'instrument_model', 'library_selection', 'library_source', 'library_strategy', 'relation', 'supplementary_file_1', 'supplementary_file_2', 'supplementary_file_3', 'series_id'
    var: 'gene_ids', 'gene_symbol', 'ensembl_id', 'feature_id'

In [20]:
adata.obs


Unnamed: 0,sample,title,status,submission_date,last_update_date,type,channel_count,source_name_ch1,organism_ch1,taxid_ch1,...,contact_department,instrument_model,library_selection,library_source,library_strategy,relation,supplementary_file_1,supplementary_file_2,supplementary_file_3,series_id
AAACCCAAGGTTGGAC-1,GSM8311198,PR1,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,9606,...,Department of Cardiology,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269
AAACCCACAGTCCCGA-1,GSM8311198,PR1,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,9606,...,Department of Cardiology,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269
AAACCCACATACGCAT-1,GSM8311198,PR1,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,9606,...,Department of Cardiology,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269
AAACCCACATGCAGCC-1,GSM8311198,PR1,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,9606,...,Department of Cardiology,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269
AAACCCAGTCTTAGTG-1,GSM8311198,PR1,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,9606,...,Department of Cardiology,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGCAACCGACC-1,GSM8311207,NPR5,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,9606,...,Department of Cardiology,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269
TTTGTTGCAACTCGAT-1,GSM8311207,NPR5,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,9606,...,Department of Cardiology,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269
TTTGTTGGTCCAAATC-1,GSM8311207,NPR5,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,9606,...,Department of Cardiology,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269
TTTGTTGTCGACGACC-1,GSM8311207,NPR5,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,9606,...,Department of Cardiology,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269


In [39]:
adata.var

Unnamed: 0,gene_ids
ENSG00000243485,ENSG00000243485
ENSG00000237613,ENSG00000237613
ENSG00000186092,ENSG00000186092
ENSG00000238009,ENSG00000238009
ENSG00000239945,ENSG00000239945
...,...
ENSG00000277856,ENSG00000277856
ENSG00000275063,ENSG00000275063
ENSG00000271254,ENSG00000271254
ENSG00000277475,ENSG00000277475


In [4]:
print(adata.X.max())
print(adata.X.min())


27493.0
0.0


In [5]:
class addgenesymbol:
    
    def __init__(self, adata, organism):
        self.adata = adata
        self.organism = organism
#         self.log = log


    def add_feature_meta(self):
        if (self.organism.lower() == "homo sapiens"):
            feature_anno = pd.read_csv('hs_107.txt', sep='\t',dtype='str')
        elif (self.organism.lower() ==  "mus musculus"):
            feature_anno = pd.read_csv('mm_107.txt', sep='\t',dtype='str')
        elif (self.organism.lower() ==  "rattus norvegicus"):
            feature_anno = pd.read_csv('rn_107.txt', sep='\t',dtype='str')
        else:
            print("Data does not belong to selected organisms")


        for column in feature_anno.columns:
            if any(self.adata.var.index.isin(feature_anno[column])):
                index_col = column
                break
        feature_anno_corrected = pd.DataFrame()
        feature_anno_corrected.index=feature_anno[column].unique()
        for column in feature_anno.columns:
            if column != index_col:
                tmp_df=pd.DataFrame(feature_anno[column].groupby(feature_anno[index_col]).unique())
                feature_anno_corrected=pd.merge(feature_anno_corrected,tmp_df,how='left',left_index=True, right_index=True)              
            else:
                print("index col")



        adata_meta=self.adata.var

        adata_meta=pd.merge(adata_meta, feature_anno_corrected,left_index=True, right_index=True, how="left")
        
        self.adata.var=adata_meta
        
        
        cols_list = ["ensembl_gene_id", "hgnc_id", "hgnc_symbol", "mgi_id",	"mgi_symbol", "entrezgene_id", "gene_symbol"]
        
        for col_item in cols_list:
            if col_item in self.adata.var.columns:
                if not self.adata.var[col_item].str.contains('[', regex=False).all():
                    self.adata.var[col_item]= self.adata.var[col_item].apply(lambda x: f'["{x}"]' if ('[' not in str(x)) else str(x))
        
        if index_col == "ensembl_gene_id":
            self.adata.var['gene'] = self.adata.var['gene_symbol'].str.replace(r"['\[\]]", '', regex=True)
            self.adata.var['gene'] = self.adata.var['gene'].replace('nan', np.nan)
            self.adata.var['gene'].fillna(self.adata.var.index.to_series(), inplace=True)
            self.adata.var['ensembl_id'] = self.adata.var.index
            self.adata.var.set_index('gene', inplace=True)
            self.adata.var.index.name = 'gene'


        if index_col == "gene_symbol":
            self.adata.var['gene_symbol'] = self.adata.var.index.where(self.adata.var['ensembl_gene_id'] != '["nan"]')

        self.adata.var['feature_id'] = self.adata.var.index.copy()
        return self.adata

In [6]:
import numpy as np
organism = 'homo sapiens'
obj_feat = addgenesymbol(adata, organism)
adata = obj_feat.add_feature_meta()

index col


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.adata.var['gene'].fillna(self.adata.var.index.to_series(), inplace=True)


In [7]:
adata.var

Unnamed: 0_level_0,gene_ids,gene_symbol,ensembl_id,feature_id
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MIR1302-2HG,ENSG00000243485,['MIR1302-2HG'],ENSG00000243485,MIR1302-2HG
FAM138A,ENSG00000237613,['FAM138A'],ENSG00000237613,FAM138A
OR4F5,ENSG00000186092,['OR4F5'],ENSG00000186092,OR4F5
ENSG00000238009,ENSG00000238009,[nan],ENSG00000238009,ENSG00000238009
ENSG00000239945,ENSG00000239945,[nan],ENSG00000239945,ENSG00000239945
...,...,...,...,...
ENSG00000277856,ENSG00000277856,[nan],ENSG00000277856,ENSG00000277856
ENSG00000275063,ENSG00000275063,[nan],ENSG00000275063,ENSG00000275063
ENSG00000271254,ENSG00000271254,[nan],ENSG00000271254,ENSG00000271254
ENSG00000277475,ENSG00000277475,[nan],ENSG00000277475,ENSG00000277475


In [8]:
%pip install GEOparse --quiet

Note: you may need to restart the kernel to use updated packages.


In [9]:
import GEOparse
obj=GEOparse.get_GEO(geo="GSE269269")
obj


16-Aug-2025 10:06:58 DEBUG utils - Directory ./ already exists. Skipping.
16-Aug-2025 10:06:58 INFO GEOparse - File already exist: using local version.
16-Aug-2025 10:06:58 INFO GEOparse - Parsing ./GSE269269_family.soft.gz: 
16-Aug-2025 10:06:58 DEBUG GEOparse - DATABASE: GeoMiame
16-Aug-2025 10:06:58 DEBUG GEOparse - SERIES: GSE269269
16-Aug-2025 10:06:58 DEBUG GEOparse - PLATFORM: GPL24676
16-Aug-2025 10:06:58 DEBUG GEOparse - SAMPLE: GSM8311198
16-Aug-2025 10:06:58 DEBUG GEOparse - SAMPLE: GSM8311199
16-Aug-2025 10:06:58 DEBUG GEOparse - SAMPLE: GSM8311200
16-Aug-2025 10:06:58 DEBUG GEOparse - SAMPLE: GSM8311201
16-Aug-2025 10:06:58 DEBUG GEOparse - SAMPLE: GSM8311202
16-Aug-2025 10:06:58 DEBUG GEOparse - SAMPLE: GSM8311203
16-Aug-2025 10:06:58 DEBUG GEOparse - SAMPLE: GSM8311204
16-Aug-2025 10:06:58 DEBUG GEOparse - SAMPLE: GSM8311205
16-Aug-2025 10:06:58 DEBUG GEOparse - SAMPLE: GSM8311206
16-Aug-2025 10:06:58 DEBUG GEOparse - SAMPLE: GSM8311207


<SERIES: GSE269269 - 10 SAMPLES, 1 d(s)>

In [10]:
meta=obj.phenotype_data
meta.head()

Unnamed: 0,title,geo_accession,status,submission_date,last_update_date,type,channel_count,source_name_ch1,organism_ch1,taxid_ch1,...,instrument_model,library_selection,library_source,library_strategy,relation,supplementary_file_1,supplementary_file_2,supplementary_file_3,series_id,data_row_count
GSM8311198,PR1,GSM8311198,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,9606,...,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269,0
GSM8311199,PR2,GSM8311199,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,9606,...,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269,0
GSM8311200,PR3,GSM8311200,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,9606,...,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269,0
GSM8311201,PR4,GSM8311201,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,9606,...,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269,0
GSM8311202,PR5,GSM8311202,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,9606,...,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269,0


In [11]:
meta.rename(columns={'geo_accession':'sample'}, inplace=True)

In [12]:
adata.obs = adata.obs.reset_index().merge(meta, how='left', on='sample')
adata.obs

Unnamed: 0,index,sample,title,status,submission_date,last_update_date,type,channel_count,source_name_ch1,organism_ch1,...,instrument_model,library_selection,library_source,library_strategy,relation,supplementary_file_1,supplementary_file_2,supplementary_file_3,series_id,data_row_count
0,AAACCCAAGGTTGGAC-1,GSM8311198,PR1,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,...,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269,0
1,AAACCCACAGTCCCGA-1,GSM8311198,PR1,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,...,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269,0
2,AAACCCACATACGCAT-1,GSM8311198,PR1,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,...,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269,0
3,AAACCCACATGCAGCC-1,GSM8311198,PR1,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,...,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269,0
4,AAACCCAGTCTTAGTG-1,GSM8311198,PR1,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,...,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95616,TTTGTTGCAACCGACC-1,GSM8311207,NPR5,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,...,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269,0
95617,TTTGTTGCAACTCGAT-1,GSM8311207,NPR5,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,...,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269,0
95618,TTTGTTGGTCCAAATC-1,GSM8311207,NPR5,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,...,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269,0
95619,TTTGTTGTCGACGACC-1,GSM8311207,NPR5,Public on Jun 12 2024,Jun 06 2024,Jun 12 2024,SRA,1,PBMC,Homo sapiens,...,Illumina NovaSeq 6000,cDNA,transcriptomic single cell,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8311...,GSE269269,0


In [13]:
adata.obs.drop(['contact_name','contact_institute','contact_address',
        'contact_city','contact_zip/postal_code','contact_country'], inplace = True, axis = 1)

In [14]:
adata.obs.drop(['data_row_count'], inplace = True, axis = 1)

In [15]:
adata.obs.set_index('index',inplace=True)
adata.obs.index.name=None

In [16]:
adata.var

Unnamed: 0_level_0,gene_ids,gene_symbol,ensembl_id,feature_id
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MIR1302-2HG,ENSG00000243485,['MIR1302-2HG'],ENSG00000243485,MIR1302-2HG
FAM138A,ENSG00000237613,['FAM138A'],ENSG00000237613,FAM138A
OR4F5,ENSG00000186092,['OR4F5'],ENSG00000186092,OR4F5
ENSG00000238009,ENSG00000238009,[nan],ENSG00000238009,ENSG00000238009
ENSG00000239945,ENSG00000239945,[nan],ENSG00000239945,ENSG00000239945
...,...,...,...,...
ENSG00000277856,ENSG00000277856,[nan],ENSG00000277856,ENSG00000277856
ENSG00000275063,ENSG00000275063,[nan],ENSG00000275063,ENSG00000275063
ENSG00000271254,ENSG00000271254,[nan],ENSG00000271254,ENSG00000271254
ENSG00000277475,ENSG00000277475,[nan],ENSG00000277475,ENSG00000277475


In [17]:
adata.write('Data/combined_raw.h5ad',compression='lzf')