# SC data
The data was obtained from https://www.weizmann.ac.il/sites/3CA/lung
We used the dataset published in https://www.nature.com/articles/s41388-021-02054-3

This notebook loads the data into AnnData format and saves it.
Please adapt the paths accordingly.

We recommend using the tmp_enact_analysis environment

In [None]:
import numpy as np
import pandas as pd
from scipy import io
import matplotlib.pyplot as plt
import scanpy as sc
import anndata
import tarfile

In [None]:
!curl https://www.dropbox.com/scl/fi/08bjhmr5b7zcr9w3qm37n/Data_Bischoff2021_Lung.tar.gz?rlkey=xlv2vwb5xenzczoo9jiktr1ry&dl=1
!curl https://www.dropbox.com/scl/fi/5phcsqxntjdjjfxypf4l3/Meta-data_Bischoff2021_Lung.tar.gz?rlkey=gmn535v9fqtqaqibvjria1lks&dl=1

In [None]:
# Locations of the two downloaded archives; adapt these to your own setup.
# Archive holding the expression data (count matrix, gene list, cell list).
sc_reference_path = "/srv/GT/analysis/rdegottardi/data/SC_Lung_AC/Data_Bischoff2021_Lung.tar.gz"
# Archive holding the cell metadata table.
# NOTE: the variable name is misspelled ("metedata"); it is kept as-is because
# the metadata-loading cell refers to it by this exact name.
sc_metedata_path = "/srv/GT/analysis/rdegottardi/data/SC_Lung_AC/Meta-data_Bischoff2021_Lung.tar.gz"


In [None]:
# Read the count matrix, the gene list and the cell list directly out of the
# reference archive (nothing is unpacked to disk).
with tarfile.open(sc_reference_path, "r:gz") as archive:
    # Look up all three members first; getmember raises KeyError if one is absent.
    counts_info = archive.getmember("Data_Bischoff2021_Lung/Exp_data_UMIcounts.mtx")
    gene_list_info = archive.getmember("Data_Bischoff2021_Lung/genes.txt")
    cell_list_info = archive.getmember("Data_Bischoff2021_Lung/Cells.csv")

    with archive.extractfile(counts_info) as counts_stream, \
         archive.extractfile(gene_list_info) as gene_stream, \
         archive.extractfile(cell_list_info) as cell_stream:

        # Matrix Market file -> sparse matrix in COO layout
        sparse_counts = io.mmread(counts_stream)
        expr_mtx = sparse_counts.tocoo()

        # One gene name per line; the stream yields bytes, so decode and trim each line
        gene_names = [raw_line.decode().strip() for raw_line in gene_stream]

        # Cell identifiers come from the 'cell_name' column of the cell table
        cell_table = pd.read_csv(cell_stream)
        cell_names = cell_table['cell_name'].tolist()

In [None]:
# Read the cell metadata table directly out of the metadata archive.
with tarfile.open(sc_metedata_path, "r:gz") as meta_archive:
    meta_info = meta_archive.getmember("Meta-data_Bischoff2021_Lung/Cells.csv")
    meta_stream = meta_archive.extractfile(meta_info)
    with meta_stream:
        # The first CSV column is used as the row index of the table
        metadata = pd.read_csv(meta_stream, index_col=0)

In [None]:
# Create the AnnData object from the expression matrix, gene names, cell names
# and metadata.
#
# AnnData pairs the rows of `obs` with the rows of `X` by position only, so the
# metadata has to be in the same order as the cell list that accompanies the
# matrix. When the metadata is indexed by exactly those cell names, it is
# reordered explicitly to guarantee that. Otherwise the original positional
# pairing is kept and a warning is emitted so a possible mismatch is visible.
import warnings

obs_table = metadata
if (
    metadata.index.is_unique
    and len(metadata) == len(cell_names)
    and set(metadata.index) == set(cell_names)
):
    # Same set of cells on both sides: put the metadata rows in matrix order
    obs_table = metadata.loc[cell_names]
else:
    warnings.warn(
        "Metadata index does not match the cell names of the expression "
        "matrix; metadata rows are paired with cells by position."
    )

adata = anndata.AnnData(
    X=expr_mtx.transpose().tocsr(),  # shape: cells x genes
    obs=obs_table,                   # per-cell metadata, one row per cell
    var=pd.DataFrame(index=gene_names),
)

In [None]:
# Save the assembled AnnData object to disk.
adata_output_path = "/srv/GT/analysis/rdegottardi/data/SC_Lung_AC/adata.h5ad"
adata.write(adata_output_path)
# NOTE: the resulting file was later moved to g-store and now lives at
# /srv/gstore/projects/p37785/SC_Lung_AC/adata.h5ad