# Data Ingestion

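This notebook loads the raw count matrix and cell metadata from CSV files, assembles them into an AnnData object for scRNA-seq analysis, computes basic QC metrics, and saves the result as an `.h5ad` file.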



## Import Required Libraries

In [None]:
import os
import gc
import scanpy as sc
import pandas as pd
import numpy as np
from pathlib import Path

## Configure Environment

In [None]:
# Configure Scanpy settings
sc.settings.verbosity = 3  # Show more output by default
sc.settings.set_figure_params(dpi=100, figsize=(8, 8))
# Seed NumPy's global RNG so any NumPy-based randomness is repeatable.
np.random.seed(42)

# Project Configuration and paths
# PROJ_NAME / PROJ_DESCRIPTION are placeholders to fill in per project; they
# are joined with "_" and used as the prefix of the saved .h5ad file name.
PROJ_NAME = ""
PROJ_DESCRIPTION = ""
FULL_PROJ_NAME = f"{PROJ_NAME}_{PROJ_DESCRIPTION}"

PROJECT_DIR = Path("/path/to/project")
# Directory containing metadata.csv and raw_counts.csv. read_and_process_data()
# reads from DATA_DIR, which was previously never defined (NameError).
# NOTE(review): the "data" subfolder is a placeholder like PROJECT_DIR itself;
# point it at wherever the CSV files actually live.
DATA_DIR = PROJECT_DIR / "data"
OUTPUT_DIR = PROJECT_DIR / "output"

## Data Loading Functions

In [None]:
def read_and_process_data(data_dir=None):
    """
    Read the raw count table and the cell metadata and build an AnnData object.

    Two files are read from the data directory: "metadata.csv" (one row per
    cell, with columns Cell, Treatment, Location, Gender and Batch) and
    "raw_counts.csv" (one row per gene, with a Geneid column plus one column
    per cell). The count table is transposed so the result is cells x genes,
    with cells in the order they appear in the metadata.

    Args:
        data_dir: Directory containing the two CSV files. When omitted, the
            module-level DATA_DIR is used.
    Returns:
        AnnData: Object whose obs_names are the cell IDs, var_names the gene
        IDs, and whose .obs holds treatment, location, gender and batch.
    Raises:
        KeyError: If a required column is missing from either file, or a cell
            listed in the metadata has no column in the count table.
    """
    print("Reading data files...")

    if data_dir is None:
        # Fall back to the notebook-level configuration.
        data_dir = DATA_DIR
    data_dir = Path(data_dir)

    # Read metadata and count data
    metadata = pd.read_csv(data_dir / "metadata.csv")
    raw_counts = pd.read_csv(data_dir / "raw_counts.csv")

    # .obs column name -> metadata.csv column name
    obs_columns = {
        'treatment': 'Treatment',
        'location': 'Location',
        'gender': 'Gender',
        'batch': 'Batch',
    }

    # Fail early with a message that names the offending file and columns,
    # instead of an opaque pandas KeyError halfway through construction.
    required_meta = ['Cell', *obs_columns.values()]
    missing_meta = [col for col in required_meta if col not in metadata.columns]
    if missing_meta:
        raise KeyError(
            f"metadata.csv is missing required column(s): {missing_meta}"
        )
    if 'Geneid' not in raw_counts.columns:
        raise KeyError("raw_counts.csv is missing required column: 'Geneid'")

    cells = metadata['Cell'].astype(str).values
    count_columns = set(raw_counts.columns)
    missing_cells = [cell for cell in cells if cell not in count_columns]
    if missing_cells:
        raise KeyError(
            f"{len(missing_cells)} cell(s) listed in metadata.csv have no "
            f"column in raw_counts.csv, e.g. {missing_cells[:5]}"
        )

    gene_names = raw_counts['Geneid'].values
    # raw_counts is genes x cells: pick the cell columns in metadata order and
    # transpose to the cells x genes layout AnnData expects.
    count_matrix = raw_counts[list(cells)].to_numpy().T

    # Create AnnData object
    adata = sc.AnnData(X=count_matrix)
    adata.var_names = gene_names
    adata.obs_names = cells

    # Add metadata (positional: rows of metadata line up with rows of X
    # because the count columns were selected in metadata order).
    for obs_key, meta_col in obs_columns.items():
        adata.obs[obs_key] = metadata[meta_col].values

    return adata

In [None]:
def validate_data(adata):
    """
    Print a short summary of the loaded data and compute basic QC metrics.

    The summary covers the cell and gene counts, the first rows of the
    per-cell metadata, and the top-left 5 x 5 corner of the count matrix.
    QC metrics are then computed with inplace=True, so they are stored on
    the object rather than returned.

    Args:
        adata: AnnData object to summarise and annotate
    Returns:
        AnnData: The same object that was passed in
    """
    # Overall shape
    print("\nDataset dimensions:")
    print(f"Number of cells: {adata.n_obs}")
    print(f"Number of genes: {adata.n_vars}")

    # Per-cell annotations
    print("\nSample metadata:")
    obs_preview = adata.obs.head()
    print(obs_preview)

    # Small corner of the matrix as a sanity check on the values
    print("\nPreview of count matrix (first 5 cells, first 5 genes):")
    matrix_corner = adata.X[:5, :5]
    print(matrix_corner)

    # Calculate basic quality metrics
    qc_options = {"percent_top": None, "log1p": False, "inplace": True}
    sc.pp.calculate_qc_metrics(adata, **qc_options)

    return adata

## Data Loading and Saving

In [None]:
# Driver cell: load the data, summarise it, and save the result as .h5ad.
print(f"Starting data loading for project: {FULL_PROJ_NAME}")

# Clear memory
gc.collect()

try:
    # Read and process data
    adata = read_and_process_data()

    # Validate data
    adata = validate_data(adata)

    # Save the AnnData object
    output_file = OUTPUT_DIR / f"{FULL_PROJ_NAME}_raw.h5ad"
    # Make sure the destination exists so a fresh project tree does not fail
    # at the final write, after all the loading work has been done.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    print(f"\nSaving AnnData object to: {output_file}")
    adata.write(output_file)

    print("Data loading complete!")

except Exception as e:
    # Top-level boundary: report a one-line summary, then re-raise so the
    # full traceback is still shown.
    print(f"An error occurred: {str(e)}")
    raise