# Gene Regulatory Network Analysis: EDA

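This notebook walks through the exploratory steps that precede gene regulatory network (GRN) analysis: loading expression data, quality control, filtering, PCA-based dimensionality reduction, batch processing of multiple tissues, and saving the processed data.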

In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
from src.data_preparation import ExpressionDataPreparer, BatchProcessor

## 1. Data Loading

Load RNA-seq expression data and transcription factor information

In [None]:
# Collect the preparer settings in one place so they are easy to tune, then
# build the preparer from them.
# NOTE(review): the exact meaning of each threshold is defined in
# src.data_preparation and is not visible here — confirm before changing.
preparer_settings = dict(
    mean_tpm_thresh=1.0,
    min_genes=1,
    min_obs=1,
    variance_quantile=0.0,
    random_seed=42,
)
preparer = ExpressionDataPreparer(**preparer_settings)

In [None]:
# Load raw expression data
# The preparer created above reads the file; the result is kept as `adata`
# and is the input to the quality-control and filtering cells that follow.
data_path = "path/to/your/expression_data.h5ad"  # Replace with your data path
adata = preparer.load_expression_data(data_path)
# Print the dimensions of the loaded object as a quick sanity check.
print(f"Raw data shape: {adata.shape}")

## 2. Quality Control

Examine data quality and distribution

In [None]:
# Plot the distribution of every value in the raw expression matrix.
# adata.X can be either a sparse matrix or a dense array depending on how the
# file was written; only sparse matrices provide .toarray(), so densify
# conditionally instead of assuming sparsity.
raw_matrix = adata.X
if hasattr(raw_matrix, "toarray"):
    raw_matrix = raw_matrix.toarray()
raw_values = np.asarray(raw_matrix).ravel()

plt.figure(figsize=(10, 6))
pd.Series(raw_values).hist(bins=100)
plt.title('Raw Expression Distribution')
plt.xlabel('Expression Value')
plt.ylabel('Frequency');  # trailing ';' hides the Text(...) repr in the cell output

In [None]:
# Plot gene detection rate: for each gene, the number of cells it is detected in.
# Compute the QC metrics first. With inplace=True scanpy adds its columns to
# adata.obs / adata.var; the per-gene detection count is stored under
# 'n_cells_by_counts' (this call does not create an 'n_cells' column).
sc.pp.calculate_qc_metrics(adata, inplace=True)

# Create the figure only after the metrics exist, so a failure above does not
# leave an empty figure behind.
plt.figure(figsize=(10, 6))
adata.var['n_cells_by_counts'].hist(bins=50)
plt.title('Gene Detection Distribution')
plt.xlabel('Number of Cells')
plt.ylabel('Number of Genes');  # trailing ';' hides the Text(...) repr in the cell output

## 3. Data Preprocessing

Apply filtering and normalization steps

In [None]:
# Filter data
# Apply the preparer's filtering step to the raw data and bind the result to a
# separate name, `adata_filtered`, which the later plotting, PCA and save
# cells use.
# NOTE(review): whether filter_data copies `adata` or modifies it in place is
# not visible here — confirm in src.data_preparation.
adata_filtered = preparer.filter_data(adata)
# Print the dimensions after filtering for comparison with the raw shape.
print(f"Filtered data shape: {adata_filtered.shape}")

In [None]:
# Plot the distribution of every value in the filtered expression matrix.
# The matrix may be sparse or dense; only sparse matrices provide .toarray(),
# so densify conditionally instead of assuming sparsity.
filtered_matrix = adata_filtered.X
if hasattr(filtered_matrix, "toarray"):
    filtered_matrix = filtered_matrix.toarray()
filtered_values = np.asarray(filtered_matrix).ravel()

plt.figure(figsize=(10, 6))
pd.Series(filtered_values).hist(bins=100)
plt.title('Filtered Expression Distribution')
plt.xlabel('Expression Value')
plt.ylabel('Frequency');  # trailing ';' hides the Text(...) repr in the cell output

## 4. Dimensionality Reduction

Examine data structure using PCA

In [None]:
# Compute PCA
# compute_pca returns two values: an object holding the PCA results (its
# .uns['pca'] entry is read by the plotting cell below) and a component count.
# NOTE(review): presumably n_pcs is the number of components the preparer
# chose to retain; the selection rule lives in src.data_preparation — confirm.
adata_pca, n_pcs = preparer.compute_pca(adata_filtered)
print(f"Optimal number of PCs: {n_pcs}")

In [None]:
# Plot the variance ratio of each principal component and mark the selected cutoff.
variance_ratio = np.asarray(adata_pca.uns['pca']['variance_ratio'])

# Use explicit 1-based PC numbers on the x-axis. Plotting the values alone
# starts the axis at 0, which would put the marker at n_pcs one component to
# the right of the n_pcs-th PC.
# NOTE(review): assumes n_pcs is a count of components (1-based) — confirm
# against preparer.compute_pca.
pc_numbers = np.arange(1, len(variance_ratio) + 1)

plt.figure(figsize=(10, 6))
plt.plot(pc_numbers, variance_ratio)
plt.axvline(n_pcs, color='r', linestyle='--')
plt.title('PCA Variance Ratio')
plt.xlabel('PC')
plt.ylabel('Variance Ratio');  # trailing ';' hides the Text(...) repr in the cell output

## 5. Batch Processing

Process multiple tissue datasets

In [None]:
# Build the batch processor from the preparer configured earlier.
batch_processor = BatchProcessor(preparer)

# Tissues to process and the placeholder directories to read from / write to.
tissues = ['root', 'leaf', 'seed', 'shoot']
tissue_input_dir = "path/to/input"    # Replace with your input directory
tissue_output_dir = "path/to/output"  # Replace with your output directory

# Run the whole batch; the result is iterated below as tissue -> dataset pairs.
processed_dfs = batch_processor.process_multiple_tissues(
    input_dir=tissue_input_dir,
    output_dir=tissue_output_dir,
    tissues=tissues,
)

# Report the shape of each processed dataset.
for tissue, df in processed_dfs.items():
    print(f"\n{tissue} dataset shape: {df.shape}")

## 6. Save Processed Data

Save the processed data for downstream analysis

In [None]:
# Save the filtered data for downstream analysis.
output_path = "path/to/processed_data.h5ad"  # Replace with your output path

# Make sure the destination folder exists first, so that a missing directory
# cannot make the write fail after all the preceding processing.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

adata_filtered.write(output_path)
print(f"Processed data saved to: {output_path}")