In [None]:
# 1. Mount Google Drive at /content/drive (uses the google.colab helper)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install kneed
!pip install --upgrade scvi-tools
!pip install kneed
!pip install scrublet

In [None]:
!pip install scanpy

In [None]:
# Load the required modules
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
import scvi
from scipy.sparse import csr_matrix
from sklearn.decomposition import PCA
import warnings
from scipy.stats import median_abs_deviation
import os
import pickle as pkl
from kneed import KneeLocator as kl
import scrublet as scr
import os
# Ignore the warning messages
warnings.filterwarnings("ignore")

In [None]:
from matplotlib.pyplot import rc_context

In [None]:
# Folder that holds the per-sample AnnData files
ruta_carpeta = "./adatas"

# Accumulates every AnnData object that loads successfully
adatas = []

# Iterate in sorted order: os.listdir() returns entries in arbitrary order, and the
# order of `adatas` fixes the order of the cells once the samples are concatenated.
for archivo in sorted(os.listdir(ruta_carpeta)):
    ruta_archivo = os.path.join(ruta_carpeta, archivo)
    # Only .h5ad files are accepted: sc.read_h5ad is the only reader used below, so
    # other formats (.loom, .mtx) could never be parsed by it.
    if os.path.isfile(ruta_archivo) and archivo.endswith(".h5ad"):
        try:
            adata = sc.read_h5ad(ruta_archivo)
            # De-duplicate gene and barcode names within the sample
            adata.var_names_make_unique()
            adata.obs_names_make_unique()
            adatas.append(adata)
            print(f"Cargado: {archivo}")
        except Exception as e:
            # Best effort: report the failing file and continue with the others
            print(f"Error al cargar {archivo}: {e}")

# Summary: number of AnnData objects loaded
print(f"Se cargaron {len(adatas)} archivos AnnData.")


In [None]:
# Print the distinct values of the 'Sample' column of each loaded object.
# Note: the loop variable rebinds the notebook-level name `adata`.
for adata in adatas:
    print(adata.obs['Sample'].unique())

In [None]:
adatas

In [None]:
# Gene names (var_names) of every loaded object, one set per object
gene_sets = [set(adata.var_names) for adata in adatas]

# Intersection of all the sets, i.e. the genes present in every object
shared_genes = set.intersection(*gene_sets)

# Report how many genes are shared
print(f"Número de genes compartidos: {len(shared_genes)}")

In [None]:
# Concatenate all samples into a single AnnData object
adata = sc.concat(adatas)
# Barcodes can repeat across samples; later steps match cells by obs name,
# so make the names unique (a no-op when they already are).
adata.obs_names_make_unique()

In [None]:
#adata = adatas[0]

In [None]:
adata

In [None]:
adata_d = adata.copy()

In [None]:
adata_d

In [None]:
# Work on the copy only: filter cells (min_genes=500) and genes (min_cells=3),
# then subset to the 2000 most variable genes before training.
# NOTE(review): flavor='seurat_v3' presumably needs the scikit-misc package, which no install cell adds — confirm.
sc.pp.filter_cells(adata_d, min_genes = 500)
sc.pp.filter_genes(adata_d, min_cells = 3)
sc.pp.highly_variable_genes(adata_d, n_top_genes = 2000, subset = True, flavor = 'seurat_v3')
print(f'Comenzamos con {adata_d.n_obs} celulas')
# Train the scVI model, then the SOLO doublet classifier built from it
# NOTE(review): no random seed is set before training, so results may differ between runs — confirm.
scvi.model.SCVI.setup_anndata(adata_d)
vae = scvi.model.SCVI(adata_d)
vae.train()
solo = scvi.external.SOLO.from_scvi_model(vae)
solo.train()

# Predict doublets: per-class scores first, then the hard label stored in 'prediction'
df = solo.predict()
df['prediction'] = solo.predict(soft = False)
#df.index = df.index.map(lambda x: x[:-2])
display(df)
n_doublets = len(df[df.prediction == 'doublet'])
print(f'Se han detectado {n_doublets} doublets')
# Visualise the predictions on a UMAP of the copy
# This step is optional; it only serves to inspect the results
# (the assignment below aligns on the index, so df's index must match adata_d.obs_names)
adata_d.obs['prediction'] = df.prediction
sc.pp.normalize_total(adata_d, target_sum = 1e4)
sc.pp.log1p(adata_d)
sc.tl.pca(adata_d)
sc.pp.neighbors(adata_d)
sc.tl.umap(adata_d)
sc.tl.leiden(adata_d, resolution = 0.5)

# Plot the Leiden clusters next to the doublet predictions in 4x4 panels
with rc_context({'figure.figsize': (4, 4)}):
    sc.pl.umap(adata_d, color = ['leiden', 'prediction'])


In [None]:
# Create a column with the difference between the doublet and singlet scores
df['dif'] = df.doublet - df.singlet

# Show the distribution of that difference for barcodes predicted to be doublets
sns.displot(df[df.prediction == 'doublet'], x = 'dif')
plt.title("Doublet vs singlet predicted score distribution", fontweight=800)

# Keep only confident doublets: predicted as doublet AND score difference above 0.5
doub_l = df[(df.prediction == 'doublet') & (df.dif > 0.5)].index.tolist()

In [None]:
adata

In [None]:
# Apply to the main object the same thresholds used on the doublet-detection copy
sc.pp.filter_cells(adata, min_genes = 500)
sc.pp.filter_genes(adata, min_cells = 3)

In [None]:
adata

In [None]:
# Flag the barcodes found in the confident-doublet list, then keep only the unflagged cells
adata.obs['doublet'] = adata.obs.index.isin(doub_l)
adata= adata[~adata.obs.doublet].copy()

In [None]:
print(f'Terminamos con {adata.n_obs} celulas')

In [None]:
adata

Guardamos adata tras el filtrado de doublets

In [None]:
adata.write('adata_dr.h5ad')

### QC FILTERING

In [None]:
adata = sc.read_h5ad('adata_dr.h5ad')

In [None]:
adata

In [None]:
def is_outlier(adata, metric: str, nmads: float):
    """Flag observations whose ``metric`` is more than ``nmads`` MADs away from the median.

    Parameters
    ----------
    adata
        Object exposing an ``obs`` table that contains the column ``metric``.
    metric
        Name of the column of ``adata.obs`` to test.
    nmads
        Number of median absolute deviations tolerated on each side of the median.

    Returns
    -------
    Boolean mask aligned with ``adata.obs``; True marks an outlier.
    """
    values = adata.obs[metric]
    # Compute the centre and the tolerance once instead of once per bound
    center = np.median(values)
    tolerance = nmads * median_abs_deviation(values)
    return (values < center - tolerance) | (center + tolerance < values)

In [None]:
# Flag gene families in adata.var; they are passed as qc_vars to the QC-metric computation

# mitochondrial genes: names starting with "MT-"
adata.var["mt"] = adata.var_names.str.startswith("MT-")
# ribosomal genes: names starting with "RPS" or "RPL"
adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes: names starting with "HB" whose next character is not "(", "P" or ")"
# NOTE(review): this pattern also matches non-hemoglobin names such as HBEGF — confirm that is acceptable.
adata.var["hb"] = adata.var_names.str.contains(("^HB[^(P)]"))

In [None]:
# Compute the QC metrics in place, using the mt/ribo/hb flags as qc_vars,
# the top-20-genes percentage (percent_top=[20]) and log1p-transformed variants
sc.pp.calculate_qc_metrics(
    adata, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20], log1p=True
)
adata

In [None]:
# Histogram of the total counts per cell (100 bins, no KDE)
sns.displot(adata.obs["total_counts"], bins=100, kde=False)
# sc.pl.violin(adata, 'total_counts')
# Violin plot of pct_counts_mt, then total_counts vs n_genes_by_counts coloured by pct_counts_mt
sc.pl.violin(adata, "pct_counts_mt")
sc.pl.scatter(adata, "total_counts", "n_genes_by_counts", color="pct_counts_mt")

In [None]:
# A cell is flagged when ANY of the three metrics is more than 5 MADs from its median
adata.obs["outlier"] = (
    is_outlier(adata, "log1p_total_counts", 5)
    | is_outlier(adata, "log1p_n_genes_by_counts", 5)
    | is_outlier(adata, "pct_counts_in_top_20_genes", 5)
)
# Show how many cells fall in each class
adata.obs.outlier.value_counts()

In [None]:
# Mitochondrial outlier: pct_counts_mt more than 3 MADs from the median, or above 40
adata.obs["mt_outlier"] = is_outlier(adata, "pct_counts_mt", 3) | (adata.obs["pct_counts_mt"] > 40)

In [None]:
print(f"Total number of cells: {adata.n_obs}")
# Keep only the cells flagged by neither outlier column
adata = adata[(~adata.obs.outlier) & (~adata.obs.mt_outlier)].copy()

print(f"Number of cells after filtering of low quality cells: {adata.n_obs}")

In [None]:
p1 = sc.pl.scatter(adata, "total_counts", "n_genes_by_counts", color="pct_counts_mt")

CHECKPOINT

In [None]:
# Checkpoint after QC; the file carries the .h5ad extension like the other checkpoints
adata.write('adata_postqc.h5ad')

In [None]:
adata

## NORMALIZACION

In [None]:
# Keep the matrix as it is before normalisation in a 'counts' layer, stored as CSR
adata.layers['counts'] = csr_matrix(adata.X) # Save the non-normalized data in a compressed matrix

# Store the object in .raw before X is normalised
adata.raw = adata # Save the non-normalized counts in the raw attribute
# Normalise every cell to a total of 1e4, then apply log1p
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [None]:
# Flag the 2000 most variable genes (default flavor, run on the log-normalised X)
sc.pp.highly_variable_genes(adata,n_top_genes=2000)

# Plot the variable genes
sc.pl.highly_variable_genes(adata)

# Subset the object to the genes flagged as highly variable
adata = adata[:, adata.var.highly_variable].copy()

adata.layers['hv_counts'] = adata.X.copy() # Copy of X at this point (highly variable genes only) in a new layer

In [None]:
# Regress out the effects of total counts per cell and of the percentage of mitochondrial counts.
# Use at most 30 workers, but never more than the CPUs the machine reports.
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'], n_jobs=min(30, os.cpu_count() or 1))

In [None]:
# Scale the data to unit variance, clipping values at 10.
sc.pp.scale(adata, max_value=10)

# Run principal component analysis (PCA) with the arpack solver to reduce the dimensionality of the data.
sc.tl.pca(adata, svd_solver='arpack')

In [None]:
# Define a function that finds the elbow of the PCA explained-variance curve
def PCA_Elbow_fit(data, max_pcs=50):
    """Pick the number of principal components at the elbow of the explained variance.

    Parameters
    ----------
    data
        Observations x features matrix accepted by ``PCA.fit``.
    max_pcs
        Upper bound for the number of components returned (default 50, the cap
        the original code hard-coded).

    Returns
    -------
    tuple
        ``(pcs_used, new_data, pcs, explained_variance, pc_list)`` where
        ``pcs_used`` is the selected number of components, ``new_data`` is
        ``data`` projected onto that many components, ``pcs`` lists every
        component index (1-based), ``explained_variance`` is the explained
        variance ratio of every component and ``pc_list`` is ``1..pcs_used``.

    Raises
    ------
    ValueError
        If the knee locator finds no elbow in the curve.
    """
    model = PCA().fit(data)
    explained_variance = model.explained_variance_ratio_
    pcs = list(range(1, explained_variance.shape[0] + 1))
    klm = kl(pcs, explained_variance, S=1.0, curve='convex', direction='decreasing')
    if klm.knee is None:
        # Previously this surfaced as an opaque TypeError on `None + 1`
        raise ValueError(
            "No elbow found in the explained-variance curve; choose the number of PCs manually."
        )

    # Apply the cap BEFORE deriving the outputs so that pc_list and new_data
    # agree with the returned component count.
    pcs_used = min(int(klm.knee), max_pcs)
    pc_list = list(range(1, pcs_used + 1))
    new_data = PCA(n_components=pcs_used, svd_solver='arpack').fit_transform(data)

    return pcs_used, new_data, pcs, explained_variance, pc_list

In [None]:
# Wrap adata.X in a DataFrame labelled with the cell and gene names

new_frame = pd.DataFrame(adata.X, index=adata.obs_names, columns=adata.var_names)

# Plain array view of the same values, as input for the elbow search
pandas_data = new_frame.values

# Run the PCA elbow search; only `dim` (number of PCs) is used below
dim,new_matrix,pc_ax,pc_ay,col_labels=PCA_Elbow_fit(pandas_data)
print(dim)
# Neighbour graph restricted to the first `dim` PCs, then UMAP
sc.pp.neighbors(adata, n_pcs = dim) # compute nearest neighbors
sc.tl.umap(adata)

# Plot the UMAP coloured by sample, before any integration
sc.pl.umap(adata, color = ['Sample'], frameon = False,title="NON INTEGRATED SAMPLES")

In [None]:
sc.pl.umap(adata, color = ['Sample'], legend_loc = 'on data', frameon = False,title="NON INTEGRATED SAMPLES")

In [None]:
adata.write('normalized.h5ad')

### INTEGRACION

In [None]:
!pip install harmonypy

In [None]:
file_path = "/content/drive/MyDrive/TFM_DEF/normalized.h5ad"
adata = sc.read_h5ad(file_path)

In [None]:
# Reload the normalised object from the working directory.
# NOTE(review): this replaces the `adata` read from Drive in the previous cell — presumably only one of the two loads is meant to run; confirm.
adata = sc.read_h5ad('normalized.h5ad')

In [None]:
import scanpy.external as sce

In [None]:
sce.pp.harmony_integrate(adata, 'Sample')

In [None]:
adata

In [None]:
# Compute the neighbour graph on the Harmony-corrected embedding ('X_pca_harmony') with the correlation metric
sc.pp.neighbors(adata, use_rep = 'X_pca_harmony', metric="correlation")


In [None]:
!pip install igraph

In [None]:
!pip install leidenalg

In [None]:
# Compute the UMAP (min_dist=0.4) and the Leiden clustering (resolution 0.7);
# the clusters are stored under the default 'leiden' key, which the plot below reads
sc.tl.umap(adata, min_dist=0.4)
sc.tl.leiden(adata, resolution = 0.7) #key_added="leiden_scvi"

print(f"Plotting integrated data UMAP:")

# UMAP coloured by sample, then by Leiden cluster with the labels drawn on the data
sc.pl.umap(adata, color = ['Sample'], frameon = False,title="INTEGRATED SAMPLES")
sc.pl.umap(adata, color = ['leiden'],legend_loc = 'on data', frameon = False,title="LEIDEN CLUSTERING")

In [None]:
adata.write('adata_batch_corrected.h5ad')

In [None]:
output_path = "/content/drive/MyDrive/TFM_DEF/adata_batch_corrected.h5ad"
adata.write(output_path)

## ANNOTATION

In [None]:
adata

In [None]:
# Define a function that gives a new label to the cells of a cluster (for manual annotation)

def relabel_cluster_celltypes(adata,new_label,target_clusters,cluster_key="leiden",cell_type_key="cell_type",inplace=True,new_column_key= None):
    """Assign ``new_label`` to every cell that belongs to one of ``target_clusters``.

    Parameters
    ----------
    adata
        Object exposing an ``obs`` table with the columns ``cluster_key`` and
        ``cell_type_key``.
    new_label
        Label written for the selected cells.
    target_clusters
        Cluster ids to relabel; a single string is treated as one cluster id.
    cluster_key
        Column of ``adata.obs`` that holds the cluster ids.
    cell_type_key
        Column of ``adata.obs`` that holds the current cell-type labels.
    inplace
        If True, overwrite ``cell_type_key``; otherwise write the result to
        ``new_column_key`` and leave ``cell_type_key`` untouched.
    new_column_key
        Destination column when ``inplace`` is False.

    Raises
    ------
    ValueError
        If ``inplace`` is False and ``new_column_key`` is not given.
    """
    if not inplace and new_column_key is None:
        raise ValueError("new_column_key is required when inplace=False")

    # A bare string would otherwise be rejected by isin()
    if isinstance(target_clusters, str):
        target_clusters = [target_clusters]

    values_to_change = adata.obs[cluster_key].isin(target_clusters).to_numpy()

    # Work on an explicit copy: to_numpy() may hand back a view of the column,
    # which would modify cell_type_key even when inplace=False.
    celltypeslist = adata.obs[cell_type_key].to_numpy(dtype=object, copy=True)
    celltypeslist[values_to_change] = new_label

    destination = cell_type_key if inplace else new_column_key
    adata.obs[destination] = celltypeslist

### FINAL ANNOTATION

In [None]:
# Start with every cell labelled 'Unknown'; the relabel calls below overwrite it cluster by cluster
adata.obs['cell_type'] = 'Unknown'

## T cells / NK

In [None]:
sc.pl.umap(adata, color=['CD3D','CD3E', 'GZMA', 'PRF1', 'KLRD1'], vmax=1)

### Clúster Macrófagos

In [None]:
sc.pl.umap(adata, color=['CD68', 'C1QB'])

### Clúster Monocitos

In [None]:
sc.pl.umap(adata, color=['FCN1', 'EREG'])

### Clúster Ductal

In [None]:
sc.pl.umap(adata, color=['CFTR'])

### Mastocitos

In [None]:
sc.pl.umap(adata, color=['CPA3', 'KIT'])

### Fibroblast

In [None]:
sc.pl.umap(adata, color=['COL1A1','ACTA2','COL1A2','PDGFRA', 'PDPN'], vmax = 3)

### Cluster B cells

In [None]:
sc.pl.umap(adata, color=['CD79A', 'MS4A1', 'JCHAIN', 'MZB1', 'CD79B', 'IGHA2', 'CCR10', 'IGHG2'])

### Clúster Endothelial

In [None]:
sc.pl.umap(adata, color=['CDH5', 'VWF', 'PLVAP', 'PECAM1'])

### Endocrine cells

In [None]:
sc.pl.umap(adata, color=['INS', 'GCG'])

### Stellate cells

In [None]:
sc.pl.umap(adata, color=['C11orf96', 'ADIRF', 'SPARCL1', 'RGS5', 'MYH11'], vmax = 3)

### Schwann cells

In [None]:
sc.pl.umap(adata, color=['CRYAB', 'ITGB8', 'GPM6B', 'CDH19', 'S100B'], vmax=1)

### TUMOR CELLS

In [None]:
sc.pl.umap(adata, color=['LAMA3', 'KRT17', 'CEACAM6', 'SPINK1', 'SOX9'], vmax = 2)

In [None]:
# NOTE(review): the cluster ids are hard-coded, so they presumably only hold for the Leiden run they were read from — confirm after any re-clustering.
relabel_cluster_celltypes(adata,new_label="T cells",target_clusters=["0","3","5","11","13","19"])

In [None]:
# NOTE(review): the marker panels in this notebook cover macrophages and monocytes, but none is titled PMNs — confirm this label is the intended one.
relabel_cluster_celltypes(adata,new_label="PMNs",target_clusters=["4", "6", "15", "16"])

In [None]:
relabel_cluster_celltypes(adata,new_label="Mast cells",target_clusters=["7"])

In [None]:
relabel_cluster_celltypes(adata,new_label="Fibroblasts",target_clusters=["1","21"])

In [None]:
relabel_cluster_celltypes(adata,new_label="B cells",target_clusters=["9"])

In [None]:
relabel_cluster_celltypes(adata,new_label="Endothelial cells",target_clusters=["10"])

In [None]:
relabel_cluster_celltypes(adata,new_label="Endocrine cells",target_clusters=["20"])

In [None]:
relabel_cluster_celltypes(adata,new_label="Stellate cells",target_clusters=["12"])

In [None]:
relabel_cluster_celltypes(adata,new_label="Schwann cells",target_clusters=["18"])

In [None]:
relabel_cluster_celltypes(adata,new_label="Tumor cells",target_clusters=["2","8", "14", "17"])

In [None]:
# Drop any stored 'cell_type_colors' entry from adata.uns so that scanpy generates a new palette
# (guarded so the cell also runs when no such entry exists)
if 'cell_type_colors' in adata.uns:
    del adata.uns['cell_type_colors']

In [None]:
adata.write('adata_anotado.h5ad')

In [None]:
adata.obs['cell_type'].unique()

In [None]:
output_path = "/content/drive/MyDrive/TFM_DEF/GSE217845.h5ad"
adata.write(output_path)