In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scvi
import seaborn as sns
import torch
from matplotlib.pyplot import rc_context
from scipy import stats
from scipy.sparse import csr_matrix
from scipy.stats import median_abs_deviation as mad

from qc_doubletremoval import qc_doubletremoval
# Select PyTorch's 'medium' float32 matmul precision setting (see the PyTorch docs
# for the exact precision/speed trade-off).
# NOTE(review): presumably set for the scvi model run during doublet removal — confirm.
torch.set_float32_matmul_precision('medium')

In [None]:
# Run configuration: everything a reader might want to tune lives in this cell.

# Input location and format, forwarded to qc_doubletremoval for every entry.
folder_path = "raw_files/"
file_type = "mtx"

# NOTE(review): later cells annotate 'Patient', 'Sex' and 'Cause' per sample —
# confirm that "mouse" is the intended organism for this dataset.
organism = "mouse"

# Used only as the prefix of the combined output file name.
# NOTE(review): verify that the GSM sample IDs used below belong to this series.
accession_code = "GSE225381"

In [None]:
# Sample IDs per experimental group; these drive the 'Group' and 'Cause'
# annotations added to adata.obs further down.
cirrhotic_samples = [
    "GSM4041162",
    "GSM4041163",
    "GSM4041165",
    "GSM4041167",
]
healthy_samples = [
    "GSM4041151",
    "GSM4041152",
    "GSM4041154",
    "GSM4041156",
    "GSM4041157",
    "GSM4041159",
]

In [None]:
# Run QC + doublet removal on every entry of `folder_path` and collect the results.
out = []

# os.listdir returns entries in arbitrary order; sorting makes the processing
# (and therefore the later concatenation) order reproducible across machines.
for file in sorted(os.listdir(folder_path)):
    # Hidden entries (.DS_Store, .ipynb_checkpoints, ...) are not samples.
    if file.startswith('.'):
        continue

    file_path = os.path.join(folder_path, file)

    # Call the qc_doubletremoval function for each file
    processed_data = qc_doubletremoval(file_path, file_type, organism, sc, scvi, pd, np, mad, sns)

    # Tag every cell with its sample: the entry name minus its last extension.
    processed_data.obs['Sample'] = os.path.splitext(file)[0]

    # Append the processed data to the list
    out.append(processed_data)

In [None]:
# Stack all per-sample objects along the cell axis. join="inner" is the default,
# spelled out here because it keeps only the variables shared by every sample.
adata = sc.concat(out, join="inner")

# Cell names can repeat across samples; make them unique in the merged object.
adata.obs_names_make_unique()

# Release the per-sample list. Note: re-running this cell then requires
# re-running the QC loop above it first.
del out

In [None]:
# Store the expression matrix as a SciPy CSR sparse matrix
# (csr_matrix is imported with the other dependencies in the first cell).
adata.X = csr_matrix(adata.X)

# Display the AnnData summary as the cell output.
adata

In [None]:
# Add features: annotate every cell with sample-level metadata.
# Each obs column is described by a placeholder plus a {sample -> label} lookup;
# cells whose sample is absent from a lookup keep the placeholder value.

group_by_sample = {
    **dict.fromkeys(cirrhotic_samples, 'cirrhotic'),
    # Listed last so that 'healthy' wins if a sample ever appears in both lists.
    **dict.fromkeys(healthy_samples, 'healthy'),
}

patient_by_sample = {
    "GSM4041151": 'H1', "GSM4041152": 'H1',
    "GSM4041154": 'H2',
    "GSM4041156": 'H3', "GSM4041157": 'H3',
    "GSM4041159": 'H4',
    "GSM4041162": 'C1', "GSM4041163": 'C1',
    "GSM4041165": 'C2',
    "GSM4041167": 'C3',
}

sex_by_sample = {
    **dict.fromkeys(["GSM4041151", "GSM4041152", "GSM4041154", "GSM4041156",
                     "GSM4041157", "GSM4041165", "GSM4041167"], 'male'),
    **dict.fromkeys(["GSM4041159", "GSM4041162", "GSM4041163"], 'female'),
}

cause_by_sample = {
    **dict.fromkeys(healthy_samples, 'healthy'),
    "GSM4041162": 'nafld', "GSM4041163": 'nafld',
    "GSM4041165": 'alcohol', "GSM4041167": 'alcohol',
}

# obs column -> (placeholder for unmapped samples, lookup table)
obs_annotations = {
    'Group': ('condition', group_by_sample),
    'Patient': ('patientinfo', patient_by_sample),
    'Sex': ('sexinfo', sex_by_sample),
    'Cause': ('causeinfo', cause_by_sample),
}

sample_ids = adata.obs['Sample'].astype(str)
for column, (placeholder, lookup) in obs_annotations.items():
    adata.obs[column] = sample_ids.map(lookup).fillna(placeholder)

    # Surface samples that silently fell back to the placeholder (e.g. because a
    # file name does not match the GSM IDs above) instead of hiding them.
    unmapped = sorted(set(sample_ids) - set(lookup))
    if unmapped:
        warnings.warn(
            f"{column}: no annotation for samples {unmapped}; left as '{placeholder}'"
        )

In [None]:
# Sanity check: non-null entries of every obs column, per sample.
adata.obs.groupby('Sample').count()

In [None]:
# Sanity check: non-null entries of every obs column, per group.
# A 'condition' row here means some samples were not matched to either sample list.
adata.obs.groupby('Group').count()

In [None]:
# Sanity check: non-null entries of every obs column, per patient.
# A 'patientinfo' row here means some samples have no patient assignment.
adata.obs.groupby('Patient').count()

In [None]:
# Drop the excluded samples in a single pass.
# NOTE(review): the reason for excluding these samples is not recorded here. Per the
# 'Patient' annotation, GSM4041151 and GSM4041152 are both H1, so this removes
# patient H1 entirely — confirm that is intended.
excluded_samples = ['GSM4041152', 'GSM4041151', 'GSM4041157']

# One boolean mask instead of three chained view-of-view subsets; .copy() turns the
# resulting view into a standalone AnnData so later steps do not operate on a view.
adata = adata[~adata.obs['Sample'].isin(excluded_samples)].copy()

# Re-check the per-group counts after the exclusion.
adata.obs.groupby('Group').count()

In [None]:
# Persist the combined, annotated object next to the notebook as
# "<accession_code>_combined.h5ad".
adata.write_h5ad(f"{accession_code}_combined.h5ad")