In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import os
import sys
from scipy import sparse
sc.settings.verbosity = 1  # verbosity: errors (0), warnings (1), info (2), hints (3)
sys.executable   
     

'/home/icb/shahana.dilruba/miniforge3/envs/pertpy/bin/python'

In [2]:
sc.set_figure_params(dpi=100, frameon=False, facecolor=None, transparent=True)

In [3]:
#Load Anndata
adata = sc.read('celltypist_model_chosen_lungatlas_data.h5ad')



In [26]:
adata

AnnData object with n_obs × n_vars = 254630 × 18115
    obs: 'identifier', 'patient', 'name', 'barcode_round', 'sample', 'viral_counts', 'treatment', 'treatment_virus', 'infected', 'n_counts', 'n_genes', 'percent_mito', 'percent_viral', 'leiden', 'final_bcs', 'batch', 'celltype_coarse', 'cell_type', 'infection_label', '_scvi_batch', '_scvi_labels', 'leiden_coarse', 'leiden_fine', 'leiden_subset', 'celltype_fine', 'celltype_approx', 'predicted_labels', 'over_clustering', 'majority_voting', 'conf_score'
    var: 'used_for_scvi'
    uns: '_scvi', 'cell_type_colors', 'celltype_approx_colors', 'celltype_coarse_colors', 'celltype_fine_colors', 'dendrogram_leiden_fine', 'hvg', 'infected_colors', 'leiden', 'leiden_coarse_colors', 'leiden_colors', 'leiden_fine_colors', 'majority_voting_colors', 'neighbors', 'patient_colors', 'predicted_labels_colors', 'rank_genes_groups', 'treatment_colors', 'umap'
    obsm: 'X_pca', 'X_scVI', 'X_umap'
    layers: 'counts'
    obsp: 'connectivities', 'distances

In [37]:
adata.obs.identifier

muc21058_AAACCCAAGTTTGAGA    muc21058
muc21058_AAACCCACACGCGCAT    muc21058
muc21058_AAACCCACAGACACAG    muc21058
muc21058_AAACCCAGTATTGCCA    muc21058
muc21058_AAACCCAGTCGTATTG    muc21058
                               ...   
muc22968_TTTGTTGCATCACCAA    muc22968
muc22968_TTTGTTGGTTCCGTTC    muc22968
muc22968_TTTGTTGTCCACACCT    muc22968
muc22968_TTTGTTGTCTAACGGT    muc22968
muc22968_TTTGTTGTCTTGGAAC    muc22968
Name: identifier, Length: 254630, dtype: category
Categories (12, object): ['muc21058', 'muc21059', 'muc21060', 'muc21061', ..., 'muc22965', 'muc22966', 'muc22967', 'muc22968']

In [38]:
sample_names = adata.obs.identifier

In [39]:
# Create a DataFrame with sample names
df = pd.DataFrame({'SampleName': sample_names})

In [40]:
# Define the output CSV file path
output_csv_file = 'sample_names.csv'

In [41]:
# Write the DataFrame to a CSV file
df.to_csv(output_csv_file, index=False)

In [4]:
#Generate metadata
adata.obs['cell_type'].values.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
?,7248,0.028465
PNEC,48,0.000189
basal,53652,0.210706
basal prolif.,7048,0.027679
ciliated,9524,0.037403
club,67723,0.265966
deuterosomal,1611,0.006327
goblet,2603,0.010223
hillock,21544,0.084609
ionocytes,733,0.002879


In [5]:
df_meta = pd.DataFrame(data={'Cell':list(adata.obs.index),
                             'cell_type':[ i for i in adata.obs['cell_type']]
                            })
df_meta.set_index('Cell', inplace=True)
df_meta.to_csv('celltypist_lungatlas_model.tsv', sep = '\t')

In [6]:
#Compute DEGs (optional)
# Convert to dense matrix for Seurat
adata.X = adata.X.toarray()

In [7]:
import rpy2.rinterface_lib.callbacks
import logging
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
import anndata2ri
anndata2ri.activate()
%load_ext rpy2.ipython

  anndata2ri.activate()


In [9]:
from scipy import io

In [11]:
adata

AnnData object with n_obs × n_vars = 254630 × 18115
    obs: 'identifier', 'patient', 'name', 'barcode_round', 'sample', 'viral_counts', 'treatment', 'treatment_virus', 'infected', 'n_counts', 'n_genes', 'percent_mito', 'percent_viral', 'leiden', 'final_bcs', 'batch', 'celltype_coarse', 'cell_type', 'infection_label', '_scvi_batch', '_scvi_labels', 'leiden_coarse', 'leiden_fine', 'leiden_subset', 'celltype_fine', 'celltype_approx', 'predicted_labels', 'over_clustering', 'majority_voting', 'conf_score'
    var: 'used_for_scvi'
    uns: '_scvi', 'cell_type_colors', 'celltype_approx_colors', 'celltype_coarse_colors', 'celltype_fine_colors', 'dendrogram_leiden_fine', 'hvg', 'infected_colors', 'leiden', 'leiden_coarse_colors', 'leiden_colors', 'leiden_fine_colors', 'majority_voting_colors', 'neighbors', 'patient_colors', 'predicted_labels_colors', 'rank_genes_groups', 'treatment_colors', 'umap'
    obsm: 'X_pca', 'X_scVI', 'X_umap'
    layers: 'counts'
    obsp: 'connectivities', 'distances

In [13]:
!mkdir matrix_files

In [15]:
with open('matrix_files/barcodes.tsv', 'w') as f:
    for item in adata.obs_names:
        f.write(item + '\n')

In [16]:
with open('matrix_files/features.tsv', 'w') as f:
    for item in ['\t'.join([x,x,'Gene Expression']) for x in adata.var_names]:
        f.write(item + '\n')

In [18]:
io.mmwrite('matrix_files/matrix', adata.X.T)

In [20]:
!ls matrix_files/

barcodes.tsv  features.tsv  matrix


In [21]:
!gzip matrix_files/*

In [15]:
!ls matrix_files/

barcodes.tsv  barcodes.tsv.gz  features.tsv  features.tsv.gz  matrix  matrix.gz


In [22]:
adata.obs.to_csv('metadata.csv')

In [24]:
adata.var_names

Index(['SAMD11', 'NOC2L', 'KLHL17', 'PLEKHN1', 'PERM1', 'HES4', 'ISG15',
       'AGRN', 'RNF223', 'C1orf159',
       ...
       'S-minus', 'ORF3a-minus', 'E-minus', 'M-minus', 'ORF6-minus',
       'ORF7a-minus', 'ORF7b-minus', 'ORF8-minus', 'N-minus', 'ORF10-minus'],
      dtype='object', length=18115)

In [25]:
import csv

# Paths to the CSV and TSV files
csv_file_path = '/home/icb/shahana.dilruba/metadata.csv'
tsv_file_path = '/home/icb/shahana.dilruba/metadata.tsv'

# Open the CSV file for reading and TSV file for writing
with open(csv_file_path, 'r', newline='') as csv_file, open(tsv_file_path, 'w', newline='') as tsv_file:
    csv_reader = csv.reader(csv_file)
    tsv_writer = csv.writer(tsv_file, delimiter='\t')

    # Iterate over each row in the CSV file and write it to the TSV file
    for row in csv_reader:
        tsv_writer.writerow(row)
