In [1]:
import os
import warnings

import anndata as ad
import harmonypy as hm
import numpy as np
import pandas as pd
import scanpy as sc

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
#import seaborn as sns

# --- Notebook-wide constants -------------------------------------------------
DPI = 300        # resolution used when figures are saved to disk
FONTSIZE = 20    # 42

random_state = 7

# --- scanpy logging / figure defaults ----------------------------------------
# verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3
sc.logging.print_header()
sc.settings.set_figure_params(
    scanpy=True,
    dpi=80,
    dpi_save=DPI,
    transparent=True,
    vector_friendly=True,
)

# --- matplotlib PDF export ---------------------------------------------------
from matplotlib import rcParams
# Font type 42 (TrueType) rather than matplotlib's default for PDF output.
rcParams.update({'pdf.fonttype': 42})



In [2]:
# %%
# Directory holding the unpacked GEO files for this dataset.
GSE123813_DIR = "/data/BCI-SingleCell/SCC_Atlas/Sam_Nicholls/Unzipped files/GSE123813"
# Cell identifiers are reduced to their trailing nucleotide barcode
# (a run of at least 10 A/C/G/T characters at the end of the name).
BARCODE_PATTERN = r'([ACGT]{10,})$'

# Read the count table (scanpy caches it after the first read) and transpose it,
# so that the names matched against the metadata rows below become `.obs_names`.
GSE123813 = sc.read(GSE123813_DIR + "/scc_scRNA_counts.txt", cache=True)
GSE123813 = GSE123813.transpose()

# %%
metadata_GSE123813 = pd.read_csv(GSE123813_DIR + "/scc_metadata.txt", sep='\t', index_col=0)

# %%
# Explicit copy: pd.DataFrame(df) does not copy its input by default, so adding the
# 'Barcode' column below could otherwise leak into the raw `metadata_GSE123813` table.
metadata = metadata_GSE123813.copy()
# Strip whitespace and force to string
cleaned_metadata_index = metadata_GSE123813.index.astype(str).str.strip()
# Extract final ACGT blocks
metadata_suffixes = cleaned_metadata_index.str.extract(BARCODE_PATTERN)[0]
metadata['Barcode'] = list(metadata_suffixes)

# %%
cleaned_obs_names = pd.Series(GSE123813.obs_names.astype(str)).str.strip()
obs_suffixes = cleaned_obs_names.str.extract(BARCODE_PATTERN)[0]

# Names without a trailing barcode would otherwise turn into NaN observation names;
# keep the cleaned original name for those and say so.
missing_barcode = obs_suffixes.isna()
if missing_barcode.any():
    warnings.warn(
        f"{int(missing_barcode.sum())} observation name(s) do not end in a barcode "
        f"matching {BARCODE_PATTERN!r}; keeping their original names."
    )
    obs_suffixes = obs_suffixes.where(~missing_barcode, cleaned_obs_names)

# Barcodes alone are not guaranteed to be unique once the prefix is removed.
n_repeated_barcodes = int(obs_suffixes.duplicated().sum())
if n_repeated_barcodes:
    warnings.warn(
        f"{n_repeated_barcodes} observation name(s) repeat an earlier barcode after the "
        "prefix is stripped; obs_names are not unique "
        "(consider GSE123813.obs_names_make_unique())."
    )

# Decide how to line metadata rows up with cells *before* the names are shortened.
# If every cell's full identifier is present in the metadata (and those identifiers
# are unique), align on the full identifier: this stays correct even when two cells
# share a barcode. Otherwise fall back to matching on the barcode alone.
obs_ids = pd.Index(cleaned_obs_names)
same_id_scheme = bool(
    cleaned_metadata_index.is_unique
    and obs_ids.isin(cleaned_metadata_index).all()
)

GSE123813.obs_names = list(obs_suffixes)

if same_id_scheme:
    metadata = metadata.drop(columns='Barcode')
    metadata.index = cleaned_metadata_index
    metadata = metadata.reindex(obs_ids)
else:
    ambiguous = metadata['Barcode'].notna() & metadata['Barcode'].duplicated(keep=False)
    if ambiguous.any():
        warnings.warn(
            f"{int(ambiguous.sum())} metadata row(s) share a barcode with another row; "
            "only the first row per barcode is used, so cells with a shared barcode "
            "may receive another cell's annotations."
        )
    #Filter metadata to one row per barcode
    metadata = metadata.drop_duplicates(subset='Barcode', keep='first')
    #Set it as the index
    metadata = metadata.set_index('Barcode')
    metadata = metadata.reindex(list(obs_suffixes))

# Metadata rows are now in the same order as the cells, so they can share the
# observation index; this keeps the column-wise concat purely positional even
# when barcodes repeat.
metadata.index = GSE123813.obs.index

# Merge metadata into adata.obs
GSE123813.obs = pd.concat([GSE123813.obs, metadata], axis=1)

... reading from cache file cache/data-BCI-SingleCell-SCC_Atlas-Sam_Nicholls-Unzipped files-GSE123813-scc_scRNA_counts.h5ad


In [3]:
# Label every cell in this dataset with the same condition.
GSE123813.obs = GSE123813.obs.assign(Condition='Tumor')

In [10]:
# Inspect the per-cell annotation table (pandas truncates the display of long frames).
# NOTE(review): this cell's saved execution count (10) is later than that of the cells
# that follow it, so the stored output reflects the state after those cells had run.
GSE123813.obs

Unnamed: 0,Patient,treatment,cluster,UMAP1,UMAP2,Condition
AAACCTGCATGTTCCC,su010,pre,CD8_naive,-5.055816,3.093178,Tumor
AAACGGGAGGACAGAA,su010,pre,CD8_naive,-4.361540,2.970602,Tumor
AAACGGGCAAGTCTAC,su010,pre,Th17,2.955230,-4.796993,Tumor
AAACGGGGTAGTACCT,su010,pre,CD8_naive,-4.597772,2.647269,Tumor
AAACGGGGTGATAAAC,su010,pre,CD8_naive,-3.909451,3.647786,Tumor
...,...,...,...,...,...,...
TTTACTGTCACTCCTG,su014,post,CD8_eff,-5.676641,-0.858432,Tumor
TTTCCTCCATCGGAAG,su014,post,Th17,1.642686,-5.324915,Tumor
TTTGCGCTCAAACCGT,su014,post,Th17,2.333773,-5.540516,Tumor
TTTGGTTTCTTTACGT,su014,post,CD8_mem,-5.674574,-0.213086,Tumor


In [7]:
# Capitalise the patient column name ("patient" -> "Patient"); a no-op if the
# lower-case column is absent.
GSE123813.obs = GSE123813.obs.rename(columns={"patient": "Patient"})

In [8]:
# Check the patient label attached to each cell.
GSE123813.obs['Patient']

AAACCTGCATGTTCCC    su010
AAACGGGAGGACAGAA    su010
AAACGGGCAAGTCTAC    su010
AAACGGGGTAGTACCT    su010
AAACGGGGTGATAAAC    su010
                    ...  
TTTACTGTCACTCCTG    su014
TTTCCTCCATCGGAAG    su014
TTTGCGCTCAAACCGT    su014
TTTGGTTTCTTTACGT    su014
TTTGTCATCATGCAAC    su014
Name: Patient, Length: 26016, dtype: category
Categories (4, object): ['su010', 'su011', 'su013', 'su014']

In [9]:
# Save the annotated object. The output directory is created first so that this
# cell does not depend on "Anndata/" already existing in the working directory.
OUTPUT_H5AD = "Anndata/GSE123813.h5ad"
os.makedirs(os.path.dirname(OUTPUT_H5AD), exist_ok=True)
GSE123813.write_h5ad(OUTPUT_H5AD)