In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
import scanpy as sc

In [3]:
# Read the male and female gonadal datasets from their .h5ad files.
# Paths are relative to the notebook's working directory.
male, female = map(
    sc.read_h5ad,
    ("data/mouse_male_gonadal.h5ad", "data/mouse_female_gonadal.h5ad"),
)

In [11]:
# Show the distinct values of the 'development_stage' annotation across male cells.
np.unique(male.obs['development_stage'])

array(['Theiler stage 19', 'Theiler stage 20', 'Theiler stage 21'],
      dtype=object)

In [None]:
# Disabled: these lines would overwrite .X and .var_names of both datasets
# with the contents of their .raw attribute.
# NOTE(review): dead code — confirm whether restoring .raw is ever needed,
# otherwise delete this cell.
# male.X = male.raw.X
# male.var_names = male.raw.var_names

# female.X = female.raw.X
# female.var_names = female.raw.var_names

In [10]:
def _print_matrix_summary(heading, adata):
    """Print a heading followed by the shape of ``adata`` and the dtype and
    min/max of its ``.X`` matrix."""
    matrix = adata.X
    print(heading)
    print("Shape:", adata.shape)
    print("Data type:", matrix.dtype)
    print("Min/Max values:", np.min(matrix), np.max(matrix))


# Quick sanity check of both expression matrices before any filtering.
_print_matrix_summary("Male dataset:", male)
print("")  # blank line separating the two reports

_print_matrix_summary("Female dataset:", female)

Male dataset:
Shape: (32889, 24400)
Data type: float32
Min/Max values: 0.0 16701.0

Female dataset:
Shape: (69709, 25652)
Data type: float32
Min/Max values: 0.0 16526.0


In [11]:
# Basic quality control, applied to each dataset in turn:
#   1. drop cells with fewer than 200 detected genes,
#   2. then drop genes detected in fewer than 3 cells.
# Both calls modify the AnnData object they are given.
for adata in (male, female):
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)

In [None]:
# Scale every cell to a total of 1e4 and then apply log1p, for each dataset.
# Both steps overwrite the data held by the object, so this cell must be run
# exactly once per kernel session: re-running it would transform values that
# have already been transformed.
for adata in (male, female):
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

In [13]:
# Display the variable (gene) identifiers of the male dataset as they stand at this point.
male.var_names

Index(['ENSMUSG00000109644', 'ENSMUSG00000007777', 'ENSMUSG00000086714',
       'ENSMUSG00000043644', 'ENSMUSG00000042208', 'ENSMUSG00000020831',
       'ENSMUSG00000107002', 'ENSMUSG00000046683', 'ENSMUSG00000058706',
       'ENSMUSG00000099146',
       ...
       'ENSMUSG00000064354', 'ENSMUSG00000064358', 'ENSMUSG00000064370',
       'ENSMUSG00000064341', 'ENSMUSG00000064345', 'ENSMUSG00000064360',
       'ENSMUSG00000064363', 'ENSMUSG00000065947', 'ENSMUSG00000064367',
       'ENSMUSG00000064368'],
      dtype='object', length=23776)

In [14]:
# Display the variable (gene) identifiers of the female dataset as they stand at this point.
female.var_names

Index(['ENSMUSG00000109644', 'ENSMUSG00000108652', 'ENSMUSG00000007777',
       'ENSMUSG00000086714', 'ENSMUSG00000043644', 'ENSMUSG00000042208',
       'ENSMUSG00000020831', 'ENSMUSG00000107002', 'ENSMUSG00000046683',
       'ENSMUSG00000058706',
       ...
       'ENSMUSG00000064354', 'ENSMUSG00000064358', 'ENSMUSG00000064370',
       'ENSMUSG00000064341', 'ENSMUSG00000064345', 'ENSMUSG00000064360',
       'ENSMUSG00000064363', 'ENSMUSG00000065947', 'ENSMUSG00000064367',
       'ENSMUSG00000064368'],
      dtype='object', length=25093)

In [15]:
# Genes present in both datasets.
common_genes = male.var_names.intersection(female.var_names)

# Restrict each dataset to the shared genes. `.copy()` yields standalone
# objects rather than views onto `male` / `female`.
male_data, female_data = (
    dataset[:, common_genes].copy() for dataset in (male, female)
)

In [16]:
# Keep only cell types represented by at least 100 cells, separately per dataset.

# Male: count cells per type, pick the types meeting the threshold, subset.
cell_counts_male = male_data.obs['cell_type'].value_counts()
valid_cell_types_male = cell_counts_male.loc[cell_counts_male.ge(100)].index
male_keep_mask = male_data.obs['cell_type'].isin(valid_cell_types_male)
male_data = male_data[male_keep_mask, :]

# Female: same procedure.
cell_counts_female = female_data.obs['cell_type'].value_counts()
valid_cell_types_female = cell_counts_female.loc[cell_counts_female.ge(100)].index
female_keep_mask = female_data.obs['cell_type'].isin(valid_cell_types_female)
female_data = female_data[female_keep_mask, :]


In [17]:
# Cell-type labels that occur in both datasets.
common_cell_types = set(male_data.obs['cell_type']) & set(female_data.obs['cell_type'])

# Drop cells whose type is missing from the other dataset.
male_data, female_data = (
    dataset[dataset.obs['cell_type'].isin(common_cell_types), :]
    for dataset in (male_data, female_data)
)

In [18]:
# Disabled: would apply normalize_total (target_sum=1e4) and log1p to the
# subsetted `male_data` / `female_data` objects.
# NOTE(review): the same two calls appear uncommented earlier, applied to
# `male` / `female` — confirm this variant is obsolete and delete the cell.
# sc.pp.normalize_total(male_data, target_sum=1e4)
# sc.pp.log1p(male_data)

# sc.pp.normalize_total(female_data, target_sum=1e4)
# sc.pp.log1p(female_data)

In [19]:
# Put the genes of both datasets in the same sorted order. The order is taken
# from the male dataset's gene names and applied to both objects.
sorted_genes = list(male_data.var_names)
sorted_genes.sort()

male_data, female_data = (
    dataset[:, sorted_genes] for dataset in (male_data, female_data)
)


In [20]:
# Dense cells-by-genes DataFrames of the expression matrices.
# AnnData.to_df() builds the frame from .X with obs_names as the index and
# var_names as the columns, and densifies .X only when it is sparse, so this
# works whether the matrix is stored sparse or dense (calling .toarray()
# directly raises AttributeError on a dense ndarray).
# Memory: each frame holds rows x columns values densely — check the shapes
# printed below before running this on a memory-constrained machine.
male_df = male_data.to_df()
female_df = female_data.to_df()

In [21]:
# Report (rows, columns) of the male frame, then the female frame, one per line.
print(male_df.shape, female_df.shape, sep="\n")

(30784, 23333)
(33120, 23333)


In [22]:
# Display the per-cell 'cell_type' labels of the filtered male dataset.
male_data.obs['cell_type']

HCA_Mou_10827181_AAACCTGAGTAGATGT         mesenchymal cell
HCA_Mou_10827181_AAACCTGAGTCATGCT         mesenchymal cell
HCA_Mou_10827181_AAACCTGCAAGAGTCG          epithelial cell
HCA_Mou_10827181_AAACCTGCAGACACTT         mesenchymal cell
HCA_Mou_10827181_AAACCTGCATGCCTTC         mesenchymal cell
                                             ...          
HCA_Mou_10827186_TTTGTCAAGTGGAGAA         mesenchymal cell
HCA_Mou_10827186_TTTGTCACACCACCAG         mesenchymal cell
HCA_Mou_10827186_TTTGTCACACGTGAGA         mesenchymal cell
HCA_Mou_10827186_TTTGTCACATGACATC              neural cell
HCA_Mou_10827186_TTTGTCAGTAAGGGCT    skeletal muscle fiber
Name: cell_type, Length: 30784, dtype: category
Categories (8, object): ['epithelial cell', 'endothelial cell', 'erythrocyte', 'germ cell', 'supporting cell', 'neural cell', 'skeletal muscle fiber', 'mesenchymal cell']

In [23]:
# Display the per-cell 'cell_type' labels of the filtered female dataset.
female_data.obs['cell_type']

HCA_Mou_10828699_AAACCTGAGTGTTGAA-Sanger        supporting cell
HCA_Mou_10828699_AAACCTGAGTTACCCA-Sanger        supporting cell
HCA_Mou_10828699_AAACGGGCATCGTCGG-Sanger              germ cell
HCA_Mou_10828699_AAAGATGAGAAGGGTA-Sanger        supporting cell
HCA_Mou_10828699_AAAGATGCAATCACAC-Sanger        supporting cell
                                                     ...       
GSM4643738_P5ovary_TTATGCTAGGATGGAA-Niu2020    endothelial cell
GSM4643738_P5ovary_TTCTTAGCAGTAAGAT-Niu2020           germ cell
GSM4643738_P5ovary_TTGAACGAGAGCTTCT-Niu2020           germ cell
GSM4643738_P5ovary_TTGAACGCATGGTCTA-Niu2020    endothelial cell
GSM4643738_P5ovary_TTGTAGGAGGTACTCT-Niu2020    endothelial cell
Name: cell_type, Length: 33120, dtype: category
Categories (8, object): ['epithelial cell', 'endothelial cell', 'erythrocyte', 'germ cell', 'supporting cell', 'neural cell', 'skeletal muscle fiber', 'mesenchymal cell']

In [None]:
# Write the filtered, gene-aligned datasets to disk as .h5ad files.
# The output directory is created first so the write does not depend on
# 'cleaned_data/' already existing (e.g. on a fresh checkout).
output_dir = Path('cleaned_data')
output_dir.mkdir(parents=True, exist_ok=True)

male_data.write_h5ad(output_dir / 'male_gonadal.h5ad')
female_data.write_h5ad(output_dir / 'female_gonadal.h5ad')