In [1]:
# read in the 01_lukassen_extracted/lukassen_supp_table_s2_full.csv
import pandas as pd

df = pd.read_csv("01_lukassen_extracted/lukassen_supp_table_s2_full.csv")

# print the first few rows of the dataframe
print(df.head())

  Gene name  Early Sgonia  Late Sgonia  Early Scytes  Late Scytes  \
0    Zbtb16             1          0.0           0.0          0.0   
1     Sycp3             1          1.0           1.0          1.0   
2     Pygo2             1          1.0           0.0          0.0   
3      Etv5             1          0.0           0.0          0.0   
4       Bsg             0          0.0           1.0          1.0   

   Round Stids  Later Stids  Sertoli  Leydig   Method Source  
0          0.0          0.0      0.0     0.0  In situ   1 ,2  
1          1.0          1.0      NaN     NaN     qPCR      3  
2          0.0          1.0      1.0     1.0      IHC      4  
3          0.0          0.0      1.0     0.0      IHC      5  
4          1.0          1.0      0.0     0.0  In situ      6  


In [2]:
# create a json file with the dataframe, while a column is a key and the gene names are the values if the gene is =1.0 for that column
import json

# Create a dictionary where each column name maps to a list of gene names where the value is 1.0
gene_markers = {col: df[df[col] == 1.0]['Gene name'].tolist() for col in df.columns}

#remove the column "Gene name" from the dictionary
gene_markers.pop("Gene name")
gene_markers.pop("Method")
gene_markers.pop("Source")

print(gene_markers.keys())

# make all values capital
for key in gene_markers:
    gene_markers[key] = [gene.upper() for gene in gene_markers[key]]

# save the dictionary to a json file
with open('02_mapped_gene_markers/all_gene_markers.json', 'w') as f:
    json.dump(gene_markers, f)

# Iterate over all rows in the df and calculate the number of 1.0 values (excluding columns "Gene name", "Method", "Source")
# If the count is greater than 1, add the gene name to the multiple_genes list

exclude_columns = ["Gene name", "Method", "Source"]
marker_columns = [col for col in df.columns if col not in exclude_columns]

# create a df multiple_genes
multiple_genes_list = []
for _, row in df.iterrows():
    count_ones = (row[marker_columns] == 1.0).sum()
    if count_ones > 1:
        multiple_genes_list.append({"Gene name": row['Gene name'], "Count": count_ones, "Type": "multiple"})
    elif count_ones == 1:
        multiple_genes_list.append({"Gene name": row['Gene name'], "Count": count_ones, "Type": "exclusive"})
    else:
        multiple_genes_list.append({"Gene name": row['Gene name'], "Count": count_ones, "Type": "none"})

multiple_genes = pd.DataFrame(multiple_genes_list)
print(multiple_genes.head())

print("\n")

# print len of Genes that are exclusive
print("Genes that are multiple:", len(multiple_genes[multiple_genes['Type'] == 'multiple']['Gene name']))
print("Genes that are exclusive:", len(multiple_genes[multiple_genes['Type'] == 'exclusive']['Gene name']))
print("Genes that are none:", len(multiple_genes[multiple_genes['Type'] == 'none']['Gene name']))
# print the none genes
print(multiple_genes[multiple_genes['Type'] == 'exclusive']['Gene name'])

dict_keys(['Early Sgonia', 'Late Sgonia', 'Early Scytes', 'Late Scytes', 'Round Stids', 'Later Stids', 'Sertoli', 'Leydig'])
  Gene name  Count       Type
0    Zbtb16      1  exclusive
1     Sycp3      6   multiple
2     Pygo2      5   multiple
3      Etv5      2   multiple
4       Bsg      4   multiple


Genes that are multiple: 253
Genes that are exclusive: 49
Genes that are none: 0
0             Zbtb16
7                Id4
17              Sox3
23             Kdm3a
46            Sohlh2
47             Epcam
50              Dmc1
52              Tnp1
55              Prm1
58            Sohlh1
65             Hils1
66             Hils1
70             Dyrk4
71            Pou5f1
78             Gpat2
84             Klf17
94            Hspa1l
107           Crisp2
161           Pou5f2
170              Wt1
171            Rhox5
172             Sox8
183             Sufu
194             Ngn3
204             Kitl
205             Kitl
206            Itga6
208            Itgb1
213            Uchl1
215

In [3]:
# generate an exclusive_gene_markers.json file mapping each cell type to its exclusive gene(s)
# Only allow exclusive genes that are assigned to a single cell type (no double assignments).

# First, find all genes that are exclusive (should appear exactly once among all cell types)
exclusive_genes = multiple_genes[multiple_genes['Type'] == 'exclusive']['Gene name']

# remove duplicates
exclusive_genes = pd.Series(list(set(exclusive_genes)))

# copy gene_markers but remove all values from all keys
exclusive_gene_markers = gene_markers.copy()
for key in exclusive_gene_markers:
    exclusive_gene_markers[key] = []

# iterate over exclusive_genes
for gene in exclusive_genes:
    print("gene: ", gene)
    # Filter all rows for this gene
    rows = df[df['Gene name'] == gene].drop(["Method", "Source"], axis=1)

    # If multiple rows for the gene exist, find the row with sum == 1.0 over marker columns (excluding 'Gene name')
    chosen_row = None
    for idx, row in rows.iterrows():
        marker_values = row.drop("Gene name")
        if marker_values.sum() == 1.0:
            chosen_row = row
            break

    if chosen_row is None:
        # In case there's only one row or none with exactly one assignment, check the first one
        if len(rows) == 1:
            chosen_row = rows.iloc[0]
        else:
            continue  # skip if not a valid exclusive match

    # Go over columns of the chosen_row and add the column name to the dict if value is 1.0 and it's a marker key
    for col in exclusive_gene_markers.keys():
        # Avoid "Gene name" col and only use marker columns
        if col == "Gene name":
            continue
        if pd.notnull(chosen_row[col]) and chosen_row[col] == 1.0:
            print("----", gene, "col: ", col)
            exclusive_gene_markers[col] = exclusive_gene_markers[col] + [gene]
# Save exclusive_gene_markers to json
#with open('02_mapped_gene_markers/exclusive_gene_markers.json', 'w') as f:
#    json.dump(exclusive_gene_markers, f)

gene:  Dmc1
---- Dmc1 col:  Early Scytes
gene:  Meiob
---- Meiob col:  Early Scytes
gene:  Gfra1
---- Gfra1 col:  Early Sgonia
gene:  Id4
---- Id4 col:  Early Sgonia
gene:  Sohlh1
---- Sohlh1 col:  Early Sgonia
gene:  Kdm3a
---- Kdm3a col:  Late Scytes
gene:  Slx
---- Slx col:  Round Stids
gene:  Sohlh2
---- Sohlh2 col:  Early Sgonia
gene:  Id3
---- Id3 col:  Sertoli
gene:  Cyp17a1
---- Cyp17a1 col:  Leydig
gene:  Pou5f2
---- Pou5f2 col:  Late Scytes
gene:  Sufu
---- Sufu col:  Later Stids
gene:  Sox8
---- Sox8 col:  Sertoli
gene:  Rhox5
---- Rhox5 col:  Sertoli
gene:  Dll1
---- Dll1 col:  Later Stids
gene:  Pou5f1
---- Pou5f1 col:  Early Sgonia
gene:  Hsd17b11
---- Hsd17b11 col:  Leydig
gene:  Tfcp2
---- Tfcp2 col:  Sertoli
gene:  Uchl1
---- Uchl1 col:  Early Sgonia
gene:  Wt1
---- Wt1 col:  Sertoli
gene:  Csf1
---- Csf1 col:  Leydig
gene:  Itgb1
---- Itgb1 col:  Early Sgonia
gene:  Hils1
---- Hils1 col:  Round Stids
gene:  Klf17
---- Klf17 col:  Round Stids
gene:  1700024P04Rik
---- 

In [4]:
# print len of all values of all keys in exclusive_gene_markers
total_len = 0
for key in exclusive_gene_markers.keys():
    temp = len(exclusive_gene_markers[key])
    if temp == 0:
        print("for key: ", key, "is empty")
    print(temp)
    total_len += temp
print("total length: ", total_len)

if total_len == len(exclusive_genes):
    print("total_len == len(exclusive_genes) -> PASST ALLES")

16
for key:  Late Sgonia is empty
0
3
2
5
7
7
5
total length:  45
total_len == len(exclusive_genes) -> PASST ALLES


## problem is that the late sgonia key is empty. so there wasnt any gene that was only occuring in late sgonia and so to say a 100% unique marker gene for that cell population. So i searched all genes that appear in late sgonia and add them to the key if they arent already appearing in another key, to not create any inteferences.

In [5]:
# add genes to the key "Late Sgonia" in exclusive_gene_markers if they are in this column in the df and these genes are not already in another key
late_sgs = df[df['Late Sgonia']==1.0]

#remove all genes that are appearing in sum in more than 3 cell types
for id, row in late_sgs.iterrows():
    count_ones = (row[marker_columns] == 1.0).sum()
    if count_ones > 3:
        late_sgs = late_sgs.drop(id)

# make gene names unique in late_sgs by using the "Gene name" column
late_sgs = late_sgs.drop_duplicates(subset=['Gene name'])

# remove all genes that are appearing in sum in more than 3 cell types
print("total of ", len(late_sgs), " genes appearing in Late Sgonia and less/equal than 3 other cell types, we dont want to check for genes that are all over in all celltypes")

print("iterating over all late_sgonia genes and add them to the Late Sgonia key in exclusive_gene_markers if they are not already in another key")
for _, row in late_sgs.iterrows():
    gene = row['Gene name']
    if gene not in exclusive_gene_markers.values():
        print("gene: ", gene)
        exclusive_gene_markers['Late Sgonia'].append(gene)


print("added final of ", len(exclusive_gene_markers['Late Sgonia']), " genes to the Late Sgonia key")

# make all values capital
for key in exclusive_gene_markers:
    exclusive_gene_markers[key] = [gene.upper() for gene in exclusive_gene_markers[key]]

# save the exclusive_gene_markers to a json file
with open('02_mapped_gene_markers/exclusive_gene_markers.json', 'w') as f:
    json.dump(exclusive_gene_markers, f)

total of  27  genes appearing in Late Sgonia and less/equal than 3 other cell types, we dont want to check for genes that are all over in all celltypes
iterating over all late_sgonia genes and add them to the Late Sgonia key in exclusive_gene_markers if they are not already in another key
gene:  Dazl
gene:  Rad51
gene:  Stra8
gene:  Hist1h1a
gene:  Pcna
gene:  Tex11
gene:  Nanos3
gene:  E2f1
gene:  Hist1h3a
gene:  Wsb2
gene:  Esx1
gene:  Usp26
gene:  Sall4
gene:  Nlrp4c
gene:  Ccna2
gene:  Ccne2
gene:  Ccnd1
gene:  Crabp1
gene:  Esrp1
gene:  Adgra3
gene:  Cdkn1c
gene:  Per1
gene:  Lin28a
gene:  Figla
gene:  Utf1
gene:  Hist1h4m
gene:  Fthl17
added final of  27  genes to the Late Sgonia key
