For each of the 247 genes in PPI_1percent_Gene_partners_Final_13042025.xlsx (generated in Maxime_PPI_1percent_list_13042025.ipynb):

- Extract that gene + all its interactors.
- Save their expression data (pulled from the per-letter CellxGene CSV files: A_genes_CELLxGENE_...csv to Z_genes_...csv) into a separate Excel file per gene:
  - e.g., ACSF3_PPI.xlsx, MLH1_PPI.xlsx

In [1]:
import os
import pandas as pd

# === PATH SETUP ===

# Every input and output of this notebook lives under one PPI results folder.
ppi_root = "./results_11032025/Jess_PPI_21032025"
preprocessed_root = ppi_root + "/PPI_preprocessed_15042025"

# Level1 holds the CELLxGENE expression CSVs that are read in;
# Level2 receives the per-gene *_PPI.xlsx files that are written out.
input_dir = preprocessed_root + "/Level1/"
ppi_mapping_file = ppi_root + "/PPI_1percent_Gene_partners_Final_13042025.xlsx"
output_dir = preprocessed_root + "/Level2"

# Make sure the output folder exists (no error if it is already there).
os.makedirs(output_dir, exist_ok=True)

In [2]:
# === STEP 1: MERGE ALL EXPRESSION FILES (CELLxGENE .csv files in input_dir) ===

# Columns that together identify one expression record; used for de-duplication.
dedup_keys = ["Gene Symbol", "Tissue", "Cell Type"]

# List the relevant files once, in sorted order, and reuse that list below.
input_files = sorted(
    f for f in os.listdir(input_dir) if f.endswith(".csv") and "CELLxGENE" in f
)

all_dfs = []       # one DataFrame per file that was read successfully
failed_files = []  # names of files that could not be read

for file in input_files:
    file_path = os.path.join(input_dir, file)
    print(f"📂 Reading: {file_path}")
    try:
        df = pd.read_csv(file_path)
        all_dfs.append(df)
    except Exception as e:
        # Best effort: keep reading the remaining files, but remember the
        # failure so it is reported again once the loop has finished.
        failed_files.append(file)
        print(f"⚠️ Failed to read {file_path}: {e}")

# Nothing to merge: stop here instead of failing later with an obscure error.
if not all_dfs:
    raise ValueError("❌ No CSV files were read from the input directory.")

# A partial merge silently drops genes, so summarise any read failures.
if failed_files:
    print(f"⚠️ {len(failed_files)} of {len(input_files)} files could not be read: {failed_files}")

# Stack all per-file tables into one DataFrame with a fresh 0..n-1 index.
merged_expr_df = pd.concat(all_dfs, ignore_index=True)
print(f"📊 Merged total rows: {merged_expr_df.shape[0]}")

# Fail with a clear message if a key column is absent from the merged table.
missing_cols = [c for c in dedup_keys if c not in merged_expr_df.columns]
if missing_cols:
    raise KeyError(f"Merged expression table is missing required columns: {missing_cols}")

# === STEP 2: DROP REDUNDANT GENE + TISSUE + CELL TYPE ENTRIES ===
# Keeps the first row seen for each (Gene Symbol, Tissue, Cell Type) combination.
merged_expr_df = merged_expr_df.drop_duplicates(subset=dedup_keys)
print(f"✅ Non-redundant merged rows: {merged_expr_df.shape[0]}")

📂 Reading: ./results_11032025/Jess_PPI_21032025/PPI_preprocessed_15042025/Level1/A_genes_CELLxGENE_gene_expression_041625.csv
📂 Reading: ./results_11032025/Jess_PPI_21032025/PPI_preprocessed_15042025/Level1/B_genes_CELLxGENE_gene_expression_041625.csv
📂 Reading: ./results_11032025/Jess_PPI_21032025/PPI_preprocessed_15042025/Level1/C_genes_CELLxGENE_gene_expression_041625.csv
📂 Reading: ./results_11032025/Jess_PPI_21032025/PPI_preprocessed_15042025/Level1/D_genes_CELLxGENE_gene_expression_041625.csv
📂 Reading: ./results_11032025/Jess_PPI_21032025/PPI_preprocessed_15042025/Level1/E_genes_CELLxGENE_gene_expression_041625.csv
📂 Reading: ./results_11032025/Jess_PPI_21032025/PPI_preprocessed_15042025/Level1/F_genes_CELLxGENE_gene_expression_041625.csv
📂 Reading: ./results_11032025/Jess_PPI_21032025/PPI_preprocessed_15042025/Level1/G_genes_CELLxGENE_gene_expression_041625.csv
📂 Reading: ./results_11032025/Jess_PPI_21032025/PPI_preprocessed_15042025/Level1/H_genes_CELLxGENE_gene_expression_041

In [3]:
# Workbook listing each gene with its interactors (PPI file corrected by Maxime).
# This restates the path that the path-setup cell already assigned.
ppi_mapping_file = (
    "./results_11032025/Jess_PPI_21032025/"
    "PPI_1percent_Gene_partners_Final_13042025.xlsx"
)

In [4]:
# === LOAD GENE ↔ INTERACTORS MAPPING ===
# Read the mapping workbook and keep only the rows that have both a gene
# and an interactor list (rows with a missing value in either are dropped).
ppi_map = pd.read_excel(ppi_mapping_file).dropna(subset=["Gene", "Interactors"])

# Preview the first rows.
ppi_map.head()

Unnamed: 0,Gene,Interactors
0,AC012254.2,"GPR42, TMEM237, PIGP, TMEM86B, HERPUD2, TMEM80..."
1,ACSF3,"RAB28, TRIM27, KRT40"
2,ACTB,"ACTB, CFL2, ACTG1"
3,ACY1,ACY1
4,ADIPOQ,"PVR, TMEM237, BIK, BCL2L13, CLDN9, NEMP1, GPR4..."


Within each (Tissue, Cell Type) group:

- Only include groups where the POI (the query gene from the `Gene` column of the mapping file) is present.
- Sort the rows so that the POI comes first, followed by its interactors in alphabetical order.

In [5]:
#=== STEP 3: EXTRACT & SAVE PER GENE ===

def select_ppi_expression(expr_df, gene, interactors):
    """Return the expression rows for a gene (POI) and its interactors.

    Parameters
    ----------
    expr_df : pandas.DataFrame
        Expression table with at least the columns "Gene Symbol", "Tissue"
        and "Cell Type".
    gene : str
        Symbol of the POI.
    interactors : list of str
        Symbols of the POI's interaction partners.

    Returns
    -------
    pandas.DataFrame
        Rows of `expr_df` whose "Gene Symbol" is the POI or one of its
        interactors, restricted to the (Tissue, Cell Type) pairs in which the
        POI itself has a row. Rows are sorted by Tissue, then Cell Type, with
        the POI first and the interactors after it in alphabetical order.
        The frame is empty when the POI has no row in `expr_df`.
    """
    cluster_keys = ["Tissue", "Cell Type"]

    # Rows belonging to the POI or any of its interactors.
    subset_df = expr_df[expr_df["Gene Symbol"].isin([gene, *interactors])]

    # (Tissue, Cell Type) pairs in which the POI itself is present.
    valid_clusters = subset_df.loc[
        subset_df["Gene Symbol"] == gene, cluster_keys
    ].drop_duplicates()
    if valid_clusters.empty:
        # Same columns, zero rows: lets the caller test `.empty` uniformly.
        return subset_df.iloc[0:0]

    # Keep only rows from those pairs. The POI's own rows always survive this
    # inner merge, so the result is never empty at this point.
    filtered_df = subset_df.merge(valid_clusters, on=cluster_keys, how="inner")

    # Temporary 0/1 key: 0 for the POI, 1 for interactors, so the POI sorts first.
    return (
        filtered_df
        .assign(Symbol_Priority=(filtered_df["Gene Symbol"] != gene).astype(int))
        .sort_values(by=cluster_keys + ["Symbol_Priority", "Gene Symbol"])
        .drop(columns=["Symbol_Priority"])
    )


for raw_gene, raw_interactors in zip(ppi_map["Gene"], ppi_map["Interactors"]):
    # str() guards against non-string cells (e.g. values Excel has converted to
    # another type); such a gene is then reported as not found instead of
    # aborting the whole loop with an AttributeError.
    gene = str(raw_gene).strip()
    interactors = [g.strip() for g in str(raw_interactors).split(",") if g.strip()]

    filtered_df = select_ppi_expression(merged_expr_df, gene, interactors)

    if filtered_df.empty:
        print(f"⚠️ Skipping {gene}: POI not found in any tissue × cell type combination.")
        continue

    # One workbook per POI, named after the gene.
    output_path = os.path.join(output_dir, f"{gene}_PPI.xlsx")
    filtered_df.to_excel(output_path, index=False)
    print(f"✅ Saved: {gene}_PPI.xlsx with {filtered_df.shape[0]} rows")

#Looks correct!

⚠️ Skipping AC012254.2: POI not found in any tissue × cell type combination.
✅ Saved: ACSF3_PPI.xlsx with 13484 rows
✅ Saved: ACTB_PPI.xlsx with 10113 rows
✅ Saved: ACY1_PPI.xlsx with 3371 rows
✅ Saved: ADIPOQ_PPI.xlsx with 70791 rows
✅ Saved: AGXT_PPI.xlsx with 30339 rows
✅ Saved: AHCY_PPI.xlsx with 6742 rows
✅ Saved: AIPL1_PPI.xlsx with 6742 rows
✅ Saved: ALAS2_PPI.xlsx with 6742 rows
✅ Saved: ALDOA_PPI.xlsx with 3371 rows
✅ Saved: ALOX5_PPI.xlsx with 13484 rows
✅ Saved: AMPD2_PPI.xlsx with 6742 rows
✅ Saved: ANKRD1_PPI.xlsx with 10113 rows
✅ Saved: ANXA11_PPI.xlsx with 6742 rows
✅ Saved: AP2S1_PPI.xlsx with 6742 rows
✅ Saved: APOA1_PPI.xlsx with 6742 rows
✅ Saved: APOD_PPI.xlsx with 53936 rows
✅ Saved: ASNS_PPI.xlsx with 6742 rows
✅ Saved: ATPAF2_PPI.xlsx with 30339 rows
✅ Saved: BAG3_PPI.xlsx with 70791 rows
✅ Saved: BANF1_PPI.xlsx with 6742 rows
✅ Saved: BCL10_PPI.xlsx with 10113 rows
✅ Saved: BFSP2_PPI.xlsx with 74162 rows
✅ Saved: BLK_PPI.xlsx with 60678 rows
✅ Saved: C1QA_PPI.x

In [6]:
# Spot-check one of the generated workbooks.
file = "./results_11032025/Jess_PPI_21032025/PPI_preprocessed_15042025/Level2/ACSF3_PPI.xlsx"
df = pd.read_excel(file)

# Distinct gene symbols present in the file, and whether the POI is among them.
symbols_in_file = df["Gene Symbol"].unique()
poi_found = "ACSF3" in symbols_in_file

print("Gene Symbol values:", symbols_in_file)
print("Looking for POI:", poi_found)

Gene Symbol values: ['ACSF3' 'KRT40' 'RAB28' 'TRIM27']
Looking for POI: True


In [7]:
# Show the column labels of the ACSF3_PPI.xlsx frame loaded in the previous cell.
df.columns

Index(['Tissue', 'Cell Type', 'Cell Count', 'Tissue Composition',
       'Gene Symbol', 'Expression', 'Expression, Scaled',
       'Number of Cells Expressing Genes'],
      dtype='object')