In [2]:
import pandas as pd 
import re 
import matplotlib.pyplot as plt 
import seaborn as sns
import os 

In [3]:
print("pandas:", pd.__version__)
print("re: built-in module (no version)")
print("matplotlib:", plt.matplotlib.__version__)
print("seaborn:", sns.__version__)
print("os: built-in module (no version)")

pandas: 2.2.2
re: built-in module (no version)
matplotlib: 3.9.2
seaborn: 0.13.2
os: built-in module (no version)


## Load all raw datasets 

In [None]:
# load all file types with profiling data
#damaging mutations matrix 
base_dir = os.path.join("..", "raw")  # Use relative path to raw_data directory
damaging_mutations_df = pd.read_csv(os.path.join(base_dir, "OmicsSomaticMutationsMatrixDamaging.csv"),index_col = 0 )
#omics profiles 
omics_profiles = pd.read_csv(os.path.join(base_dir, "OmicsProfiles.csv"), index_col=0)
#omics expresion data 
omics_expression = pd.read_csv(os.path.join(base_dir, "OmicsExpressionProteinCodingGenesTPMLogp1.csv"), index_col=0)
#omics copy number variation data 
omics_cnv = pd.read_csv(os.path.join(base_dir, "OmicsCNGeneWGS.csv"), index_col=0)
#estimated gene dependency probability based on CRISPR data 
CRISPR_dependency = pd.read_csv(os.path.join(base_dir, "CRISPRGeneDependency.csv"), index_col=0)

## confirm cell line present in all data types

In [None]:
cell_line_name = "FADU" 
#check with omics profiles first 
match = omics_profiles[omics_profiles["StrippedCellLineName"].str.lower()== cell_line_name.lower()]


if match.empty: 
    print(f"Cell line '{cell_line_name}' not found in OmicsProfiles.")
else: 
    model_id = match.index[0]  # ModelID is the index, not a column
    profile_row = match.index[0]
    print(f"Found '{cell_line_name}' cell line, model ID: {model_id} (row {profile_row})")
    # check with damaging mutations matrix 
    if model_id in damaging_mutations_df.index: 
        row_mut = damaging_mutations_df.index.get_loc(model_id) 
        print(f"{cell_line_name} cell line (Model ID: {model_id}) present in mutations matrix at row {row_mut} ") 
    else: 
        print(f"{cell_line_name} not found in mutation matrix") 
    # check with CRISPR dependency 
    if model_id in CRISPR_dependency.index: 
        row_dep = CRISPR_dependency.index.get_loc(model_id) 
        print(f"{cell_line_name} cell line (Model ID: {model_id}) present in CRISPR dependencies at row {row_dep} ") 
    else: 
        print(f"{cell_line_name} not found in CRISPR dependency data") 
    # check with CNV
    if model_id in omics_cnv.index: 
        row_cnv = omics_cnv.index.get_loc(model_id) 
        print(f"{cell_line_name} cell line (Model ID: {model_id}) present in omics CNV at row {row_cnv} ") 
    else: 
        print(f"{cell_line_name} not found in omics CNV data") 
    #check with expression 
    if model_id in omics_expression.index: 
        row_exp = omics_expression.index.get_loc(model_id) 
        print(f"{cell_line_name} cell line (Model ID: {model_id}) present in omics expressions at row {row_exp} ") 
    else: 
        print(f"{cell_line_name} not found in expression data ")

Found 'FADU' cell line, model ID: ACH-000846 (row ACH-000846)
FADU cell line (Model ID: ACH-000846) present in mutations matrix at row 739 
FADU cell line (Model ID: ACH-000846) present in CRISPR dependencies at row 568 
FADU cell line (Model ID: ACH-000846) present in omics CNV at row 486 
FADU cell line (Model ID: ACH-000846) present in omics expressions at row 941 


## CRISPR Gene Dependency data analysis 

In [None]:
# plot the CRISPR dependency data 
# transpose the CRISPR dependency data for easier analysis
CRISPR_depndency_t = CRISPR_dependency.T 
# number of genes in each cell line passing a certain threshold    
thresholds = [0.5, 0.6, 0.7, 0.8, 0.9]
threshold_summary = {}
colors = ['orange', 'blue', 'green', 'red', 'purple']
for i, threshold in enumerate(thresholds): 
    genes_above_threshold = (CRISPR_depndency_t >= threshold).sum(axis=0).sort_values()
    threshold_summary[threshold] = genes_above_threshold
    thredshold_summary_df = pd.DataFrame(threshold_summary)
    #summary of number of genes above threshold for each cell line
    summary_stats = thredshold_summary_df.describe(percentiles=[0.25, 0.5, 0.75])
    print(f"Summary statistics for threshold {threshold}:")
    print(summary_stats[threshold])
    print(f"Number of genes above threshold {threshold} in each cell line:")
    print(genes_above_threshold.head(5))
    # plot number of genes above threshold for each cell line 
    plt.figure(figsize=(10, 6))
    ax = sns.histplot(genes_above_threshold, bins=20, kde=True, color=colors[i % len(colors)])
    plt.title(f"Number of genes above threshold {threshold} per cell line")
    plt.xlabel("Number of genes")
    plt.ylabel("Number of cell lines")
    for patch in ax.patches:
        count = int(patch.get_height())
        if count > 0: 
            ax.annotate(f"{count}", 
                        (patch.get_x() + patch.get_width() / 2, patch.get_height()), 
                        ha='center', va='bottom')
    plt.show()


In [None]:
# plotting the CRISPR dependency data for a specific cell line
# get specific cell line gene dependency data summary
cell_line_data = CRISPR_dependency.loc[model_id]
print(f"Summary statistics for {cell_line_name} cell line CRISPR dependency probabilities:")
print(cell_line_data.describe())
# get the genes above the threshold for the specific cell line
threshold = 0.5
cell_line_genes_above_threshold = genes_above_threshold[cell_line_data.name]
print(f"genes in {cell_line_name} passing the {threshold} gene dependency threshold: {cell_line_genes_above_threshold}")
# make it for multiple thresholds
#bucket counts 
bin_counts=pd.cut(cell_line_data, bins=[0, 0.25, 0.5,0.7, 0.9, 1.0], labels=["0-0.25", "0.25-0.5", "0.5-0.7", "0.7-0.9", "0.9-1.0"]).value_counts().sort_index()
print(f"Dependency score distribution for {cell_line_name} cell line:")
print(bin_counts)
# plot distirbution for the selected cell line 
plt.figure(figsize=(10, 6))
ax = sns.histplot(cell_line_data, bins=20, kde=True, color ='orange')
for pat in ax.patches:
    if pat.get_height() > 0:
        ax.annotate(f'{int(pat.get_height())}', 
                    (pat.get_x() + pat.get_width() / 2, pat.get_height()), 
                    ha='center', va='bottom', fontsize=10, color='black') 
plt.title(f"CRISPR Dependecy Probabilities Distribution for {cell_line_name} cell line")
plt.xlabel("CRISPR Dependency Probability")
plt.ylabel("Number of Genes")


## UniProt Mapping Process (includes both the Gene Index number attempt and Gene Symbol pipeline)

In [None]:
# pipeline for making the prize file 
# get the gene symbols from damaging mutations matrix as input file for Uniprot web service 
# use gene numbers instead of gene symbols 
preprocessed_dir = os.path.join("..", "processed", "gene_index_mapping_attempt") 
os.makedirs(preprocessed_dir, exist_ok=True)
gene_columns = damaging_mutations_df.columns.tolist()[1:]
gene_numbers = [re.search(r'\(([^)]+)\)', col).group(1) for col in gene_columns if isinstance(col, str) and "unknown" not in col.lower() and re.search(r'\(([^)]+)\)', col)]   
#get rid of unknown gene numbers saved as ""s 
genes_numbers_df = pd.DataFrame(gene_numbers, columns=["GeneNumber"])
#save the gene numbers to a csv file for uniprot mapping
gene_columns_df = pd.DataFrame(gene_columns, columns=["GeneColumns"])
gene_columns_df.to_csv(os.path.join(preprocessed_dir, "gene_columns.txt"), index=False, sep='\t', header=True)
genes_numbers_df.to_csv(os.path.join(preprocessed_dir, "gene_numbers.txt"), index=False, sep='\t', header=True)

In [135]:
#mapping task ran on uniprot web service 7/24/2025, from GeneID to UniprotKB 
raw_id_mapping = pd.read_csv(os.path.join(preprocessed_dir, "raw_uniprot_idmapping_2025_07_24.tsv"), sep='\t')
raw_id_mapping
# multiply mappings for the same gene index number - duplicates are unreviewed, use reviewed only
id_mapping = raw_id_mapping[raw_id_mapping["Reviewed"] == "reviewed"]
id_mapping
# check for reviewed unique mappings 
unique_mappings = id_mapping['From'].unique()
print(f"Number of unique reviewed mappings: {len(unique_mappings)}")   
id_mapping.to_csv(os.path.join(preprocessed_dir, "reviewed_id_mapping_2025_07_24.tsv"), sep='\t', index=False, header=True)


Number of unique reviewed mappings: 17498


In [None]:
#check for duplicates 
duplicate_ids = id_mapping['From'].value_counts()   
duplicate_ids = duplicate_ids[duplicate_ids > 1]
duplicated_entries = id_mapping[id_mapping['From'].isin(duplicate_ids.index)]
unique_duplicates = duplicated_entries['From'].unique()
print(f"Number of unique reviewed mappings: {len(unique_duplicates)}")   
# duplicated_entries.to_csv(os.path.join(preprocessed_dir, "duplicated_mapping_entries.tsv"), sep='\t', index=False)
duplicated_entries
#need further work to go with gene index number mapping 

In [None]:
# pipeline for mapping and making prize file using Gene Symbols instead 
# get the gene symbols from damaging mutations matrix as input file for Uniprot web service
gene_columns = damaging_mutations_df.columns.tolist()[1:]
gene_symbols = [re.match(r"^(.*?) \(", col).group(1) if " (" in col else col for col in gene_columns]
genes_df = pd.DataFrame(gene_symbols, columns=["GeneSymbol"])
print(genes_df.head())
# genes_df.to_csv("/Users/gracel/Desktop/DepMap/DamamingMutationsGeneSymbols_20250718.csv", index=False)

In [None]:
uniprot_map = pd.read_csv(os.path.join("..", "processed", "DamagingMutations_idMapping_20250718.tsv"), sep='\t')
# mapping obtained from Uniprot web service 7/18/2025, from GeneSymbol to UniprotKB

## Preparing Prize input file 

In [None]:
# extract mutation data for the cell line 
mutation_row = damaging_mutations_df.loc[model_id]
print(f"Summary of damaging mutations data for {cell_line_name} cell line (Model ID: {model_id}):")
print(mutation_row.describe())
print(mutation_row.value_counts().sort_index()  )
# mapping gene symbols to Uniprot IDs 
gene_to_uniprot = dict(zip(uniprot_map["From"], uniprot_map["Entry Name"]))
rows = []
for col, score in mutation_row.items(): 
    #extract gene symbols for mapping
    match = re.match(r"^(.*?) \(", col)
    gene_symbol = match.group(1) if match else col
    # only map for gene symbols in uniprot map 
    if gene_symbol in gene_to_uniprot:
        uniprot_id = gene_to_uniprot[gene_symbol]
        rows.append([gene_symbol, uniprot_id, score])
mapped_prizes_df = pd.DataFrame(rows, columns=["GeneSymbol", "UniprotID", "Prize"])

In [None]:
prizes_input_file = mapped_prizes_df[mapped_prizes_df.columns[1:]].rename(columns={"UniprotID": "NODEID", "Prize":"prize"})
# save the prizes input as txt for spras 
output_path = os.path.join("..", "processed", f"{cell_line_name}_cell_line_prizes_input.txt")
prizes_input_file.to_csv(output_path, sep='\t', index=False, header=True)
print(f"Prize file saved for cell line '{cell_line_name}' at: {output_path}")

Prize file saved for cell line 'FADU' at: C:\Users\gli2\Desktop\DepMap\FADU_cell_line_prizes_input.txt


In [None]:
nonzero_prizes_input_file = prizes_input_file[prizes_input_file["prize"] > 0]
nonzero_prizes_input_file
nonzero_output_path = os.path.join("..", "processed", f"{cell_line_name}_cell_line_prizes_input_nonzero.txt")
nonzero_prizes_input_file.to_csv(nonzero_output_path, sep='\t', index=False, header=True)
print(f"Prize file saved for cell line '{cell_line_name}' at: {nonzero_output_path}")
nonzero_prizes_input_file

Prize file saved for cell line 'FADU' at: C:\Users\gli2\Desktop\DepMap\FADU_nonzero_cell_line_prizes_input.txt


Unnamed: 0,NODEID,prize
301,PDK1L_HUMAN,1.0
682,EFCB7_HUMAN,1.0
825,GDE_HUMAN,1.0
1255,IGSF8_HUMAN,1.0
1394,ASTN1_HUMAN,1.0
1939,MGLYR_HUMAN,1.0
2039,PHIPL_HUMAN,1.0
2210,CPEB3_HUMAN,1.0
2658,ACD_HUMAN,1.0
2778,ANO3_HUMAN,1.0


## Making Gold standard file for the selected cel line based on gene dependency 0.5 cutoff 

In [None]:
# map Uniprot IDs to gene symbols in the CRISPR dependency data 
cell_line_dependency = CRISPR_dependency.loc[model_id]
filtered_dependency = cell_line_dependency[cell_line_dependency > 0.5]
mapped_dependency = [] 
for gene, dependency in filtered_dependency.items():
    match = re.match(r"^(.*?) \(", gene)
    gene_symbol = match.group(1) if match else gene
    if gene_symbol in gene_to_uniprot:
        uniprot_id = gene_to_uniprot[gene_symbol]
        mapped_dependency.append([gene_symbol, uniprot_id, dependency])
mapped_dependency_df = pd.DataFrame(mapped_dependency, columns=["GeneSymbol", "UniprotID", "Dependency"])
print(f"Mapped CRISPR dependency data for cell line '{cell_line_name}':")
# save mapped dependency as gold standard file 
gold_standard = mapped_dependency_df[mapped_dependency_df.columns[1]]
gold_standard_output_path = os.path.join("..", "processed", f"{cell_line_name}_gold_standard.txt")
gold_standard.to_csv(gold_standard_output_path, sep='\t', index=False, header=False)

Mapped CRISPR dependency data for cell line 'FADU':


0        AAMP_HUMAN
1        SYAC_HUMAN
2        SYAM_HUMAN
3        AATF_HUMAN
4       ABCB7_HUMAN
           ...     
1609    ZN830_HUMAN
1610    ZNHI2_HUMAN
1611     BCD1_HUMAN
1612     ZPR1_HUMAN
1613    U2AFM_HUMAN
Name: UniprotID, Length: 1614, dtype: object