## Calculate the % of shared mutations between GBC and OL

### Read in vcf files

In [1]:
import pandas as pd
import glob
import os

def read_all_txt_files_in_folder(folder_path):
    txt_files = glob.glob(folder_path + "/*.vcf")
    dfs = {}
    for txt_file in txt_files:
        file_name = os.path.splitext(os.path.basename(txt_file))[0]
        df = pd.read_csv(txt_file, sep='\t')
        dfs[file_name] = df
    return dfs


folder_path_GBC = '/Users/tingyang/Desktop/LAB/ProgressReport/2023/PCGA/GBC_OL/Shared_mutation/GBC/'
folder_path_OL = '/Users/tingyang/Desktop/LAB/ProgressReport/2023/PCGA/GBC_OL/Shared_mutation/OL/'

GBC_dict = read_all_txt_files_in_folder(folder_path_GBC)
OL_dict = read_all_txt_files_in_folder(folder_path_OL)


In [2]:
keys_list = list(GBC_dict.keys())
print(keys_list)

['RADS29_final', 'RADS47_final', 'RADS17_final', 'RADS6_final', 'RADS36_final', 'RADS10_final', 'RADS52_final', 'RADS28_final', 'RADS7_final', 'RADS37_final', 'RADS23_final', 'RADS40_final', 'RADS13_final', 'RADS2_final', 'RADS45_final', 'RADS4_final', 'RADS34_final', 'RADS43_final', 'RADS39_final', 'RADS18_final', 'RADS33_final', 'RADS3_final', 'RADS27_final', 'RADS44_final', 'RADS5_final', 'RADS8_final', 'RADS42_final', 'RADS19_final']


### Plot the Venn diagrams

In [3]:
import pandas as pd
from matplotlib_venn import venn2
import matplotlib.pyplot as plt

# Define a function to convert a data frame to a set of tuples (rows)
def df_to_set(df):
    return set(tuple(x) for x in df.to_records(index=False))

# Define a function to count the number of rows that are exactly the same between two data frames
def count_exact_matches(df1, df2):
    set_df1 = df_to_set(df1)
    set_df2 = df_to_set(df2)
    return len(set_df1.intersection(set_df2))


In [None]:
# Iterate over the keys and count exact matches and individual data frame rows
for key in keys_list:
    # Get the data frames for the current key
    df_GBC = GBC_dict[key]
    df_OL = OL_dict[key]

    # Count the number of rows that are exactly the same
    counts_same = count_exact_matches(df_GBC, df_OL)

    # Count the number of rows in each data frame
    counts_GBC = len(df_GBC)
    counts_OL = len(df_OL)

    # Draw the Venn diagram
    set_GBC = df_to_set(df_GBC)
    set_OL = df_to_set(df_OL)
    venn = venn2([set_GBC, set_OL], set_labels=('GBC', 'OL'))
    plt.title(f"GBC {key} vs. OL {key}")

    # Save the Venn diagram as PNG with a unique filename
    filename = f"{key}_common_mut.png"
    plt.savefig(filename)

    plt.show()

### Test on single sample

In [None]:
GBC_dict['RADS36_final'].shape

In [None]:
OL_dict['RADS36_final'].shape

In [None]:
set_df1 = set(tuple(x) for x in GBC_dict['RADS36_final'].to_records(index=False))
set_df2 = set(tuple(x) for x in OL_dict['RADS36_final'].to_records(index=False))
len(set_df1.intersection(set_df2))

In [None]:
def print_common_rows(df1, df2):
    # Convert data frames to sets of tuples (rows)
    set_df1 = set(tuple(x) for x in df1.to_records(index=False))
    set_df2 = set(tuple(x) for x in df2.to_records(index=False))
    
    # Find the common rows
    common_rows = set_df1.intersection(set_df2)
    
    # Print the common rows without headers
    print("Common Rows:")
    for row in common_rows:
        print(row)

In [None]:
print_common_rows(GBC_dict['RADS36_final'], OL_dict['RADS36_final'])

## Checking if the common mutations are the driver genes identified by IntOGen 
### Will need to run IntOGen beforhand and get the driver genes

In [4]:
# Define the drivers interval ranges for each condition
interval_drivers = {
    'chr2': (201266487, 201286594),
    'chr4': (186588592, 186709827),
    'chr9': (136496071, 136545786),
    'chr10': (92052212, 92240350),
    'chr4': (152322881, 152382335),
    'chr6': (31354483, 31357158),
    'chr11': (532636, 534322),
    'chr3': (179198826, 179234364),
    'chr17': (7673219, 7676251),
}

In [5]:
set_df1 = set(tuple(x) for x in GBC_dict['RADS36_final'].to_records(index=False))
set_df2 = set(tuple(x) for x in OL_dict['RADS36_final'].to_records(index=False))

In [6]:
# Function to check if a row satisfies any of the criteria
def satisfies_any_criteria(row):
    chrom, pos = row[0], row[1]
    if chrom in interval_drivers:
        start_interval, end_interval = interval_drivers[chrom]
        return start_interval <= pos <= end_interval
    return False

In [7]:
# Define a function to get the rows that are exactly the same between two data frames
def get_exact_matches(df1, df2):
    set_df1 = df_to_set(df1)
    set_df2 = df_to_set(df2)
    return set_df1.intersection(set_df2)

In [8]:
# Iterate over the keys and count exact matches and individual data frame rows
for key in keys_list:
    # Get the data frames for the current key
    df_GBC = GBC_dict[key]
    df_OL = OL_dict[key]
    common_mut = get_exact_matches(df_GBC, df_OL)
    for row in common_mut:
        if satisfies_any_criteria(row):
            print("Common mutations that are also driver genes:",key, row)
    

Common mutations that are also driver genes: RADS10_final ('chr2', 201285307, '.', 'C', 'T', '.', 'PASS', '.', '.')
Common mutations that are also driver genes: RADS37_final ('chr2', 201276864, '.', 'G', 'A', '.', 'PASS', '.', '.')
Common mutations that are also driver genes: RADS37_final ('chr3', 179234297, '.', 'A', 'G', '.', 'PASS', '.', '.')
Common mutations that are also driver genes: RADS23_final ('chr2', 201276864, '.', 'G', 'A', '.', 'PASS', '.', '.')
Common mutations that are also driver genes: RADS23_final ('chr11', 533873, '.', 'C', 'G', '.', 'PASS', '.', '.')
Common mutations that are also driver genes: RADS23_final ('chr17', 7673779, '.', 'C', 'G', '.', 'PASS', '.', '.')
Common mutations that are also driver genes: RADS13_final ('chr2', 201285237, '.', 'CTG', 'C', '.', 'PASS', '.', '.')
Common mutations that are also driver genes: RADS4_final ('chr2', 201272722, '.', 'C', 'T', '.', 'PASS', '.', '.')
Common mutations that are also driver genes: RADS3_final ('chr2', 20128648