# Code to load and analyze the UMAP coordinates Excel files

In [None]:
# Loading libs
import pandas as pd
import numpy as np 
import os
from collections import defaultdict
import seaborn as sns

In [None]:
#EPIC v1 and EPIC v2 Samples that contain the same DNA 
parings = {"205756360010_R01C01":"207097420059_R02C01",
           "205705530027_R06C01":"207097420059_R03C01",
           "205707890147_R04C01":"207097420059_R04C01",
           "205751550059_R06C01":"207097420059_R05C01",
           "205800610010_R01C01":"207097420059_R06C01",
           "205800610132_R07C01":"207097420059_R07C01",
           "205800610140_R04C01":"207097420059_R08C01",
           "205800610140_R08C01":"207107850059_R01C01",
           "205800610140_R07C01":"207107850059_R02C01",
           "205800600144_R04C01":"207107850059_R03C01",
           "205800600144_R01C01":"207107850059_R04C01",
           "205800600144_R03C01":"207107850059_R05C01",
           "205809360101_R05C01":"207107850059_R06C01",
           "205921770008_R02C01":"207107850059_R07C01",
           "205982890034_R02C01":"207107850059_R08C01",
           "205982890034_R07C01":"207107850085_R01C01"}

In [None]:
#List of EPIC v1 and EPIC v2 Samples 
listv1 = list(parings.keys())
print(listv1)
listv2 = list(parings.values())
print(listv2)

# Nearest reference Sample 

In [None]:
def annotate_NN_MethCLass(df, n_query=32):
    """Label each query sample with the class of its nearest reference sample.

    The last ``n_query`` rows of ``df`` are the query samples; every row
    before them is a reference sample. For each query sample the reference
    sample with the smallest Euclidean distance in the ("UMAP 0", "UMAP 1")
    plane is looked up and its "MethClass" is written to the query row's
    "MethText" cell.

    Args:
        df: DataFrame with "UMAP 0", "UMAP 1" and "MethClass" columns.
            Modified in place. Index labels are assumed to be unique.
        n_query: Number of trailing rows to annotate.

    Returns:
        The same ``df`` object, with "MethText" filled for the query rows.

    Raises:
        ValueError: If ``df`` has no reference rows left after setting aside
            ``n_query`` query rows.
    """
    coords = df[["UMAP 0", "UMAP 1"]].to_numpy()
    n_ref = len(coords) - n_query
    if n_ref <= 0:
        raise ValueError(
            f"need at least one reference row: got {len(coords)} rows "
            f"for n_query={n_query}"
        )

    reference = coords[:n_ref]
    # Classes are read by position so the lookup does not depend on the
    # DataFrame carrying a default 0..n-1 index.
    ref_classes = df["MethClass"].to_numpy()[:n_ref]

    # Pair each query point with its own index label; .at is label-based.
    for label, point in zip(df.index[n_ref:], coords[n_ref:]):
        distances = np.linalg.norm(reference - point, axis=1)
        df.at[label, "MethText"] = ref_classes[np.argmin(distances)]
    return df

# KNN or K nearest neighbor 

In [None]:
def Nearest_Neighbor2(df, k=15, n_query=32):
    """Label each query sample by majority vote among its k nearest references.

    The last ``n_query`` rows of ``df`` are the query samples; every row
    before them is a reference sample. For each query sample the ``k``
    reference samples closest in the ("UMAP 0", "UMAP 1") plane vote with
    their "MethClass"; the most frequent class is written to the query row's
    "MethText" cell.

    If several classes share the highest vote count, the tie is broken in
    favour of the class whose reference samples have the smallest mean
    distance to the query point. If that is tied as well, the class that
    appears first among the neighbours (nearest first) wins.

    Args:
        df: DataFrame with "UMAP 0", "UMAP 1" and "MethClass" columns.
            Modified in place. Index labels are assumed to be unique.
        k: Number of nearest reference samples that vote.
        n_query: Number of trailing rows to annotate.

    Returns:
        The same ``df`` object, with "MethText" filled for the query rows.

    Raises:
        ValueError: If ``df`` has no reference rows left after setting aside
            ``n_query`` query rows, or if no neighbour votes (``k`` < 1).
    """
    coords = df[["UMAP 0", "UMAP 1"]].to_numpy()
    n_ref = len(coords) - n_query
    if n_ref <= 0:
        raise ValueError(
            f"need at least one reference row: got {len(coords)} rows "
            f"for n_query={n_query}"
        )

    reference = coords[:n_ref]
    ref_classes = df["MethClass"].to_numpy()[:n_ref]

    for label, point in zip(df.index[n_ref:], coords[n_ref:]):
        distances = np.linalg.norm(reference - point, axis=1)
        # Stable sort keeps equidistant neighbours in row order, so the
        # result is reproducible.
        neighbours = np.argsort(distances, kind="stable")[:k]

        votes = defaultdict(int)
        for meth_class in ref_classes[neighbours]:
            votes[meth_class] += 1
        top_count = max(votes.values())
        tied = [c for c, count in votes.items() if count == top_count]

        if len(tied) == 1:
            winner = tied[0]
        else:
            # The tie-break must depend on the query point, so it uses the
            # distances to this point rather than the classes' coordinates.
            mean_distance = {
                c: distances[ref_classes == c].mean() for c in tied
            }
            winner = min(tied, key=mean_distance.get)
        df.at[label, "MethText"] = winner
    return df

# Nearest Cluster 

In [None]:
def get_tumor_centroids(df):
    """Compute the mean UMAP position of every methylation class in ``df``.

    Args:
        df: DataFrame with "UMAP 0", "UMAP 1" and "MethClass" columns.

    Returns:
        A dict mapping each distinct "MethClass" value (in order of first
        appearance) to a length-2 array holding the column-wise mean of
        "UMAP 0" and "UMAP 1" over the rows of that class.
    """
    labels = df["MethClass"]
    return {
        label: df.loc[labels == label, ["UMAP 0", "UMAP 1"]].to_numpy().mean(axis=0)
        for label in labels.unique()
    }

In [None]:
def annotate_cluster_MethCLass(df, n_query=32):
    """Label each query sample with the class of the nearest class centroid.

    The last ``n_query`` rows of ``df`` are the query samples; every row
    before them is a reference sample. Class centroids are computed with
    ``get_tumor_centroids`` from the reference rows only, so the samples
    being classified do not influence the centroids they are compared to.
    Each query row's "MethText" cell receives the class whose centroid is
    closest in the ("UMAP 0", "UMAP 1") plane.

    Args:
        df: DataFrame with "UMAP 0", "UMAP 1" and "MethClass" columns.
            Modified in place. Index labels are assumed to be unique.
        n_query: Number of trailing rows to annotate.

    Returns:
        The same ``df`` object, with "MethText" filled for the query rows.

    Raises:
        ValueError: If ``df`` has no reference rows left after setting aside
            ``n_query`` query rows.
    """
    n_ref = len(df) - n_query
    if n_ref <= 0:
        raise ValueError(
            f"need at least one reference row: got {len(df)} rows "
            f"for n_query={n_query}"
        )

    centroids = get_tumor_centroids(df.iloc[:n_ref])
    # Build the lookup structures once; they do not change per query point.
    class_names = list(centroids)
    centroid_coords = np.array(list(centroids.values()))

    query_coords = df[["UMAP 0", "UMAP 1"]].to_numpy()[n_ref:]
    # Pair each query point with its own index label; .at is label-based.
    for label, point in zip(df.index[n_ref:], query_coords):
        distances = np.linalg.norm(centroid_coords - point, axis=1)
        df.at[label, "MethText"] = class_names[np.argmin(distances)]
    return df

# Min Max Distance 


In [None]:
def get_min_max_dist(df, n_query=32):
    """Label each query sample with the class whose farthest member is nearest.

    The last ``n_query`` rows of ``df`` are the query samples; every row
    before them is a reference sample. For each query sample and each
    reference class, the largest Euclidean distance (in the "UMAP 0" /
    "UMAP 1" plane) from the query to any sample of that class is computed.
    The class with the smallest such maximum is written to the query row's
    "MethText" cell. Ties go to the class that appears first in the
    reference rows.

    Args:
        df: DataFrame with "UMAP 0", "UMAP 1" and "MethClass" columns.
            Modified in place. Index labels are assumed to be unique.
        n_query: Number of trailing rows to annotate.

    Returns:
        The same ``df`` object, with "MethText" filled for the query rows.

    Raises:
        ValueError: If ``df`` has no reference rows left after setting aside
            ``n_query`` query rows, or if no reference row has a class.
    """
    coords = df[["UMAP 0", "UMAP 1"]].to_numpy()
    n_ref = len(coords) - n_query
    if n_ref <= 0:
        raise ValueError(
            f"need at least one reference row: got {len(coords)} rows "
            f"for n_query={n_query}"
        )

    reference = coords[:n_ref]
    ref_classes = df["MethClass"].iloc[:n_ref]

    # Row positions (not index labels) of every class inside `reference`,
    # because they are used to index the positional `distances` array.
    # Unlabeled reference rows are skipped: they belong to no class.
    members = {
        meth_class: np.flatnonzero((ref_classes == meth_class).to_numpy())
        for meth_class in ref_classes.dropna().unique()
    }

    for label, point in zip(df.index[n_ref:], coords[n_ref:]):
        distances = np.linalg.norm(reference - point, axis=1)
        farthest = {
            meth_class: distances[rows].max()
            for meth_class, rows in members.items()
        }
        df.at[label, "MethText"] = min(farthest, key=farthest.get)
    return df



# Remove unneeded columns and compare annotations/classifications


In [None]:
def split_df(df):
    """Split the trailing 32 rows of ``df`` into two consecutive 16-row parts.

    Args:
        df: DataFrame whose last 32 rows are to be split.

    Returns:
        A tuple ``(first, second)``: ``first`` holds the rows at positions
        -32 to -17, ``second`` the last 16 rows.
    """
    rows_per_part = 16
    first_part = slice(-2 * rows_per_part, -rows_per_part)
    second_part = slice(-rows_per_part, None)
    return df.iloc[first_part], df.iloc[second_part]

In [None]:
def order_df(df1, df2, list1, list2):
    """Sort both frames in place so "SentrixID" follows a given order.

    Each row is ranked by the position of its "SentrixID" in the matching
    list (``list1`` for ``df1``, ``list2`` for ``df2``).

    Args:
        df1: DataFrame with a "SentrixID" column; sorted in place.
        df2: DataFrame with a "SentrixID" column; sorted in place.
        list1: Desired order of the IDs in ``df1``.
        list2: Desired order of the IDs in ``df2``.

    Returns:
        The tuple ``(df1, df2)`` (the same objects that were passed in).

    Raises:
        ValueError: If a "SentrixID" is missing from its order list.
    """
    for frame, wanted_order in ((df1, list1), (df2, list2)):
        frame.sort_values(
            by="SentrixID",
            # Bind the list now so each frame is ranked against its own order.
            key=lambda ids, order=wanted_order: ids.map(order.index),
            inplace=True,
        )
    return df1, df2

In [None]:
def combine_mmeth_text(dfv1, dfv2):
    """Attach the v2 IDs and annotations to the v1 frame, row by row.

    Both frames get a fresh 0..n-1 index (in place), so that row ``i`` of
    ``dfv2`` lines up with row ``i`` of ``dfv1``. The "SentrixID" and
    "MethText" columns of ``dfv2`` are then added to ``dfv1`` as
    "SentrixID_V2" and "MethText_V2".

    Args:
        dfv1: DataFrame to extend; modified in place.
        dfv2: DataFrame providing "SentrixID" and "MethText"; its index is
            reset in place.

    Returns:
        ``dfv1`` with the two additional columns.
    """
    for frame in (dfv1, dfv2):
        frame.reset_index(drop=True, inplace=True)

    v2_columns = {"SentrixID": "SentrixID_V2", "MethText": "MethText_V2"}
    for source, target in v2_columns.items():
        dfv1[target] = dfv2[source]
    return dfv1

In [None]:
def compare(df):
    """Flag the rows where the v1 and v2 annotations agree.

    Adds a "Comaprison" column to ``df`` (in place) that is 1 where
    "MethText" equals "MethText_V2" and 0 otherwise.

    Args:
        df: DataFrame with "MethText" and "MethText_V2" columns.

    Returns:
        The same ``df`` object with the added column.
    """
    agrees = df["MethText"].eq(df["MethText_V2"])
    df["Comaprison"] = agrees.astype(int)
    return df

In [None]:
def strip_split_order_recombine_compare(df):
    """Run the full v1-vs-v2 comparison pipeline on an annotated frame.

    Steps: ``strip_cols`` -> ``split_df`` -> ``order_df`` (using the
    module-level ``listv1`` / ``listv2`` orders) -> ``combine_mmeth_text``
    -> ``compare``.

    NOTE(review): ``strip_cols`` is not defined in the part of the file
    visible here; presumably it drops unneeded columns -- confirm.

    Args:
        df: Annotated DataFrame as returned by one of the annotators.

    Returns:
        The DataFrame returned by ``compare``.
    """
    stripped = strip_cols(df)
    v1_rows, v2_rows = split_df(stripped)
    v1_sorted, v2_sorted = order_df(v1_rows, v2_rows, listv1, listv2)
    return compare(combine_mmeth_text(v1_sorted, v2_sorted))

# Loading and analyzing all files 

In [None]:
def _analyze_all_files(path, annotate):
    """Annotate every Excel file in ``path`` and count v1/v2 agreements.

    For each ``.xlsx`` file: load it, drop the "Unnamed: 0" column, annotate
    it with ``annotate``, record the "MethText" of the last 32 rows, and run
    ``strip_split_order_recombine_compare`` to count how many pairs received
    the same annotation. The list of counts, their median and their standard
    deviation are printed.

    Args:
        path: Directory containing the Excel files.
        annotate: Callable taking the loaded DataFrame and returning it with
            a "MethText" column filled for the query rows.

    Returns:
        A list ``[correct_ids, median, std, classifications]``:
        ``correct_ids`` holds one agreement count per file, ``median`` and
        ``std`` summarize those counts, and ``classifications`` is a
        DataFrame with a "SentrixID" column plus one "MethText" column per
        file (named after the file).
    """
    # os.listdir returns names in arbitrary order; sort so that the order of
    # the counts and of the result columns is reproducible. Names starting
    # with "~$" are Excel lock files of open workbooks, not readable data.
    excel_files = sorted(
        name
        for name in os.listdir(path)
        if name.endswith('.xlsx') and not name.startswith('~$')
    )

    correct_ids = []
    classifications = pd.DataFrame()
    for file in excel_files:
        data = pd.read_excel(os.path.join(path, file))
        data = data.drop(["Unnamed: 0"], axis=1)
        data = annotate(data)
        # Capture the annotations before the comparison step reshapes them.
        query_rows = data.iloc[-32:]
        classifications["SentrixID"] = query_rows["SentrixID"]
        classifications[file] = query_rows["MethText"]
        data = strip_split_order_recombine_compare(data)
        correct_ids.append(data["Comaprison"].sum())

    median = np.median(correct_ids)
    std = np.std(correct_ids)
    print(correct_ids)
    print(f"The Median is : {median}")
    print(f"The Std is : {std}")
    return [correct_ids, median, std, classifications]


def analyze_all_files_nearest_ref_sample(path):
    """Analyze all Excel files in ``path`` using ``annotate_NN_MethCLass``.

    Returns ``[correct_ids, median, std, classifications]``.
    """
    return _analyze_all_files(path, annotate_NN_MethCLass)


def analyze_all_files_nearest_cluster(path):
    """Analyze all Excel files in ``path`` using ``annotate_cluster_MethCLass``.

    Returns ``[correct_ids, median, std, classifications]``.
    """
    return _analyze_all_files(path, annotate_cluster_MethCLass)


def analyze_all_files_KNN2(path):
    """Analyze all Excel files in ``path`` using ``Nearest_Neighbor2``.

    Returns ``[correct_ids, median, std, classifications]``.
    """
    return _analyze_all_files(path, Nearest_Neighbor2)


def analyze_all_files_min_max_dist(path):
    """Analyze all Excel files in ``path`` using ``get_min_max_dist``.

    Returns ``[correct_ids, median, std, classifications]``.
    """
    return _analyze_all_files(path, get_min_max_dist)
