In [None]:
import os
# Directory that holds the input files listed in the comment below.
# NOTE(review): absolute, machine-specific path — point it at your own copy of the data before running.
PATH = "/Users/tgoel/Downloads/Classes/GENOME/GENOME541/hw6/"

# os.listdir(PATH) --> ['yeast_chrom7_counts.txt', 'spiral_counts.txt', 'spiral_true_points.txt']

In [None]:
import pandas as pd

def read_data(filename):
    """Load a tab-separated pair-count file and convert counts to distances.

    The file is looked up inside the module-level ``PATH`` directory. Its three
    columns are renamed to ``bead1``, ``bead2`` and ``counts``; the first line
    of the file is consumed as a header by ``read_csv``.
    NOTE(review): if the files have no header line, the first data row is being
    dropped here — confirm against the data.

    Args:
        filename: name of the file inside ``PATH``.

    Returns:
        A ``(df, bead_ids)`` tuple: ``df`` has columns ``bead1``, ``bead2`` and
        ``distance`` (``1000 * counts**-3``, float64), and ``bead_ids`` is the
        array of unique ids appearing in either bead column, in order of first
        appearance.
    """
    # join with a real separator so PATH works with or without a trailing slash
    df = pd.read_csv(os.path.join(PATH, filename), sep='\t')
    df.columns = ["bead1", "bead2", "counts"]
    # convert the Hi-C counts to distances using the conversion relationship.
    # float64 is required: in float16, counts**-3 underflows to 0 once counts
    # reaches a few hundred (and is badly rounded long before that), which
    # would silently turn strong contacts into zero target distances.
    df["distance"] = 10**3 * df["counts"].astype("float64")**-3
    df = df.drop(columns=["counts"])
    bead_ids = pd.concat([df["bead1"], df["bead2"]]).unique()
    return df, bead_ids

# Load the yeast dataset by name instead of os.listdir(PATH)[0]: listdir returns entries in
# arbitrary order (and would also pick up any stray file in the folder), so index 0 is not stable.
data, ids = read_data("yeast_chrom7_counts.txt") # includes only pairwise distances from significant Hi-C counts (FDR 1%)
data

In [None]:
import numpy as np
DIM3D = True  # dimensionality switch: True -> 3D, False -> 2D

class Bead():
    """A labelled point whose coordinates are logged every time it moves."""

    def __init__(self, id, x, y, z):
        """Store the identifier and starting coordinates and open the position log."""
        self.id = id
        self.x, self.y, self.z = x, y, z
        self.history = [(x, y, z)]

    def distance(self, bead):
        """Return the Euclidean distance to ``bead``.

        All three axes are used when ``DIM3D`` is truthy; otherwise only x and y.
        """
        dx = self.x - bead.x
        dy = self.y - bead.y
        squared = dx**2 + dy**2
        if DIM3D:
            dz = self.z - bead.z
            squared = squared + dz**2
        return np.sqrt(squared)

    def update(self, x_step, y_step, z_step):
        """Shift the bead by the given per-axis steps and log the new position."""
        self.x += x_step
        self.y += y_step
        self.z += z_step
        position = (self.x, self.y, self.z)
        self.history.append(position)

In [None]:
# place beads randomly in 3D space
def place_beads(bead_ids):
    """Create one ``Bead`` per id at a random position.

    Each coordinate is drawn independently with ``np.random.rand``, i.e.
    uniformly from [0, 1).

    Args:
        bead_ids: sequence of bead identifiers.

    Returns:
        dict mapping each bead id to its ``Bead`` object.
    """
    n_beads = len(bead_ids)
    xs = np.random.rand(n_beads)
    ys = np.random.rand(n_beads)
    zs = np.random.rand(n_beads)
    # key by the function's own argument; the original zipped against the
    # global `ids`, which only worked when the caller happened to pass that
    # exact object
    return {
        bead_id: Bead(bead_id, x, y, z)
        for bead_id, x, y, z in zip(bead_ids, xs, ys, zs)
    } # {bead_id: Bead object}

# Build the {bead_id: Bead} mapping with random starting coordinates.
# NOTE(review): no random seed is set in the visible cells, so the starting layout presumably differs between runs.
beads = place_beads(ids)

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px

def plot_beads(beads):
    """Scatter-plot bead positions.

    Draws an interactive plotly 3D scatter when ``DIM3D`` is truthy, otherwise
    a matplotlib 2D scatter of x against y.

    Args:
        beads: iterable of objects exposing ``x``, ``y`` and ``z`` attributes.
    """
    # materialize once: the coordinates are read in three passes, which would
    # leave y and z empty if a generator or other one-shot iterator were passed
    beads = list(beads)
    x = [bead.x for bead in beads]
    y = [bead.y for bead in beads]
    z = [bead.z for bead in beads]
    if DIM3D:
        px.scatter_3d(x=x, y=y, z=z, opacity=0.5).show()
    else:
        plt.scatter(x, y)

# Show the bead layout as it stands at this point of the notebook (top-to-bottom: before any optimization step).
plot_beads(beads.values())

Multidimensional Scaling (MDS) Objective Function:

$f(\boldsymbol{x}) = \sum_{i,j \in D}(\delta_{ij} - ||\boldsymbol{x}_i - \boldsymbol{x}_j||_2)^2$

$||\boldsymbol{x}_i - \boldsymbol{x}_j||_2 = \sqrt{\sum_{k=1}^{K} (x_{ik} - x_{jk})^2}$

$x_{ik}$ = inferred position of bead i in the kth dimension

$K$ = number of dimensions

$\delta_{ij}$ = observed distance between beads i and j

$D$ = set of pairs of beads with observed distances

$\frac{\partial f(\boldsymbol{x})}{\partial x_{ik}} =$ ?

WORK:

differentiate objective function with respect to $x_{ik}$:

$\frac{\partial f(\boldsymbol{x})}{\partial x_{ik}} = \frac{\partial}{\partial x_{ik}}\sum_{i,j \in D}(\delta_{ij} - ||\boldsymbol{x}_i - \boldsymbol{x}_j||_2)^2$

chain rule --> differentiate squared norm term:

$(||\boldsymbol{x}_i - \boldsymbol{x}_j||_2)^2 = \sum_{k=1}^{K}(x_{ik} - x_{jk})^2$

$\frac{\partial}{\partial x_{ik}} (||\boldsymbol{x}_i - \boldsymbol{x}_j||_2)^2 = \frac{\partial}{\partial x_{ik}}\sum_{l=1}^{K}(x_{il} - x_{jl})^2$

$= 2(x_{ik} - x_{jk})$

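derivative of the norm itself (apply the chain rule to the square root of the squared norm differentiated above):

$\frac{\partial}{\partial x_{ik}} ||\boldsymbol{x}_i - \boldsymbol{x}_j||_2 = \frac{1}{2||\boldsymbol{x}_i - \boldsymbol{x}_j||_2} \cdot 2(x_{ik} - x_{jk}) = \frac{x_{ik} - x_{jk}}{||\boldsymbol{x}_i - \boldsymbol{x}_j||_2}$

differentiate squared difference between observed and inferred distances:

$\frac{\partial}{\partial x_{ik}} (\delta_{ij} - ||\boldsymbol{x}_i - \boldsymbol{x}_j||_2)^2 = -2(\delta_{ij} - ||\boldsymbol{x}_i - \boldsymbol{x}_j||_2)\frac{x_{ik} - x_{jk}}{||\boldsymbol{x}_i - \boldsymbol{x}_j||_2}$

final answer (only the pairs that contain bead i contribute):

$\frac{\partial f(\boldsymbol{x})}{\partial x_{ik}} = -2\sum_{j : (i, j) \in D}(\delta_{ij} - ||\boldsymbol{x}_i - \boldsymbol{x}_j||_2)\frac{x_{ik} - x_{jk}}{||\boldsymbol{x}_i - \boldsymbol{x}_j||_2}$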

In [None]:
# optimize the multidimensional scaling objective function using gradient descent
def step(lr=0.001):
    """Run one gradient-descent iteration and return the pre-step loss.

    Reads the module-level ``data`` frame (columns ``bead1``, ``bead2`` and the
    target ``distance``) and the module-level ``beads`` mapping, and moves the
    beads in place.

    For a pair (i, j) with target distance delta and current distance d, the
    objective term is (delta - d)**2. Its gradient with respect to bead i is
    2 * (d - delta) * (x_i - x_j) / d, and bead j receives the negative. The
    division by d comes from differentiating the norm and was missing before.

    Each bead is moved once per call by its summed gradient, so ``history``
    gains exactly one entry per iteration for every bead that appears in a pair.

    Side effects on ``data`` (kept so per-pair values stay inspectable):
    ``obs_dist`` holds the *current* bead-to-bead distance (despite its name;
    the target lives in ``distance``), and ``dLdx``/``dLdy``/``dLdz`` hold the
    gradient of each pair's term with respect to ``bead1``.

    Args:
        lr: learning rate multiplying the gradient.

    Returns:
        The objective value evaluated before the beads were moved.
    """
    pairs = list(zip(data["bead1"], data["bead2"]))
    target = data["distance"].to_numpy(dtype=float)

    # calculate the current pairwise distance for every listed pair
    current = np.array([beads[a].distance(beads[b]) for a, b in pairs], dtype=float)

    # calculate the MDS objective function
    residual = current - target
    obj = np.sum(residual ** 2)

    # per-pair coordinate offsets x_bead1 - x_bead2, shape (n_pairs, 3)
    offsets = np.array(
        [(beads[a].x - beads[b].x, beads[a].y - beads[b].y, beads[a].z - beads[b].z)
         for a, b in pairs],
        dtype=float,
    ).reshape(-1, 3)
    if not DIM3D:
        # the 2D distance ignores z, so the objective has zero gradient along z
        offsets[:, 2] = 0.0

    # coincident beads have zero offsets, so any non-zero divisor yields the
    # intended zero gradient instead of 0/0 = NaN
    safe_dist = np.where(current > 0, current, 1.0)
    grad = 2.0 * (residual / safe_dist)[:, None] * offsets

    data["obs_dist"] = current
    data["dLdx"] = grad[:, 0]
    data["dLdy"] = grad[:, 1]
    data["dLdz"] = grad[:, 2]

    # sum the contributions of every pair a bead takes part in
    totals = {}
    for (a, b), pair_grad in zip(pairs, grad):
        totals[a] = totals.get(a, 0.0) + pair_grad
        totals[b] = totals.get(b, 0.0) - pair_grad

    # update the coordinates of each bead using gradient descent
    for bead_id, bead_grad in totals.items():
        beads[bead_id].update(-lr * bead_grad[0], -lr * bead_grad[1], -lr * bead_grad[2])

    return obj

In [None]:
# run gradient descent until convergence
TOL = 0.001  # smallest per-iteration loss decrease that still counts as progress
hist = []
while True:
    hist.append(step())
    print("Loss:", hist[-1])
    if not np.isfinite(hist[-1]):
        # a NaN/inf loss makes the comparison below False forever, which
        # previously left this loop running without end
        print("Stopping: loss is not finite")
        break
    if len(hist) > 1 and hist[-2] - hist[-1] < TOL:
        break

In [None]:
# Show the bead layout again; in top-to-bottom order this is the state after the gradient-descent loop.
plot_beads(beads.values())

In [None]:
# follow one randomly chosen bead through every position it has recorded
tracked_id = np.random.choice(list(beads))
point_hist = beads[tracked_id].history
trail_x = [position[0] for position in point_hist]
trail_y = [position[1] for position in point_hist]
trail_z = [position[2] for position in point_hist]
px.scatter_3d(x=trail_x, y=trail_y, z=trail_z, opacity=0.5).show()