In [13]:
import pandas as pd
import os
PATH = "/Users/tgoel/Downloads/Classes/GENOME/GENOME541/hw6/"
# os.listdir(PATH) --> ['yeast_chrom7_counts.txt', 'spiral_counts.txt', 'spiral_true_points.txt']

def read_data(filename):
    df = pd.read_csv(os.path.join(PATH + filename), sep='\t')
    df["distance"] = 10**3 * df["counts"].astype("float16")**-3 # convert the Hi-C counts to distances using the conversion relationship
    df.drop(columns=["counts"], inplace=True)
    bead_ids = pd.concat([df["bead1"], df["bead2"]]).unique()
    return df, bead_ids

data, ids = read_data('yeast_chrom7_counts.txt') # includes only pairwise distances from significant Hi-C counts (FDR 1%)
data.head()

Unnamed: 0,bead1,bead2,distance
0,0,2,0.000536
1,0,3,0.000119
2,0,4,0.008523
3,0,5,0.000834
4,0,6,0.013471


In [14]:
import numpy as np
DIM3D = True

class Bead():

    def __init__(self, id, x, y, z):
        self.id = id
        self.x = x
        self.y = y
        self.z = z
        self.history = [(x, y, z)]

    def distance(self, bead):
        if DIM3D:
            return np.sqrt((self.x - bead.x)**2 + (self.y - bead.y)**2 + (self.z - bead.z)**2)
        return np.sqrt((self.x - bead.x)**2 + (self.y - bead.y)**2)
        
    def update(self, x_step, y_step, z_step):
        self.x += x_step
        self.y += y_step
        self.z += z_step
        self.history.append((self.x, self.y, self.z))

In [15]:
def place_beads(bead_ids):
    n_beads = len(bead_ids)
    xs = np.random.rand(n_beads)
    ys = np.random.rand(n_beads)
    zs = np.random.rand(n_beads)
    beads = [Bead(id, x, y, z) for id, x, y, z in zip(bead_ids, xs, ys, zs)]
    return dict(zip(ids, beads)) # {bead_id: Bead object}

beads = place_beads(ids)

In [16]:
import matplotlib.pyplot as plt
import plotly.express as px

def plot_beads(beads):
    x, y, z = [bead.x for bead in beads], [bead.y for bead in beads], [bead.z for bead in beads]
    if DIM3D == True:
        px.scatter_3d(x=x, y=y, z=z, opacity=0.5).show()
    else:
        plt.scatter(x, y)

plot_beads(beads.values())

Multidimensional Scaling (MDS) Objective Function:

$f(\boldsymbol{x}) = \sum_{i,j \in D}(\delta_{ij} - ||\boldsymbol{x}_i - \boldsymbol{x}_j||_2)^2$

$||\boldsymbol{x}_i - \boldsymbol{x}j||_2 = \sqrt{\sum_{k=1}^{K} (x_{ik} - x_{jk})^2}$

$\boldsymbol{x}_{ik}$ = inferred position of bead i in the kth dimension

$K$ = number of dimensions

$\delta_{ij}$ = observed distance between beads i and j

$D$ = set of pairs of beads with observed distances

$\frac{\partial f(\boldsymbol{x})}{\partial x_{ik}} =$ ?

convex?

WORK:

$(||\boldsymbol{x}_i - \boldsymbol{x}_j||_2)^2 = \sum_{k=1}^{K}(x_{ik} - x_{jk})^2$

differentiate objective function with respect to $x_{ik}$:

$\frac{\partial f(\boldsymbol{x})}{\partial x_{ik}} = \frac{\partial}{\partial x_{ik}}\sum_{i,j \in D}(\delta_{ij} - ||\boldsymbol{x}_i - \boldsymbol{x}_j||_2)^2$

chain rule --> differentiate squared norm term:

$\frac{\partial}{\partial x_{ik}} (||\boldsymbol{x}_i - \boldsymbol{x}_j||_2)^2 = \frac{\partial}{\partial x_{ik}}\sum_{l=1}^{K}(x_{il} - x_{jl})^2$

$= 2(x_{ik} - x_{jk})$

differentiate squared difference between observed and inferred distances:

$\frac{\partial}{\partial x_{ik}} (\delta_{ij} - ||\boldsymbol{x}_i - \boldsymbol{x}_j||_2)^2 = -2(\delta_{ij} - ||\boldsymbol{x}_i - \boldsymbol{x}j||_2)(x_{ik} - x_{jk})$

final answer:

$\frac{\partial f(\boldsymbol{x})}{\partial x_{ik}} = -2\sum_{i, j \in D}(\delta_{ij} - ||\boldsymbol{x}_i - \boldsymbol{x}_j||_2)(x_{ik} - x_{jk})$

In [17]:
def step(lr=0.001):
    
    # calculate the pairwise distances between each bead
    data["obs_dist"] = data.apply(lambda row: beads[row["bead1"]].distance(beads[row["bead2"]]), axis=1)

    # calculate the MDS objective function
    diff = data["obs_dist"].values - data["distance"].values
    obj = np.sum(diff ** 2)

    # calculate the partial derivative of the MDS objective function with respect to the coordinates of each bead
    data["dLdx"] = data.apply(lambda row: -2 * np.sum(row["obs_dist"] - row["distance"]) * (beads[row["bead1"]].x - beads[row["bead2"]].x), axis=1)
    data["dLdy"] = data.apply(lambda row: -2 * np.sum(row["obs_dist"] - row["distance"]) * (beads[row["bead1"]].y - beads[row["bead2"]].y), axis=1)
    data["dLdz"] = data.apply(lambda row: -2 * np.sum(row["obs_dist"] - row["distance"]) * (beads[row["bead1"]].z - beads[row["bead2"]].z), axis=1)

    # update the coordinates of each bead using gradient descent
    for idx in range(len(data)):
        row = data.loc[idx]
        beads[row["bead1"]].update(lr * row["dLdx"], lr * row["dLdy"], lr * row["dLdz"])
        beads[row["bead2"]].update(-lr * row["dLdx"], -lr * row["dLdy"], -lr * row["dLdz"])

    return obj

In [18]:
hist = []
while True:
    hist.append(step())
    print("Loss:", hist[-1])
    if len(hist) > 1 and hist[-2] - hist[-1] < 0.001:
        break

Loss: 6270.519270945546
Loss: 6151.7466325895075
Loss: 6039.277077007024
Loss: 5933.832386238435
Loss: 5835.805049650286
Loss: 5745.275893527728
Loss: 5662.099323339211
Loss: 5585.967465009085
Loss: 5516.455311487933
Loss: 5453.08630160071
Loss: 5395.350647765596
Loss: 5342.678475064691
Loss: 5294.448434851982
Loss: 5250.009829698242
Loss: 5208.703521479977
Loss: 5169.883777751884
Loss: 5132.9299501991045
Loss: 5097.237220381728
Loss: 5062.273202123467
Loss: 5027.65112773976
Loss: 4993.043444188322
Loss: 4958.113137989444
Loss: 4922.509799237077
Loss: 4885.870259527578
Loss: 4847.809621544287
Loss: 4807.8760199287435
Loss: 4765.534312810097
Loss: 4720.409376447846
Loss: 4672.096531767964
Loss: 4620.226503363163
Loss: 4564.502927533371
Loss: 4504.669883921821
Loss: 4440.475857864311
Loss: 4371.655088144555
Loss: 4298.20946041778
Loss: 4220.416088934293
Loss: 4138.738935981999
Loss: 4053.9210528416784
Loss: 3966.89765588481
Loss: 3878.696923040882
Loss: 3790.4074293378308
Loss: 3703.1298

In [20]:
plot_beads(beads.values())