# MEFISTO Model Building

### This script performs a MEFISTO (Method for the Functional Integration of Spatial and Temporal Omics data) analysis using metal abundance and spatial transcriptomics data. The script includes data preprocessing, spatial edge filtering, and training of the MEFISTO model.

## Model Building

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from mofapy2.run.entry_point import entry_point
import mofapy2
import argparse
import h5py
import alphashape
from shapely.geometry import MultiPoint, Point
import geopandas as gpd
import libpysal
from sklearn.metrics import pairwise_distances

### Load and preprocess metal abundance data

In [None]:
# Read the pickled spot records, build a DataFrame from them and discard the
# stored "index" column.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only run this on a trusted input file.
with open("./grouped_ST_metal_spots_records_edge_filtered.pkl", "rb") as records_file:
    spot_records = pickle.load(records_file)
metal_abundances = pd.DataFrame.from_records(spot_records).drop(columns=["index"])

### Compute spatial distances

In [None]:
# Fix NumPy's global random seed.
np.random.seed(42)

# Multiplier applied to the spot spacing when the edge band is built in the
# alpha-shape filtering step.
n_neighbors_drop = 2

# All pairwise distances between spot coordinates, flattened into one Series.
# Zero entries are included here and are masked out by later code via `s > 0`.
spot_positions = metal_abundances[["ST_x", "ST_y"]]
distance_matrix = pairwise_distances(spot_positions)
s = pd.Series(distance_matrix.flatten())

### Apply Alpha Shape filtering to remove edge effects

In [None]:
# Length scale used for both the alpha parameter and the edge buffer: the
# smallest non-zero pairwise distance multiplied by sqrt(2).
# NOTE(review): presumably the diagonal spacing of a square spot grid — confirm.
edge_unit = s[s > 0].min() * np.sqrt(2)

# Spot coordinates as a plain (n_spots, 2) array.
# Fix: the original called `.values` on the column-name list
# (`["ST_x", "ST_y"].values`), which raises AttributeError; `.values` has to be
# applied to the selected DataFrame columns instead.
spot_xy = metal_abundances[["ST_x", "ST_y"]].values

# Outline of the spot cloud; alpha is the reciprocal of the length scale.
alpha_shape = alphashape.alphashape(spot_xy, 1 / edge_unit)

# One shapely Point per spot, in the same row order as `metal_abundances`.
points = gpd.GeoSeries([Point(*xy) for xy in spot_xy.tolist()])

# Band around the outline's exterior ring, `n_neighbors_drop` length units wide
# on each side; every spot touching it is treated as an edge spot.
edge_band = alpha_shape.exterior.buffer(edge_unit * n_neighbors_drop)

# `.values` turns the mask into a positional boolean array so that it is not
# aligned against the DataFrame's own index labels.
metal_abundances = metal_abundances[(~points.intersects(edge_band)).values]

### Load spatial transcriptomics data and remap metal abundances

In [None]:
# Load the AnnData file that carries the metal table in `.obsm["metals"]`.
adata_st_metals = sc.read_h5ad(filename="metal_ST.h5ad")

# Per-observation metal table stored in the AnnData object.
metal_abundances_v2 = adata_st_metals.obsm["metals"]

# Coordinate columns only, re-indexed 0..n-1 so that row positions can be used
# as positional indices into `adata_st_metals`.
metal_coords_v2 = metal_abundances_v2.loc[:, ["ST_x", "ST_y"]].reset_index(drop=True)

### Remap indices

In [None]:
# Distances between every spot in the AnnData metal table (rows) and every
# spot that survived the edge filtering (columns).
retained_xy = metal_abundances[["ST_x", "ST_y"]]
cross_distances = pairwise_distances(metal_coords_v2, retained_xy)

# Row positions (into `adata_st_metals`) of all exactly-zero-distance pairs.
# NOTE(review): this relies on exact float equality with 0 — confirm that the
# coordinates in both tables are bit-identical for matching spots.
matched_rows, _ = np.where(cross_distances == 0)
mapping = matched_rows

# Subset the AnnData object to the matched observations and persist it.
adata_st_metals_remapped = adata_st_metals[mapping]
adata_st_metals_remapped.write_h5ad(filename="metal_ST_remapped_mefisto.h5ad")

### Prepare data for MEFISTO

In [None]:
# Previously assembled table (columns used below: view, feature, sample, value);
# its existing "metals" rows are replaced by the remapped ones built here.
data = pd.read_pickle("MEFISTO_model_all_data.pkl")
non_metal_rows = data[data["view"] != "metals"]

# Wide metal table of the remapped observations with a fresh 0..n-1 index.
metals_wide = adata_st_metals_remapped.obsm["metals"].reset_index(drop=True)

# Synthetic sample identifiers "sample0", "sample1", ... in row order.
sample_labels = [f"sample{i}" for i in range(len(metals_wide))]

# Keep the label-based column slice "Ba138".."As75", attach the sample
# identifiers, reshape to one row per (sample, column) and tag the view.
metal_view = (
    metals_wide.loc[:, "Ba138":"As75"]
    .assign(sample=sample_labels)
    .melt(id_vars=["sample"])
    .assign(view="metals")
)
metal_view.columns = ["sample", "feature", "value", "view"]

# Combine with the other views, force numeric values and keep the first row of
# every (view, feature, sample) combination.
data = pd.concat([non_metal_rows, metal_view])
data["value"] = data["value"].astype(float)
data = data.drop_duplicates(subset=["view", "feature", "sample"])

### Log transform spatial and single-cell data

In [None]:
# Covariate table that is later passed to `ent.set_covariates`.
transposed_cov = pd.read_pickle("./metal_coord_covariate_2.pkl")

# log(1 + x) transform of the values in the "ST" and "SC" views, one view at a
# time; rows of every other view are left untouched.
for view_name in ("ST", "SC"):
    in_view = data["view"] == view_name
    data.loc[in_view, "value"] = np.log1p(data.loc[in_view, "value"])

### Initialize and configure MEFISTO model

In [None]:
# Create the mofapy2 entry-point object; all later configuration, build, run
# and save calls in this script are made on it.
ent = entry_point()

In [None]:
# Register the data table with three Gaussian likelihoods
# (presumably one per view: ST, SC and metals — confirm against `data["view"]`).
# NOTE(review): the call order below is kept exactly as written; confirm
# against the mofapy2 documentation that data options set after the data are
# still honoured.
view_likelihoods = ["gaussian"] * 3
ent.set_data_df(data, likelihoods=view_likelihoods)

# Covariates, named as x and y coordinates.
covariate_labels = ["x_coordinate", "y_coordinate"]
ent.set_covariates(transposed_cov, covariates_names=covariate_labels)

# Data handling: no per-view scaling, single-precision floats.
data_options = {"scale_views": False, "use_float32": True}
ent.set_data_options(**data_options)

# Model: 20 factors with spike-and-slab and ARD priors on the weights.
model_options = {"factors": 20, "spikeslab_weights": True, "ard_weights": True}
ent.set_model_options(**model_options)

# Training: fast convergence mode, GPU enabled, fixed seed.
train_options = {
    "convergence_mode": "fast",
    "dropR2": 0.001,
    "gpu_mode": True,
    "seed": 1,
    "save_interrupted": True,
}
ent.set_train_options(**train_options)

### Set smoothing options and build the model

In [None]:
# Target number of inducing points for the sparse Gaussian process.
n_inducing = 1000

# The option takes a fraction, so the target count is divided by 3984.
# NOTE(review): 3984 is presumably the total number of samples in `data` —
# confirm, and update it if the edge filtering or input data change.
inducing_fraction = n_inducing / 3984

ent.set_smooth_options(
    sparseGP=True,
    frac_inducing=inducing_fraction,
    start_opt=10,
    opt_freq=10,
)

# Assemble the model from everything configured above.
ent.build()

### Run the MEFISTO model

In [None]:
# Train the model that was assembled by `ent.build()`, using the training
# options set earlier (GPU mode, fast convergence mode, seed 1).
ent.run()

In [None]:
# Save the trained model to an HDF5 file in the current working directory.
ent.save("./MEFISTO_model_v4.hdf5")