In [1]:
import json
import numpy as np
import networkx as nx

# --- Graph description (parsed JSON) ---
with open('ppi-G.json', 'r') as graph_file:
    graph_data = json.load(graph_file)

# --- Node features (NumPy array stored on disk) ---
node_features = np.load('ppi-feats.npy')

# --- ID mapping (parsed JSON) ---
with open('ppi-id_map.json', 'r') as id_map_file:
    id_map = json.load(id_map_file)

# --- Optional inputs: class labels and random walks ---
with open('ppi-class_map.json', 'r') as class_map_file:
    class_map = json.load(class_map_file)

# One walk per line of the text file; each walk is stored as the list of
# whitespace-separated tokens found on that line.
with open('ppi-walks.txt', 'r') as walks_file:
    walks = [raw_line.strip().split() for raw_line in walks_file]


In [2]:
# Start from an empty undirected graph (assumption: the edges carry no direction).
G = nx.Graph()

# Every entry of graph_data['links'] contributes one edge between its
# 'source' and 'target' endpoints; endpoints are added as nodes implicitly.
G.add_edges_from(
    (link['source'], link['target']) for link in graph_data['links']
)

# Node features, weights, or other attributes are not attached here; add them if needed.


In [3]:
from gensim.models import Word2Vec

# Learn embeddings: every random walk is fed to Word2Vec as one "sentence",
# so each distinct token in the walks receives a 64-dimensional vector.
model = Word2Vec(
    sentences=walks,
    vector_size=64,  # embedding dimensionality
    window=5,        # context window size within a walk
    min_count=0,     # no frequency cut-off: keep every token
    sg=1,            # skip-gram training
    workers=4,
    epochs=1,
)


In [4]:
import numpy as np
import plotly.express as px
from sklearn.manifold import TSNE
import pandas as pd
import inspect

# 1. Extract embeddings and labels
# `labels` is the Word2Vec vocabulary (one key per embedded token) and
# `embeddings` stacks the corresponding vectors in the same order, so row i
# of `embeddings` belongs to labels[i].
labels = model.wv.index_to_key
embeddings = np.array([model.wv[label] for label in labels])

# 2. Reduce dimensions using t-SNE
# Newer scikit-learn releases renamed the `n_iter` argument to `max_iter`;
# pick whichever name the installed version accepts so the cell runs on both.
_tsne_iter_kwarg = (
    "max_iter" if "max_iter" in inspect.signature(TSNE).parameters else "n_iter"
)
# random_state is pinned so the 2-D layout (and therefore the plot) is
# reproducible across runs; without it t-SNE starts from a different random
# state every time.
tsne = TSNE(
    n_components=2,
    perplexity=30,
    random_state=42,
    **{_tsne_iter_kwarg: 300},
)
embeddings_2d = tsne.fit_transform(embeddings)

# 3. Plot using Plotly
# One point per embedded token; hovering shows the token's label.
df = pd.DataFrame(embeddings_2d, columns=["x", "y"])
df['label'] = labels

fig = px.scatter(
    df,
    x='x',
    y='y',
    hover_data=['label'],
    title="t-SNE projection of Word2Vec embeddings",
)
fig.show()

In [5]:
import mlflow
from mlflow import sklearn
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, hamming_loss
# Train a multi-output classifier on the embeddings and track it with MLflow.
# Assumes `class_map` maps str(label) to that label's target vector and that
# `labels` / `embeddings` are aligned row-for-row — TODO confirm against the
# cells that produce them.

# Prepare the data
X = embeddings
y = np.array([class_map[str(label)] for label in labels])  # Convert class_map to NumPy array

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

with mlflow.start_run() as run:
    # Create a multi-label classifier (Logistic Regression as base classifier)
    classifier = MultiOutputClassifier(LogisticRegression(random_state=42))

    # Train the classifier
    classifier.fit(X_train, y_train)

    # Log model
    mlflow.sklearn.log_model(classifier, "model")

    # Make predictions on the test data
    y_pred = classifier.predict(X_test)

    # Evaluate the model performance
    report = classification_report(y_test, y_pred, zero_division=1, output_dict=True)
    hamming = hamming_loss(y_test, y_pred)

    # Collect every metric first and send them in a single call instead of
    # issuing one tracking request per value.
    run_metrics = {"hamming_loss": hamming}
    for report_key, entry in report.items():
        if not isinstance(entry, dict):
            # Some report entries can be a bare number rather than a dict of
            # scores; log those under their own key instead of crashing on
            # `.items()`.
            if isinstance(entry, (float, int)):
                run_metrics[str(report_key)] = entry
            continue
        for metric_name, value in entry.items():
            if isinstance(value, (float, int)):  # Skip non-numeric values to prevent errors
                run_metrics[f"{report_key}_{metric_name}"] = value

    # Log metrics
    mlflow.log_metrics(run_metrics)
