### Bipartite Network

In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [4]:
df = pd.read_csv("C:/Users/Raphaela/Documents/MA_Studium/4_Semester/MA_Thesis/02_data/url_titles_domains.csv")

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5195 entries, 0 to 5194
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               5195 non-null   int64  
 1   url              5195 non-null   object 
 2   start_date       5195 non-null   object 
 3   end_date         5195 non-null   object 
 4   title            5195 non-null   object 
 5   domain           5195 non-null   object 
 6   pc1              5195 non-null   float64
 7   chat_idx         5195 non-null   int64  
 8   start_year       5195 non-null   int64  
 9   start_month      5195 non-null   int64  
 10  start_day        5195 non-null   int64  
 11  start_hour       5195 non-null   int64  
 12  title_embedding  5195 non-null   object 
dtypes: float64(1), int64(6), object(6)
memory usage: 527.7+ KB


In [9]:
pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
     ---------------------------------------- 0.0/63.1 kB ? eta -:--:--
     ---------------------------------------- 63.1/63.1 kB 1.7 MB/s eta 0:00:00
Collecting aiohttp (from torch_geometric)
  Downloading aiohttp-3.10.10-cp311-cp311-win_amd64.whl.metadata (7.8 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->torch_geometric)
  Downloading aiohappyeyeballs-2.4.3-py3-none-any.whl.metadata (6.1 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->torch_geometric)
  Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting frozenlist>=1.1.1 (from aiohttp->torch_geometric)
  Downloading frozenlist-1.5.0-cp311-cp311-win_amd64.whl.metadata (14 kB)
Collecting multidict<7.0,>=4.5 (from aiohttp->torch_geometric)
  Downloading multidict-6.1.0-cp311-cp311-win_amd64.whl.metadata (5.1 kB)
Collecting yarl<2.0,>=1.12.0 (from aiohttp->torch_geometric)
  Downloading yarl-1.16.0-cp311-cp311-


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: C:\Users\Raphaela\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [16]:
import torch
from torch_geometric.data import Data
from torch_geometric.utils import from_scipy_sparse_matrix
from scipy.sparse import coo_matrix
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from torch_geometric.data import DataLoader

In [13]:
# Step 1: Group by 'domain' and 'chat_idx'
grouped_data = df.groupby(['domain', 'chat_idx']).size().reset_index(name='count')

# Step 2: Create a mapping for domains and chat_idx to numerical indices
domain_mapping = {domain: i for i, domain in enumerate(grouped_data['domain'].unique())}
chat_mapping = {chat: i for i, chat in enumerate(grouped_data['chat_idx'].unique())}

# Step 3: Create the bipartite edge list
row = grouped_data['domain'].map(domain_mapping).values
col = grouped_data['chat_idx'].map(chat_mapping).values
data = grouped_data['count'].values

# Step 4: Create a sparse matrix for the bipartite graph
bipartite_matrix = coo_matrix((data, (row, col)), shape=(len(domain_mapping), len(chat_mapping)))

# Step 5: Convert the sparse matrix to a PyTorch Geometric data format
edge_index, edge_weight = from_scipy_sparse_matrix(bipartite_matrix)
# bipartite_graph = Data(edge_index=edge_index, edge_weight=edge_weight)

In [14]:
# Step 1: Prepare features
num_domains = len(domain_mapping)
num_chats = len(chat_mapping)

# Example: Initialize node features randomly for simplicity
domain_features = torch.rand(num_domains, 16)  # 16 is an arbitrary feature dimension
chat_features = torch.rand(num_chats, 16)

# Combine domain and chat features into a single tensor
node_features = torch.cat([domain_features, chat_features], dim=0)

# Step 2: Set up the target values (pc1) for domain nodes
pc1_values = torch.zeros(num_domains + num_chats)
pc1_values[:num_domains] = torch.tensor(df.groupby('domain')['pc1'].first().map(domain_mapping))

# Step 3: Create the PyTorch Geometric data object
bipartite_graph = Data(x=node_features, edge_index=edge_index, edge_weight=edge_weight, y=pc1_values)

  pc1_values[:num_domains] = torch.tensor(df.groupby('domain')['pc1'].first().map(domain_mapping))


In [17]:
# Step 4: Define the GNN model
class GCN(torch.nn.Module):
    def __init__(self):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(16, 32)
        self.conv2 = GCNConv(32, 1)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        # First GCN layer
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)

        # Second GCN layer
        x = self.conv2(x, edge_index)
        return x

In [18]:
# Step 5: Train the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()

bipartite_graph = bipartite_graph.to(device)

In [25]:
for epoch in range(200):
    model.train()
    optimizer.zero_grad()
    out = model(bipartite_graph)
    loss = criterion(out[bipartite_graph.train_mask], bipartite_graph.y[bipartite_graph.train_mask])
    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# Step 6: Evaluate the model
model.eval()
pred = model(bipartite_graph).detach().cpu().numpy()

KeyError: 0

In [24]:
# Assuming num_domains is the number of domain nodes
num_nodes = bipartite_graph.num_nodes
num_nodes
num_domains

552

In [32]:
assert not torch.isnan(bipartite_graph.x).any(), "Node features contain NaN"
assert not torch.isnan(bipartite_graph.y).any(), "Target values contain NaN"

In [31]:
bipartite_graph.y = torch.nan_to_num(bipartite_graph.x, nan=0.0)

In [34]:
# Optionally normalize features
bipartite_graph.x = (bipartite_graph.x - bipartite_graph.x.mean(dim=0)) / (bipartite_graph.x.std(dim=0) + 1e-6)

# Set edge weights to 1 if they are not already set or if they are problematic
bipartite_graph.edge_weight = torch.ones(bipartite_graph.edge_index.size(1))

In [37]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [38]:
# Step 1: Create train-test split indices
train_size = int(0.8 * num_domains)  # Use 80% of the domain nodes for training
indices = np.random.permutation(num_domains)
train_indices = indices[:train_size]
test_indices = indices[train_size:]

# Step 2: Create train and test masks
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

# Set the masks for the domain nodes
train_mask[train_indices] = True
test_mask[test_indices] = True

# Step 3: Add the masks to the bipartite_graph data object
bipartite_graph.train_mask = train_mask
bipartite_graph.test_mask = test_mask

# Now you can proceed with training
for epoch in range(200):
    model.train()
    optimizer.zero_grad()
    out = model(bipartite_graph)
    loss = criterion(out[bipartite_graph.train_mask], bipartite_graph.y[bipartite_graph.train_mask])

    # Check if loss is NaN during training
    if torch.isnan(loss):
        print(f"NaN loss at epoch {epoch+1}")
        break

    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# Step 4: Evaluate the model on the test set
model.eval()
pred = model(bipartite_graph).detach().cpu().numpy()
test_loss = criterion(out[bipartite_graph.test_mask], bipartite_graph.y[bipartite_graph.test_mask])
print(f'Test Loss: {test_loss.item()}')

NaN loss at epoch 1
Test Loss: nan
