In [15]:
!pip install pandas numpy matplotlib scikit-learn networkx




In [16]:
# Install torch geometric dependencies for CPU in Colab
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.0.0+cpu.html


Looking in links: https://data.pyg.org/whl/torch-2.0.0+cpu.html


In [17]:
import pandas as pd

file_path = "node_embeddings1.csv"

# Malformed rows are still dropped, but 'warn' reports each one instead of
# discarding it silently, so unexpected data loss is visible in the cell output.
try:
    df = pd.read_csv(file_path, delimiter=',', engine='python', on_bad_lines='warn')
except Exception as e:
    # Print a short, readable message, then re-raise so the notebook stops here.
    print(f"Error reading CSV: {e}")
    raise

# Header names may carry stray whitespace; strip it before any lookup by name.
df.columns = df.columns.str.strip()
print("Columns:", df.columns)
print(df.head())



Columns: Index(['id', 'clause', 'speaker', 'emotion_type', 'detected_emotion',
       'embedding'],
      dtype='object')
            id                                         clause speaker  \
0  tr_4466_1_1         Hey , you wanna see a movie tomorrow ?       A   
1  tr_4466_2_1                      Sounds like a good plan .       B   
2  tr_4466_2_2                      What do you want to see ?       B   
3  tr_4466_3_1                     How about Legally Blonde .       A   
4  tr_4466_4_1  Ah , my girlfriend wanted to see that movie .       B   

  emotion_type detected_emotion  \
0         both        happiness   
1         both        happiness   
2         both        happiness   
3        cause          neutral   
4        cause          neutral   

                                           embedding  
0  -0.08010987192392349,-0.031882286071777344,0.0...  
1  -0.02023392915725708,0.0656646192073822,0.0207...  
2  -0.015728836879134178,-0.020934388041496277,-0...  
3  -0.00

In [18]:
# Show the column names as a plain Python list.
print(df.columns.to_list())


['id', 'clause', 'speaker', 'emotion_type', 'detected_emotion', 'embedding']


In [19]:
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def _parse_embedding(value):
    """Convert one 'embedding' cell into a NumPy array of floats.

    Args:
        value: either the raw comma-separated string read from the CSV, or an
            ndarray produced by a previous run of this cell.

    Returns:
        A 1-D float ndarray. Values that are already ndarrays are returned as-is,
        which makes re-running the cell safe.
    """
    if isinstance(value, np.ndarray):
        return value
    return np.array([float(part) for part in value.split(',')])


# Safe to re-run: cells that were already converted are passed through untouched.
df['embedding'] = df['embedding'].apply(_parse_embedding)


In [20]:
# Keep only the first 1000 clauses for the rest of the notebook.
df = df.head(1000)
print(df)


              id                                            clause speaker  \
0    tr_4466_1_1            Hey , you wanna see a movie tomorrow ?       A   
1    tr_4466_2_1                         Sounds like a good plan .       B   
2    tr_4466_2_2                         What do you want to see ?       B   
3    tr_4466_3_1                        How about Legally Blonde .       A   
4    tr_4466_4_1     Ah , my girlfriend wanted to see that movie .       B   
..           ...                                               ...     ...   
995  tr_1086_2_1               Yes , there is a lake in the park .       B   
996  tr_1086_2_2                            It is very beautiful .       B   
997  tr_1086_3_1                                  That ' s great .       A   
998  tr_1086_3_2  We can go boating on the lake in the afternoon .       A   
999  tr_1086_4_1                              It ' s a good idea .       B   

    emotion_type detected_emotion  \
0           both        ha

In [21]:
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import torch
import torch.nn as nn
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import spacy

In [22]:
# ------------------------
# 1. Node Features and Clause-ID Lookup
# ------------------------
embeddings = np.stack(df['embedding'].values)
clause_ids = df['id'].tolist()
id_to_index = {cid: idx for idx, cid in enumerate(clause_ids)}  # Maps clause ID to index

# ------------------------
# 2. Create Graph Edges Based on Cosine Similarity
# ------------------------
cos_sim = cosine_similarity(embeddings)
threshold = 0.7

# Build the adjacency mask in one vectorised pass instead of an N x N Python loop.
adjacency = cos_sim > threshold
np.fill_diagonal(adjacency, False)  # no self-loops (the former `i != j` check)
# np.nonzero scans row by row, so edges keep the same (source, target) order
# that the nested loops used to produce.
edge_index = torch.tensor(np.vstack(np.nonzero(adjacency)), dtype=torch.long)

# ------------------------
# 3. Load Emotion-Cause Pairs and Construct Label Matrix
# ------------------------
pair_df = pd.read_csv("emotion_cause_pairs.csv")

# label_matrix[i, j] == 1.0 means clause i is a cause of the emotion in clause j.
label_matrix = np.zeros((len(df), len(df)), dtype=np.float32)

# Iterate the two ID columns directly; pairs that mention a clause outside the
# current graph are ignored.
for cause_id, emotion_id in zip(pair_df['cause_id'], pair_df['emotion_id']):
    if cause_id in id_to_index and emotion_id in id_to_index:
        label_matrix[id_to_index[cause_id], id_to_index[emotion_id]] = 1.0  # Binary label for valid emotion-cause pair

label_matrix_tensor = torch.tensor(label_matrix)

# ------------------------
# 4. Graph Data Object
# ------------------------
x = torch.tensor(embeddings, dtype=torch.float32)
data = Data(x=x, edge_index=edge_index)

# ------------------------
# 5. Define DyGCN Model
# ------------------------
class DyGCN(nn.Module):
    """One GCN layer followed by an MLP that scores every ordered node pair.

    Args:
        in_channels: size of each input node feature vector.
        hidden_channels: output size of the GCN layer.
        pair_hidden: hidden width of the pair-scoring MLP.
    """

    def __init__(self, in_channels, hidden_channels, pair_hidden=32):
        super().__init__()
        self.gcn = GCNConv(in_channels, hidden_channels)
        # Pair features are [h_i, h_j, h_i * h_j, h_i - h_j] -> 4 * hidden_channels.
        self.pair_mlp = nn.Sequential(
            nn.Linear(hidden_channels * 4, pair_hidden),
            nn.ReLU(),
            nn.Linear(pair_hidden, 1)
        )

    def forward(self, data):
        """Score all ordered node pairs of a graph.

        Args:
            data: object exposing `x` (node features) and `edge_index`.

        Returns:
            A (num_nodes, num_nodes) tensor of raw logits; entry [i, j] is the
            score for the ordered pair (i, j). An empty graph yields a (0, 0) tensor.
        """
        h = self.gcn(data.x, data.edge_index)
        num_nodes = h.size(0)
        if num_nodes == 0:
            # Nothing to pair up; avoid pushing an empty batch through the MLP.
            return h.new_zeros((0, 0))

        # Broadcast instead of looping over N^2 pairs in Python:
        # hi[i, j] == h[i] and hj[i, j] == h[j].
        hi = h.unsqueeze(1).expand(-1, num_nodes, -1)
        hj = h.unsqueeze(0).expand(num_nodes, -1, -1)
        pair_feats = torch.cat([hi, hj, hi * hj, hi - hj], dim=-1)

        # squeeze(-1) drops only the trailing score dimension, so the result is
        # (N, N) even when N == 1.
        return self.pair_mlp(pair_feats).squeeze(-1)

# ------------------------
# 6. Training Setup
# ------------------------
def compute_loss(preds, labels):
    """Mean binary cross-entropy between raw logits and 0/1 targets.

    Both tensors are flattened to 1-D before the loss is evaluated.
    """
    flat_logits = preds.view(-1)
    flat_targets = labels.view(-1)
    return nn.functional.binary_cross_entropy_with_logits(flat_logits, flat_targets)

# Seed before building the model so weight initialisation (and therefore the
# training curve) is reproducible across kernel restarts.
torch.manual_seed(42)

model = DyGCN(in_channels=x.size(1), hidden_channels=128)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# ------------------------
# 8. Define Accuracy Calculation
# ------------------------
def calculate_accuracy(preds, labels, threshold=0.5):
    """Fraction of entries whose thresholded prediction equals the label.

    Args:
        preds: raw logits; a sigmoid is applied before thresholding.
        labels: tensor of 0/1 targets compared element-wise with the predictions.
        threshold: probability strictly above which an entry counts as 1.

    Returns:
        Number of matching entries divided by the total number of labels.
    """
    probabilities = torch.sigmoid(preds)
    hard_preds = probabilities.gt(threshold).float()
    n_correct = hard_preds.eq(labels).sum().item()
    return n_correct / labels.numel()

# ------------------------
# 9. Training Loop with Accuracy
# ------------------------
epochs = 10
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()

    output = model(data)  # shape: (N, N)
    loss = compute_loss(output, label_matrix_tensor)

    loss.backward()
    optimizer.step()

    # Metric only: score the detached logits so the N x N sigmoid/compare ops are
    # not recorded by autograd. The logits were computed before the optimizer
    # step, so the reported value is unchanged.
    # NOTE(review): accuracy is taken over every (i, j) entry of the label matrix;
    # if positives are rare it mostly reflects the negative class — consider also
    # reporting precision/recall.
    accuracy = calculate_accuracy(output.detach(), label_matrix_tensor)

    # Print loss and accuracy every epoch
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}, Accuracy: {accuracy:.4f}")


Epoch 1/10, Loss: 0.7557, Accuracy: 0.0016
Epoch 2/10, Loss: 0.7464, Accuracy: 0.0016
Epoch 3/10, Loss: 0.7371, Accuracy: 0.0016
Epoch 4/10, Loss: 0.7273, Accuracy: 0.0016
Epoch 5/10, Loss: 0.7167, Accuracy: 0.0096
Epoch 6/10, Loss: 0.7049, Accuracy: 0.1531
Epoch 7/10, Loss: 0.6914, Accuracy: 0.5541
Epoch 8/10, Loss: 0.6759, Accuracy: 0.8641
Epoch 9/10, Loss: 0.6579, Accuracy: 0.9669
Epoch 10/10, Loss: 0.6374, Accuracy: 0.9909


In [23]:
# Persist the trained weights (the state_dict) to disk.
model_path = 'dygcn_model.pth'
torch.save(model.state_dict(), model_path)
print(f"Model saved as {model_path}")


Model saved as dygcn_model.pth


In [28]:
import torch
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from torch_geometric.data import Data
import torch.nn as nn

# Assuming model and other variables are already set up and trained

# ------------------------
# Use the last 25 rows from the dataframe
# ------------------------
# NOTE(review): `df` is the same frame the model was trained on in the earlier
# cell, so these rows are not held-out data — confirm this is intended.

df_last_25 = df.tail(25)

# Convert embeddings to numpy array for the last 25 rows
embeddings = np.stack(df_last_25['embedding'].values)
clause_ids = df_last_25['id'].tolist()
clauses = df_last_25['clause'].tolist()
id_to_index = {cid: idx for idx, cid in enumerate(clause_ids)}

# Create cosine similarity matrix for the last 25 embeddings
cos_sim = cosine_similarity(embeddings)
threshold = 0.7  # similarity cut-off for graph edges

# Vectorised edge construction: mask above-threshold pairs, drop self-loops, then
# list the remaining positions row by row (same order as a nested i/j loop).
adjacency = cos_sim > threshold
np.fill_diagonal(adjacency, False)
edge_index = torch.tensor(np.vstack(np.nonzero(adjacency)), dtype=torch.long)

# Prepare the graph data object
x = torch.tensor(embeddings, dtype=torch.float32)
data = Data(x=x, edge_index=edge_index)

# ------------------------
# Load the trained DyGCN model
# ------------------------

model = DyGCN(in_channels=x.size(1), hidden_channels=128)
# weights_only=True restricts unpickling to tensors and plain containers, which is
# all a state_dict needs; map_location keeps the load working on CPU-only runtimes.
model.load_state_dict(torch.load("dygcn_model.pth", map_location="cpu", weights_only=True))
model.eval()  # Set model to evaluation mode

# ------------------------
# Make predictions
# ------------------------

# Get the output from the model (predictions)
with torch.no_grad():
    output = model(data)

# Apply sigmoid to get probabilities
predictions = torch.sigmoid(output)

# Now, make predictions on emotion-cause pairs
# Apply a lower threshold to classify pairs
threshold = 0.3  # Lower the threshold to capture more pairs (reuses the name above)
predictions = (predictions > threshold).float()

# Collect every (cause, emotion) position flagged as positive. torch.nonzero
# returns indices in row-major order, matching a nested loop over i then j.
predicted_pairs = [
    (clause_ids[i], clause_ids[j], clauses[i], clauses[j])
    for i, j in torch.nonzero(predictions).tolist()
]

# Create a DataFrame for the predicted pairs with sentences
predicted_pairs_df = pd.DataFrame(predicted_pairs, columns=["cause_id", "emotion_id", "cause_sentence", "emotion_sentence"])

# Show the predicted pairs
print(predicted_pairs_df)




        cause_id   emotion_id                  cause_sentence  \
0     tr_924_6_2   tr_924_6_2  The water was too cold , huh ?   
1     tr_924_6_2   tr_924_6_3  The water was too cold , huh ?   
2     tr_924_6_2   tr_924_6_4  The water was too cold , huh ?   
3     tr_924_6_2   tr_924_6_5  The water was too cold , huh ?   
4     tr_924_6_2   tr_924_6_6  The water was too cold , huh ?   
..           ...          ...                             ...   
620  tr_1086_4_1  tr_1086_2_1            It ' s a good idea .   
621  tr_1086_4_1  tr_1086_2_2            It ' s a good idea .   
622  tr_1086_4_1  tr_1086_3_1            It ' s a good idea .   
623  tr_1086_4_1  tr_1086_3_2            It ' s a good idea .   
624  tr_1086_4_1  tr_1086_4_1            It ' s a good idea .   

                                      emotion_sentence  
0                       The water was too cold , huh ?  
1                           I ' ll tell you a secret .  
2     Do you see that small pool of water over t

In [29]:
# Write the predicted pairs (IDs plus sentences) to CSV, omitting the index column.
output_csv = "predicted_emotion_cause_pairs.csv"
predicted_pairs_df.to_csv(output_csv, index=False)

# Confirm where the file was written.
print(f"Predicted emotion-cause pairs saved to '{output_csv}'.")


Predicted emotion-cause pairs saved to 'predicted_emotion_cause_pairs.csv'.
