<a href="https://colab.research.google.com/github/Shreyash54/HealthMisinfoDetection/blob/main/GRAPHSAGE3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import SAGEConv
import itertools
import networkx as nx
import pandas as pd
from gensim.models import Word2Vec
import numpy as np

# Load the cleaned dataset; the code below reads its 'id', 'statement' and 'rating' columns.
df = pd.read_csv("/content/drive/MyDrive/cleaned_dataset.csv")

# Undirected graph with one node per CSV row, keyed by the row's 'id'.
G = nx.Graph()

# Each node carries the statement text and its raw rating as attributes.
# add_nodes_from accepts (node, attribute_dict) pairs, so all rows are added in one call.
G.add_nodes_from(
    (row['id'], {'text': row['statement'], 'rating': row['rating']})
    for _, row in df.iterrows()
)

# ---------------------------------------------------------------------------
# Node features, labels and edges.
#
# Every row that survives filtering becomes exactly one graph node, so
# `statements`, `ratings`, `statement_embeddings`, `rating_tensor` and
# `edge_index` all describe the same nodes in the same order.
# ---------------------------------------------------------------------------
EMBEDDING_DIM = 100   # size of the Word2Vec vectors / node features
WORD2VEC_SEED = 42    # fixed seed so the embeddings are repeatable

# Define a mapping from string labels to numerical values
rating_mapping = {
    "mixture": 0.0,
    "unknown": 1.0,
    "TRUE": 2.0,
    "FALSE": 3.0,
    # Add more mappings as needed
}


def _resolve_rating(raw_rating):
    """Return the numeric value of a raw rating, or None when it has none.

    Known string labels are translated through ``rating_mapping``; values that
    are already numeric are kept. Unmapped strings and NaN/inf are rejected so
    they can never reach the loss.
    """
    value = rating_mapping.get(raw_rating, raw_rating)
    if isinstance(value, (float, int)) and np.isfinite(value):
        return float(value)
    return None


raw_statements = df['statement'].tolist()
raw_ratings = df['rating'].tolist()

# Filter statements and ratings TOGETHER so features and labels stay aligned.
statements = []            # statement text of every kept row
ratings = []               # raw rating of every kept row
numerical_ratings = []     # numeric rating of every kept row
tokenized_statements = []  # whitespace tokens of every kept row
for raw_statement, raw_rating in zip(raw_statements, raw_ratings):
    numeric_rating = _resolve_rating(raw_rating)
    if numeric_rating is None or not isinstance(raw_statement, str):
        continue  # no usable label, or missing text (NaN)
    tokens = raw_statement.split()
    if not tokens:
        continue  # blank statement: nothing to embed
    statements.append(raw_statement)
    ratings.append(raw_rating)
    numerical_ratings.append(numeric_rating)
    tokenized_statements.append(tokens)

if not statements:
    raise ValueError(
        "No row has both statement text and a usable rating; "
        "check the 'statement' and 'rating' columns and rating_mapping."
    )
print(f"Kept {len(statements)} of {len(raw_statements)} rows with text and a usable rating")

# Word2Vec expects an iterable of token lists. Passing raw strings makes it
# treat every character as a word, which leaves almost no real tokens in the
# vocabulary. workers=1 together with a fixed seed keeps training repeatable.
word2vec_model = Word2Vec(
    tokenized_statements,
    vector_size=EMBEDDING_DIM,
    window=5,
    min_count=1,
    sg=0,
    seed=WORD2VEC_SEED,
    workers=1,
)


def _embed_tokens(tokens):
    """Average the Word2Vec vectors of the in-vocabulary tokens.

    Falls back to a zero vector when no token is in the vocabulary, so every
    statement always yields exactly one feature row.
    """
    vectors = [word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv]
    if not vectors:
        return np.zeros(EMBEDDING_DIM, dtype=np.float32)
    return np.mean(vectors, axis=0)


# One embedding per kept statement, stacked into a (num_nodes, EMBEDDING_DIM) array
statement_embeddings = np.stack(
    [_embed_tokens(tokens) for tokens in tokenized_statements]
).astype(np.float32)

# Convert the NumPy array to a PyTorch tensor
statement_embeddings = torch.tensor(statement_embeddings, dtype=torch.float)

# Convert the numerical ratings to a (num_nodes, 1) tensor
rating_tensor = torch.tensor(numerical_ratings, dtype=torch.float).view(-1, 1)

# Create edges between all pairs of kept statements (undirected, no self-loops).
# NOTE(review): a fully connected graph gives every node the same neighbourhood,
# so message passing adds little structural signal - consider similarity-based edges.
num_statements = len(statements)  # number of graph nodes

# Strict upper triangle = every unordered pair (src < dst), in the same order
# itertools.combinations would produce; also valid for 0 or 1 nodes.
upper_pairs = torch.triu_indices(num_statements, num_statements, offset=1)

# Add the reverse direction of every pair to make the graph undirected
edges = torch.cat([upper_pairs, upper_pairs.flip(0)], dim=1)

# Create edge_index tensor
edge_index = edges

# Features, labels and edges must all refer to the same set of nodes
assert statement_embeddings.shape[0] == rating_tensor.shape[0] == num_statements

# Debugging: Print shapes of key tensors
print("Shapes - statement_embeddings:", statement_embeddings.shape)
print("Shapes - rating_tensor:", rating_tensor.shape)
print("Shapes - edge_index:", edge_index.shape)


# Define your GraphSAGE model
class GraphSAGEModel(nn.Module):
    """GraphSAGE encoder followed by a linear output head.

    Layout: one ``SAGEConv`` (``input_dim -> hidden_dim``), then ``num_layers``
    further ``SAGEConv`` layers (``hidden_dim -> hidden_dim``), then a linear
    layer (``hidden_dim -> output_dim``). A ReLU follows every convolution;
    without it the stacked layers would collapse into a single linear map.

    Args:
        input_dim: size of each node feature vector.
        hidden_dim: width of every graph convolution.
        output_dim: number of values predicted per node.
        num_layers: number of hidden convolutions added after the input
            convolution (the model has ``num_layers + 1`` convolutions in total).
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super(GraphSAGEModel, self).__init__()
        self.conv1 = SAGEConv(input_dim, hidden_dim)
        self.convs = nn.ModuleList([SAGEConv(hidden_dim, hidden_dim) for _ in range(num_layers)])
        self.lin = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, edge_index):
        """Return one ``output_dim``-sized prediction per row of ``x``.

        Args:
            x: node feature matrix with one row per node.
            edge_index: integer tensor of node indices describing the edges;
                every index must refer to a row of ``x``.

        Raises:
            AssertionError: if ``edge_index`` references a node outside ``x``.
        """
        # max()/min() are undefined on an empty tensor, so only validate when edges exist
        if edge_index.numel() > 0:
            assert edge_index.min() >= 0 and edge_index.max() < x.size(0), \
                "Invalid node index in edge_index"

        x = torch.relu(self.conv1(x, edge_index))
        for conv in self.convs:
            x = torch.relu(conv(x, edge_index))
        return self.lin(x)


# Prepare your data
SEED = 42
torch.manual_seed(SEED)  # repeatable weight initialisation

# Fail early, with the actual sizes, if features / labels / edges disagree
num_nodes = statement_embeddings.shape[0]
if rating_tensor.shape[0] != num_nodes:
    raise ValueError(
        f"Feature/label mismatch: {num_nodes} node embeddings but "
        f"{rating_tensor.shape[0]} ratings; both must describe the same statements."
    )
if edge_index.numel() > 0 and int(edge_index.max()) >= num_nodes:
    raise ValueError(
        f"edge_index references node {int(edge_index.max())} but only "
        f"{num_nodes} nodes have embeddings."
    )

data = Data(x=statement_embeddings, edge_index=edge_index, y=rating_tensor)
# The dataset holds a single graph; batch_size counts graphs, not nodes.
loader = DataLoader([data], batch_size=1, shuffle=False)


input_dim = statement_embeddings.shape[1]  # node feature size
model = GraphSAGEModel(input_dim=input_dim, hidden_dim=64, output_dim=1, num_layers=2)

# Define loss and optimizer with an appropriate learning rate
# NOTE(review): the ratings are category codes (mixture/unknown/TRUE/FALSE), so MSE
# imposes an ordering on them; consider a 4-way classifier with CrossEntropyLoss.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)  # Adjust the learning rate as needed

# Define the number of training epochs
num_epochs = 100  # Replace with your desired number of epochs
log_every = 10    # print the loss every `log_every` epochs

# Training loop (full-batch: the loader yields the whole graph once per epoch)
for epoch in range(num_epochs):
    model.train()
    for batch in loader:  # distinct name so the module-level `data` is not overwritten
        optimizer.zero_grad()
        output = model(batch.x, batch.edge_index)
        loss = criterion(output, batch.y.view(-1, 1).float())
        loss.backward()
        optimizer.step()

    if epoch == 0 or (epoch + 1) % log_every == 0:
        print(f"Epoch {epoch + 1:3d}/{num_epochs} - loss: {loss.item():.4f}")



Shapes - statement_embeddings: torch.Size([173, 100])
Shapes - rating_tensor: torch.Size([424, 1])
Shapes - edge_index: torch.Size([2, 465806])




AssertionError: ignored

In [None]:
!pip install torch_geometric


Collecting torch_geometric
  Downloading torch_geometric-2.3.1.tar.gz (661 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/661.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/661.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.6/661.6 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: torch_geometric
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone
  Created wheel for torch_geometric: filename=torch_geometric-2.3.1-py3-none-any.whl size=910454 sha256=e053c55f308e51bc44f438e0c1e7732e51b1a91ae81d316bf518d8bef222ce4d
  Stored in directory: /root/.cache/pip/wheels/ac/dc/30/e2874821ff308