In [None]:
import numpy as np
import pandas as pd

import networkx as nx

## Setting up graph

In [None]:
node_df = pd.read_csv('/kaggle/input/ue20cs344-nam-assignment-2/node_data.csv')
edge_df = pd.read_csv('/kaggle/input/ue20cs344-nam-assignment-2/edgelist.csv')

# edge_df.head()

In [None]:
node_df.head()

In [None]:
G = nx.Graph()

# Add nodes to the graph
for i, row in node_df.iterrows():
    G.add_node(row['id'], **row.to_dict())

# Add edges to the graph
for i, row in edge_df.iterrows():
    G.add_edge(row['source'], row['target'], **row.to_dict())

In [None]:
print(G)

## Node2Vec

In [None]:
!pip install node2vec

In [None]:
from node2vec import Node2Vec

# Generate node2vec features for the graph
node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200, workers=4)
nfeatures = {str(node): values for node, values in zip(G.nodes(), node2vec)}

# Save node2vec features to a file
with open('features1.pt', 'w') as f:
    json.dump(nfeatures, f)

## GNN

In [None]:
!pip install torch-geometric

In [252]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv

In [253]:
# Load node features from file
node_features = torch.load('/kaggle/input/ue20cs344-nam-assignment-2/features.pt')

# Convert networkx graph to PyTorch Geometric graph
edge_index = torch.tensor(list(G.edges())).t().contiguous()
node_indices = [node_index[node_id] for node_id in G.nodes()]
node_feature_list = [node_features[idx] for idx in node_indices]
x = torch.stack(node_feature_list)
y = torch.tensor([1 if G.nodes[node]['branch'] == "CSE" 
                  else 2 if G.nodes[node]['branch'] == "ECE" 
                  else 3 if G.nodes[node]['branch'] == "EEE" 
                  else 0 for node in G.nodes()])
data = Data(x=x, edge_index=edge_index, y=y)


In [254]:
import torch
from torch_geometric.data import Data
from torch.utils.data import Subset, DataLoader

# Define the indices of the nodes with desired branch values
train_indices = [i for i in range(19717) if data.y[i] in [1, 2, 3]]
test_indices = [i for i in range(19717) if data.y[i] == 0]

# Create Subset objects for training and testing data
train_data = Subset(data, train_indices)
test_data = Subset(data, test_indices)

# Print the number of nodes in training and testing subsets
print(f"Number of nodes in training set: {len(train_data)}")
print(f"Number of nodes in testing set: {len(test_data)}")

# Create PyTorch DataLoader objects for training and testing subsets
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

Number of nodes in training set: 15773
Number of nodes in testing set: 3944


In [255]:
train_graph = nx.Graph()

# Add nodes to the graph
for i, row in node_df.iterrows():
    if(row['id'] in train_indices):
        train_graph.add_node(row['id'], **row.to_dict())

# Add edges to the graph
for i, row in edge_df.iterrows():
    if(row['source'] in train_indices and row['target'] in train_indices):
        train_graph.add_edge(row['source'], row['target'], **row.to_dict())

In [256]:
edge_index1 = torch.tensor(list(train_graph.edges())).t().contiguous()

In [257]:
from torch.utils.data import random_split

train_indices = [i for i in range(19717) if data.y[i] in [1, 2, 3]]
test_indices = [i for i in range(19717) if data.y[i] == 0]

train_data = Subset(data, train_indices)
test_data = Subset(data, test_indices)

In [258]:
train_x = data.x[train_data.indices]
train_y = data.y[train_data.indices]

In [259]:
#data = Data(x=train_x, edge_index=edge_index1, y=train_y)

In [260]:
#Define GCN model
class GCN(torch.nn.Module):
    def __init__(self):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(node_features.shape[1], 16)
        self.conv2 = GCNConv(16, 4)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index1))
        x = self.conv2(x, edge_index1)
        return x

# Train GCN model
model = GCN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
for epoch in range(200):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = criterion(out[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()



In [261]:
# Predict on test nodes
model.eval()
out = model(data.x, data.edge_index)
pred = out[test_mask].argmax(dim=1)


## Sorting out the output

In [262]:
tmp = pred.tolist()

In [263]:
result = []
index = []

In [264]:
for i in range(len(tmp)):
    if i in test_indices:
        index.append(i)
        if(tmp[i]==1):
            result.append("CSE")
        elif(tmp[i]==2):
            result.append("ECE")
        else:
            result.append("EEE")

In [265]:
len(index)

3924

In [266]:
df = pd.DataFrame(result, index = index)
df.head()

Unnamed: 0,0
4,EEE
22,EEE
29,ECE
32,CSE
34,ECE


In [267]:
df.to_csv("Output.csv")