In [1]:
import pandas as pd
import numpy as np
import networkx as nx

In [2]:
edges = pd.read_csv('../data/twitch_gamers/large_twitch_edges.csv')
edges

Unnamed: 0,numeric_id_1,numeric_id_2
0,98343,141493
1,98343,58736
2,98343,140703
3,98343,151401
4,98343,157118
...,...,...
6797552,97507,29359
6797553,71175,12020
6797554,151702,128281
6797555,118034,38021


In [5]:
nodes = pd.read_csv('../data/twitch_gamers/large_twitch_features.csv')
nodes

Unnamed: 0,views,mature,life_time,created_at,updated_at,numeric_id,dead_account,language,affiliate
0,7879,1,969,2016-02-16,2018-10-12,0,0,EN,1
1,500,0,2699,2011-05-19,2018-10-08,1,0,EN,0
2,382502,1,3149,2010-02-27,2018-10-12,2,0,EN,1
3,386,0,1344,2015-01-26,2018-10-01,3,0,EN,0
4,2486,0,1784,2013-11-22,2018-10-11,4,0,EN,0
...,...,...,...,...,...,...,...,...,...
168109,4965,0,810,2016-07-20,2018-10-08,168109,0,EN,0
168110,4128,1,2080,2013-01-31,2018-10-12,168110,0,EN,0
168111,3545,0,1797,2013-11-08,2018-10-10,168111,0,EN,1
168112,892736,1,2135,2012-12-07,2018-10-12,168112,0,EN,0


In [6]:
from sklearn.preprocessing import StandardScaler
import torch

X = np.asarray(nodes[['numeric_id', 'views', 'life_time', 'dead_account', 'affiliate']].values)
y = np.asarray(nodes[['mature']].values).ravel()

G = nx.from_pandas_edgelist(edges, source='numeric_id_1', target='numeric_id_2')

In [8]:
G.number_of_nodes()

168114

In [None]:
# pos = nx.spring_layout(G, seed=42)
# nx.draw_networkx_edges(G, pos, alpha=0.2)
# nx.draw_networkx_nodes(G, pos, nodelist=G.nodes(), node_size=1)

In [9]:
adj = nx.to_scipy_sparse_array(G).tocoo()
row = torch.from_numpy(adj.row.astype(np.int64))
col = torch.from_numpy(adj.col.astype(np.int64))
edge_index = torch.stack([row, col], dim=0)

In [10]:
print(X.shape)
print(y.shape)
print(edge_index.shape)

(168114, 5)
(168114,)
torch.Size([2, 13595114])


In [11]:

from sklearn.model_selection import train_test_split
from torch_geometric.data import Data, InMemoryDataset
from torch_geometric.loader import DataLoader

class CustomDataset(InMemoryDataset):
    def __init__(self, transform=None):
        super(CustomDataset, self).__init__('.', transform, None, None)

        data = Data(x=X, y=y, edge_index=edge_index)
        data.x = torch.from_numpy(X).type(torch.float32)
        data.y = torch.from_numpy(y).type(torch.long)
        data.num_classes = 2

        # splitting the data into train, validation and test
        X_train, X_test, y_train, y_test = train_test_split(pd.Series(X[:,0]),pd.Series(y),test_size=0.30,random_state=42)

        # create train and test masks for data
        train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        train_mask[X_train.index] = True
        test_mask[X_test.index] = True
        data['train_mask'] = train_mask
        data['test_mask'] = test_mask

        self.data, self.slices = self.collate([data])


dataset = CustomDataset()
data = dataset[0]


In [15]:
data.num_classes

2

In [16]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

# GCN model with 2 layers
class Net(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Net, self).__init__()
        self.conv1 = GCNConv(dataset.num_features, 16)
        self.conv2 = GCNConv(16, dataset.num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

In [None]:
# from torch_geometric.datasets import Planetoid
#
# dataset = Planetoid(root='/tmp/Cora', name='Cora')

In [18]:
from torch import optim
import torch.nn.functional as F

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = dataset[0].to(device)

# Define the GNN model
input_dim = dataset.num_node_features  # Number of input features
hidden_dim = 64  # Number of hidden units
output_dim = dataset.num_classes  # Number of output classes
model = Net(input_dim, output_dim).to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
# criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


num_epochs = 100
# Training loop
for epoch in range(num_epochs):
    running_loss = 0.0
    # Zero the gradients
    optimizer.zero_grad()
    # Forward pass
    outputs = model(data)

    # Compute loss
    loss = criterion(outputs[data.train_mask], data.y[data.train_mask])

    # Backward pass and optimize
    loss.backward()
    optimizer.step()

    running_loss += loss.item()

    epoch_loss = running_loss / len(data.y[data.train_mask])
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

    # # Evaluate the model
    # accuracy = evaluate(model, dataloader)
    # print(f'Accuracy on the validation set: {accuracy:.2%}')

Epoch 1/100, Loss: 0.2746
Epoch 2/100, Loss: 0.2542
Epoch 3/100, Loss: 0.2501
Epoch 4/100, Loss: 0.2535
Epoch 5/100, Loss: 0.2415
Epoch 6/100, Loss: 0.2402
Epoch 7/100, Loss: 0.2372
Epoch 8/100, Loss: 0.2142
Epoch 9/100, Loss: 0.2165
Epoch 10/100, Loss: 0.2081
Epoch 11/100, Loss: 0.1938
Epoch 12/100, Loss: 0.1948
Epoch 13/100, Loss: 0.2038
Epoch 14/100, Loss: 0.1805
Epoch 15/100, Loss: 0.1768
Epoch 16/100, Loss: 0.1704
Epoch 17/100, Loss: 0.1682
Epoch 18/100, Loss: 0.1534
Epoch 19/100, Loss: 0.1510
Epoch 20/100, Loss: 0.1437
Epoch 21/100, Loss: 0.1345
Epoch 22/100, Loss: 0.1243
Epoch 23/100, Loss: 0.1179
Epoch 24/100, Loss: 0.1193
Epoch 25/100, Loss: 0.1180
Epoch 26/100, Loss: 0.1073
Epoch 27/100, Loss: 0.1012
Epoch 28/100, Loss: 0.0974
Epoch 29/100, Loss: 0.0943
Epoch 30/100, Loss: 0.0892
Epoch 31/100, Loss: 0.0846
Epoch 32/100, Loss: 0.0774
Epoch 33/100, Loss: 0.0722
Epoch 34/100, Loss: 0.0710
Epoch 35/100, Loss: 0.0665
Epoch 36/100, Loss: 0.0656
Epoch 37/100, Loss: 0.0656
Epoch 38/1

In [19]:
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.5114
