In [1]:
# Import torch & Check CUDA availability
import torch

print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.current_device())

True
1
0


In [2]:
# Get CUDA device name
print(torch.cuda.device(0))
print(torch.cuda.get_device_name(0))

<torch.cuda.device object at 0x7f18f9353dd0>
NVIDIA A100-SXM4-80GB


## Twitter Dataset

### Read text file first

In [3]:
# Function for reading dataset file
def read_file_until_tab_newline(file):
    buffer = ''
    
    returnMode = 0
    while True:
        byte = file.read(1)
        if not byte:
            returnMode = 0
            break
        else:
            if byte == "\t" or byte == "\n":
                returnMode = 1
                break
            buffer += byte
    
    if (returnMode == 0):
        return False
    else:
        return buffer

In [4]:
# Read the file and store the dataset
file = open("/mnt/ephemeral/gnn/dataset/Twitter/twitter_rv.net", "r")

edges = []
while True:
    value = read_file_until_tab_newline(file)
    if not value:
        print(f'Storing edge_index finished!')
        break
    else:
        value = int(value)
        edges.append(value)
        
file.close()

Storing edge_index finished!


In [5]:
print(len(edges))
print(max(edges))
print(min(edges))

2936730364
61578414
12


### Edge index

In [6]:
import numpy as np

# Change edge_index list to numpy array first
edges = np.array(edges)
print(edges.shape)

(2936730364,)


In [7]:
# Get the nodes
nodes = np.unique(edges)
nodes.sort()
print(nodes.shape)
print(nodes[0])
print(nodes[41652229])

(41652230,)
12
61578414


In [8]:
# Generate the modified node 
m_nodes=[]
for i in range(41652230):
    m_nodes.append(i)

m_nodes = np.array(m_nodes)
nodes = np.stack((nodes, m_nodes), axis=1)

In [9]:
print(nodes.shape)
print(nodes[0])
print(nodes[0][0])
print(nodes[0][1])

(41652230, 2)
[12  0]
12
0


In [10]:
import pandas as pd

# Change the edge index numpy array to DataFrame
edges = pd.DataFrame(edges)

In [None]:
# Change the original nodes to modified nodes
for i in range(41652230):
    original = nodes[i][0]
    modified = nodes[i][1]
    edges.replace(original, modified)
    if i % 1000 == 0:
        print(i)

0


In [None]:
# Change edge_index DataFrame to numpy array again
edges = edges.to_numpy()

In [6]:
# Reshape edge_index numpy array to 2-dimensional
edges = edges.reshape(-1, 2)
print(edges.shape)

(2936730364,)
(1468365182, 2)


In [10]:
# Make edge_index as tensor for using it on PyTorch Geometric
# dtype should be torch.int64
import torch_geometric

edges = torch.tensor(edges, dtype=torch.int64)
print(edges.shape)

torch.Size([2405026390, 2])


### Node feature matrix

In [11]:
# Make node feature matrix by our own
# 41652230(#nodes) x 16(#features)
import random

x=[]
tmp = []
for i in range(41652230):
    for j in range(16):
        r = random.uniform(-2.5, 2.5)
        while r in tmp:
            r = random.uniform(-2.5, 2.5)
        tmp.append(r)
    x.extend(tmp)
    tmp.clear()

print(len(x))

666435680


In [12]:
# Change node feature matrix(list) to numpy array first
x = np.array(x)
print(x.shape)
# Reshape node feature matrix(numpy array) to 2-dimensional
x = x.reshape(-1, 16)
print(x.shape)

(666435680,)
(41652230, 16)


In [13]:
# Make node feature matrix as tensor for using it on PyTorch Geometric
# dtype should be torch.float32
x = torch.tensor(x, dtype=torch.float32)
print(x.shape)

torch.Size([41652230, 16])


### Ground-truth labels

In [14]:
# Make ground-truth labels by our own
y=[]
for i in range(41652230):
    r = random.randrange(0, 16)
    y.append(r)

print(len(y))

41652230


In [15]:
# Make ground-truth lables as tensor for using it on PyTorch Geometric
# dtype should be torch.int64
y = torch.tensor(y, dtype=torch.int64)
print(y.shape)

torch.Size([41652230])


### Make all components as PyTorch dataset

In [16]:
# Make node feature matrix, edge index, ground-truth labels as PyTorch Dataset
from torch_geometric.data import Data
data = Data(x=x, edge_index=edges.t().contiguous(), y=y)
print(data)

Data(x=[41652230, 16], edge_index=[2, 2405026390], y=[41652230])


In [None]:
print('The current dataset is undirected:', data.is_undirected())

### Save the PyTorch dataset

In [15]:
torch.save(data, "/mnt/ephemeral/gnn/dataset/Twitter/twitter.pt")

In [4]:
data = torch.load("/mnt/ephemeral/gnn/dataset/Twitter/twitter.pt")
print(data)

Data(x=[41652230, 16], edge_index=[2, 2405026390], y=[41652230])
