In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import pickle
!pip install torch_geometric
from torch_geometric.utils.convert import from_networkx

In [None]:
# LOAD GRAPH
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/My Drive/Challenge/

G = pickle.load(open('graph.pickle', 'rb'))

Mounted at /content/drive
/content/drive/My Drive/Challenge


NameError: name 'pickle' is not defined

In [None]:
# # Add self-loops to each node
# for node in G.nodes():
#     G.add_edge(node, node)  # Add an edge from the node to itself

In [None]:
print('Graph properties', '==============================================================', sep='\n')

# Collect the summary statistics first, then report them.
node_count = G.number_of_nodes()  # total number of nodes in the graph
edge_count = G.number_of_edges()  # total number of edges in the graph
# Despite the label's wording, this is the *number* of nodes with no neighbours.
isolated_count = len(list(nx.isolates(G)))

print(f'Number of nodes: {node_count}')
print(f'Number of edges: {edge_count}')
print(f'Contains isolated nodes: {isolated_count}')

Graph properties
Number of nodes: 65208
Number of edges: 1642073
Contains isolated nodes: 0


In [None]:
# LOAD TRAIN DOMAINS
# The CSV's unnamed first column is renamed to 'label' and its '0' column to 'domain'.
train_domains = (
    pd.read_csv('train_domains.csv')
    .rename(columns={'Unnamed: 0': 'label', '0': 'domain'})
)
train_domains.head()

Unnamed: 0,label,domain
0,0,autocarnet.gr
1,5,queen.gr
2,5,aggeliorama.gr
3,0,bikerspoint.gr
4,3,athensgo.gr


In [None]:
# Load the test domains; the CSV's '0' column holds the domain names.
test_domains = (
    pd.read_csv('test_domains.csv')
    .rename(columns={'0': 'domain'})
)
test_domains.head()

Unnamed: 0.1,Unnamed: 0,domain
0,0,startupper.gr
1,1,artware.gr
2,2,oneirokritis.blogspot.gr
3,3,kali-ellada.blogspot.gr
4,4,ote.gr


In [None]:
# ADD LABELS AND MASKS TO GRAPH
from sklearn.model_selection import train_test_split

# Extract node names and labels from the DataFrame
node_names = train_domains['domain'].tolist()
labels = train_domains['label'].tolist()

test_nodes = test_domains['domain'].tolist()

# Hold out 20% of the labelled domains for validation; the fixed random_state
# makes the split repeatable.
train_nodes, val_nodes, train_labels, val_labels = train_test_split(node_names, labels, test_size=0.2, random_state=42)
node_label_train = dict(zip(train_nodes, train_labels))
node_label_val = dict(zip(val_nodes, val_labels))
node_label_test = test_nodes
# Set copy of the test nodes: membership is checked once per graph node below,
# and a list lookup there would be linear in the number of test domains.
test_node_set = set(test_nodes)

# Give every node a 'label' plus three boolean split masks. Precedence is
# train > val > test; nodes in none of the splits get all masks False.
# Unlabelled nodes (test / other) keep the placeholder label False.
for node in G.nodes():
    if node in node_label_train:
        label, split = node_label_train[node], 'train'
    elif node in node_label_val:
        label, split = node_label_val[node], 'val'
    elif node in test_node_set:
        label, split = False, 'test'
    else:
        # Handle nodes without labels
        label, split = False, None
    G.nodes[node].update(
        label=label,
        train_mask=split == 'train',
        val_mask=split == 'val',
        test_mask=split == 'test',
    )

In [None]:
# ADD RANDOM WALK?


In [None]:
import torch

# Convert the annotated NetworkX graph (label and mask node attributes) into a
# PyTorch Geometric Data object.
data = from_networkx(G)

# Node features are random Gaussian vectors. They are drawn from a seeded
# generator so that re-running the notebook yields the same features (and
# therefore comparable training runs).
NODE_FEATURE_DIM = 32
NODE_FEATURE_SEED = 42
feature_rng = np.random.default_rng(NODE_FEATURE_SEED)
node_features = torch.from_numpy(
    feature_rng.standard_normal((len(G), NODE_FEATURE_DIM))
).float()  # float32, matching the model parameters

data.x = node_features
data

Data(edge_index=[2, 1642073], label=[65208], train_mask=[65208], val_mask=[65208], test_mask=[65208], num_nodes=65208, x=[65208, 32])

In [None]:
from torch.nn import Linear
from torch_geometric.nn import GCNConv, global_mean_pool


class GCN(torch.nn.Module):
    """Three-layer GCN node classifier.

    Every GCNConv layer is followed by dropout and a tanh non-linearity; a
    final linear layer maps the last hidden representation to class logits.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Output width of each of the three GCNConv layers.
        num_classes: Number of logits produced per node.
        dropout: Dropout probability applied after every GCNConv layer.
        seed: Value passed to ``torch.manual_seed`` before the layers are
            created so that weight initialisation is repeatable. ``None``
            leaves the global RNG untouched.

    The defaults reproduce the original fixed configuration (32 -> 32 -> 9,
    dropout 0.5, seed 12345), so ``GCN()`` behaves as before.
    """

    def __init__(self, in_channels=32, hidden_channels=32, num_classes=9, dropout=0.5, seed=12345):
        super().__init__()
        if seed is not None:
            torch.manual_seed(seed)
        # Attribute names are part of the state_dict keys; keep them stable so
        # previously saved weights still load.
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.dropout1 = torch.nn.Dropout(dropout)
        self.classifier = Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index):
        """Compute class logits and the final node embeddings.

        Args:
            x: Node feature matrix passed to the first GCNConv layer.
            edge_index: Graph connectivity passed to every GCNConv layer.

        Returns:
            Tuple ``(out, h)``: ``out`` are the classifier logits and ``h`` is
            the output of the last conv -> dropout -> tanh stage.
        """
        h = x
        for conv in (self.conv1, self.conv2, self.conv3):
            # Same order as before: convolution, then dropout, then tanh.
            h = self.dropout1(conv(h, edge_index)).tanh()

        # Apply a final (linear) classifier.
        out = self.classifier(h)

        return out, h


model = GCN()
print(model)

GCN(
  (conv1): GCNConv(32, 32)
  (conv2): GCNConv(32, 32)
  (conv3): GCNConv(32, 32)
  (dropout1): Dropout(p=0.5, inplace=False)
  (classifier): Linear(in_features=32, out_features=9, bias=True)
)


In [None]:
# TRAIN
LEARNING_RATE = 0.001   # initial Adam step size
LR_SCHEDULE_T_MAX = 10  # scheduler steps for the LR to anneal from its initial value to its minimum

criterion = torch.nn.CrossEntropyLoss()  # Loss over class logits vs. integer labels.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# NOTE(review): the training loop runs far more epochs than LR_SCHEDULE_T_MAX,
# so the cosine schedule repeatedly drives the learning rate down to its
# minimum and back up. Confirm this cycling is intended; otherwise set
# LR_SCHEDULE_T_MAX to the number of training epochs.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=LR_SCHEDULE_T_MAX)
# Lowest validation loss seen so far; used to decide when to checkpoint.
best_val_loss = float('inf')
def train(data):
    """Run one full-batch optimisation step on the training nodes.

    Uses the notebook-level ``model``, ``optimizer`` and ``criterion``.

    Args:
        data: Graph object exposing ``x``, ``edge_index``, ``label`` and
            ``train_mask``.

    Returns:
        Tuple ``(loss, h)``: the training loss tensor for this step and the
        second output of the model's forward pass (the node embeddings).
    """
    # valid() puts the model in eval mode; switch back so dropout is active
    # during every training step, not just the first one.
    model.train()
    optimizer.zero_grad()  # Clear gradients.
    out, h = model(data.x, data.edge_index)  # Perform a single forward pass.
    # Compute the loss solely based on the training nodes.
    loss = criterion(out[data.train_mask], data.label[data.train_mask])
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.

    return loss, h

def valid(data):
    """Evaluate the notebook-level ``model`` on the validation nodes.

    Puts the model in eval mode and runs a gradient-free forward pass.

    Returns:
        Tuple ``(loss, h)``: the validation loss tensor and the second output
        of the model's forward pass.
    """
    model.eval()
    with torch.no_grad():
        logits, embeddings = model(data.x, data.edge_index)
        val_rows = data.val_mask
        # Score only the rows selected by the validation mask.
        val_loss = criterion(logits[val_rows], data.label[val_rows])
    return val_loss, embeddings

for epoch in range(401):
    # One optimisation step, advance the LR schedule, then score the validation nodes.
    loss, h = train(data)
    scheduler.step()
    val_loss, h = valid(data)
    print(f'Epoch: {epoch}, Loss: {loss} | val_loss {val_loss.item()}')
    if not val_loss < best_val_loss:
        continue
    # New best validation loss: remember it and checkpoint the model weights.
    best_val_loss = val_loss
    torch.save(model.state_dict(), 'best_gnn_weights.pth')


Epoch: 0, Loss: 2.1580145359039307 | val_loss 2.146254777908325
Epoch: 1, Loss: 2.1357338428497314 | val_loss 2.136871337890625
Epoch: 2, Loss: 2.126132011413574 | val_loss 2.1280102729797363
Epoch: 3, Loss: 2.1171302795410156 | val_loss 2.1201441287994385
Epoch: 4, Loss: 2.1091840267181396 | val_loss 2.1136038303375244
Epoch: 5, Loss: 2.1026065349578857 | val_loss 2.1085734367370605
Epoch: 6, Loss: 2.097564935684204 | val_loss 2.105079174041748
Epoch: 7, Loss: 2.0940704345703125 | val_loss 2.102985382080078
Epoch: 8, Loss: 2.091979742050171 | val_loss 2.1020123958587646
Epoch: 9, Loss: 2.091008424758911 | val_loss 2.101762056350708
Epoch: 10, Loss: 2.090759038925171 | val_loss 2.101762056350708
Epoch: 11, Loss: 2.090759038925171 | val_loss 2.101511240005493
Epoch: 12, Loss: 2.0905086994171143 | val_loss 2.1005303859710693
Epoch: 13, Loss: 2.0895304679870605 | val_loss 2.0984089374542236
Epoch: 14, Loss: 2.087415933609009 | val_loss 2.094841718673706
Epoch: 15, Loss: 2.0838680267333984

In [None]:
# Load the saved weights.
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state_dict contains, so a tampered checkpoint cannot run code.
model.load_state_dict(torch.load('best_gnn_weights.pth', weights_only=True))

<All keys matched successfully>

In [None]:
# PREDICTIONS

In [None]:
import torch.nn.functional as F

def test(data):
    model.eval()  # Set model to evaluation mode.
    with torch.no_grad():
        out, _ = model(data.x, data.edge_index)  # Perform a forward pass.
    return out[data.test_mask]

# Assuming you have test_mask set up similarly to train_mask and val_mask
gnn_test_probs = F.softmax(test(data), dim=1)

# test(data) returns one row per True entry of data.test_mask, i.e. in graph-node
# order, whereas test_domains (and everything paired with it by position later
# on) is in CSV order. Re-index so that row i belongs to test_domains['domain'][i].
# This relies on from_networkx keeping the G.nodes() order for node attributes.
masked_test_nodes = [node for node, is_test in zip(G.nodes(), data.test_mask.tolist()) if is_test]
row_of_node = {node: row for row, node in enumerate(masked_test_nodes)}
missing_domains = [domain for domain in test_domains['domain'] if domain not in row_of_node]
if missing_domains:
    raise KeyError(
        f'{len(missing_domains)} test domains have no test-masked graph node, '
        f'e.g. {missing_domains[:5]}'
    )
test_row_order = torch.tensor(
    [row_of_node[domain] for domain in test_domains['domain']], dtype=torch.long
)
test_predictions = gnn_test_probs[test_row_order]

In [None]:
# Sanity check: sum the class probabilities of each row; the shape shows how
# many test rows were predicted.
test_predictions.sum(axis=1).shape

torch.Size([605])

In [None]:
# Display the test-set domain names.
test_domains['domain']

0                 startupper.gr
1                    artware.gr
2      oneirokritis.blogspot.gr
3       kali-ellada.blogspot.gr
4                        ote.gr
                 ...           
600                     rise.gr
601               relaxstrom.gr
602                airliners.gr
603                   palema.gr
604                 protagon.gr
Name: domain, Length: 605, dtype: object

In [None]:
# Load the predictions stored in predictions_7.csv; a later cell averages its
# class columns with the GNN probabilities.
df = pd.read_csv('predictions_7.csv')
df.head()

In [None]:
# NOTE(review): this binds the `pd.read_csv` function itself to `df_2` -- it is
# never called, so no file is read. Looks like an unfinished statement; `df_2`
# is not used by any other visible cell.
df_2 = pd.read_csv

In [None]:
# Class-probability columns of the stored predictions (every column after the first).
subset_df = df[df.columns[1:]]

# Bring the GNN probabilities into a DataFrame with the same column labels and
# index as subset_df so the average below is taken element by element.
# A new name is used (rather than overwriting `test_predictions`) so this cell
# can be re-run without failing on a second `.numpy()` call.
gnn_probs = test_predictions.detach().cpu().numpy()
if gnn_probs.shape != subset_df.shape:
    raise ValueError(
        f'GNN predictions have shape {gnn_probs.shape} but the stored predictions '
        f'have shape {subset_df.shape}; they cannot be averaged row by row.'
    )
# Make sure the order of columns in the GNN predictions matches subset_df
gnn_predictions = pd.DataFrame(gnn_probs, columns=subset_df.columns, index=subset_df.index)

# Simple ensemble: unweighted mean of the two sets of probabilities.
result = (subset_df + gnn_predictions) / 2


In [None]:
# Display the averaged class probabilities.
result

Unnamed: 0,class_0,class_1,class_2,class_3,class_4,class_5,class_6,class_7,class_8
0,0.006254,0.002381,0.002590,0.466120,0.054776,0.367228,0.008464,0.066997,0.025190
1,0.024806,0.013008,0.058110,0.617625,0.019673,0.157590,0.043756,0.033308,0.032125
2,0.000925,0.002458,0.297639,0.469088,0.002997,0.143155,0.014475,0.001979,0.067284
3,0.001152,0.026348,0.030690,0.822514,0.006005,0.077818,0.002295,0.001578,0.031600
4,0.024025,0.039029,0.078024,0.184253,0.075699,0.260233,0.042483,0.087692,0.208562
...,...,...,...,...,...,...,...,...,...
600,0.017404,0.018052,0.436269,0.103486,0.030948,0.116907,0.103389,0.003219,0.170327
601,0.007386,0.433802,0.019291,0.034994,0.008822,0.032740,0.351991,0.021523,0.089453
602,0.037151,0.019876,0.088776,0.071533,0.019854,0.199131,0.141830,0.082338,0.339509
603,0.160410,0.059060,0.102414,0.448964,0.008850,0.198452,0.009024,0.009508,0.003317


In [None]:
import csv

# Write predictions to a file
output_path = '/content/drive/My Drive/Challenge/predictions_GNN_New2.csv'
header = ['domain_name'] + ['class_' + str(i) for i in range(9)]

# Rows are paired with domains by position, so the two must have equal length.
# Checking before the file is opened avoids leaving a partially written CSV.
if len(result) != len(test_domains):
    raise ValueError(
        f'result has {len(result)} rows but test_domains has {len(test_domains)}; '
        'predictions cannot be paired with domains.'
    )

# newline='' lets the csv module control line endings itself, as its
# documentation requires for files passed to csv.writer.
with open(output_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(header)
    # to_numpy().tolist() yields plain Python floats, one list per row.
    for test_host, probabilities in zip(test_domains['domain'], result.to_numpy().tolist()):
        writer.writerow([test_host] + probabilities)

In [None]:
# NOTHING HERE

In [None]:
# Re-read the test domains; the CSV's '0' column holds the domain names.
test_domains = (
    pd.read_csv('test_domains.csv')
    .rename(columns={'0': 'domain'})
)
test_domains.head()

Unnamed: 0.1,Unnamed: 0,domain
0,0,startupper.gr
1,1,artware.gr
2,2,oneirokritis.blogspot.gr
3,3,kali-ellada.blogspot.gr
4,4,ote.gr


In [None]:
# Re-read the training domains; the unnamed first column becomes 'label' and
# the '0' column becomes 'domain'.
train_domains = (
    pd.read_csv('train_domains.csv')
    .rename(columns={'Unnamed: 0': 'label', '0': 'domain'})
)
train_domains.head()

Unnamed: 0,label,domain
0,0,autocarnet.gr
1,5,queen.gr
2,5,aggeliorama.gr
3,0,bikerspoint.gr
4,3,athensgo.gr


In [None]:
from sklearn.model_selection import train_test_split

# Pull the domain names and their labels out of the training frame as plain lists.
node_names, labels = (train_domains[column].tolist() for column in ('domain', 'label'))

test_nodes = test_domains['domain'].tolist()

# Same 80/20 split with the same seed as used when the masks were built.
train_nodes, val_nodes, train_labels, val_labels = train_test_split(
    node_names,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# def domain_match(domain):
#     if domain in val_nodes:
#         return 2
#     elif domain in test_nodes:
#         return 3
#     elif domain in train_nodes:
#         return 1
#     else:
#         return 0

# # Create a new column based on domain matching
# df['set'] = df['domain'].apply(lambda x: domain_match(x))

# df.head()
# Tag every graph node with the split it belongs to, stored under 'set'.
# The original loop tested `node in train_nodes` twice, so its third branch was
# unreachable and test nodes were never tagged; that branch also wrote to
# 'label' instead of 'set'.
# NOTE(review): the codes below follow the commented-out domain_match helper
# above (train=1, val=2, test=3); confirm this is the intended encoding.
# Nodes in none of the splits keep the original placeholder value False.
train_node_set = set(train_nodes)  # sets give O(1) membership tests in the loop
val_node_set = set(val_nodes)
test_node_set = set(test_nodes)

for node in G.nodes():
    if node in train_node_set:
        G.nodes[node]['set'] = 1
    elif node in val_node_set:
        G.nodes[node]['set'] = 2
    elif node in test_node_set:
        G.nodes[node]['set'] = 3
    else:
        # Handle nodes without labels
        G.nodes[node]['set'] = False



KeyError: 'domain'

In [None]:
import pandas as pd

In [None]:
# Read predictions_GNN_New2.csv (relative to the current working directory)
# and preview the first rows.
df2 = pd.read_csv('predictions_GNN_New2.csv')
df2.head()

Unnamed: 0,domain_name,class_0,class_1,class_2,class_3,class_4,class_5,class_6,class_7,class_8
0,startupper.gr,0.006254,0.002381,0.00259,0.46612,0.054776,0.367228,0.008464,0.066997,0.02519
1,artware.gr,0.024806,0.013008,0.05811,0.617625,0.019673,0.15759,0.043756,0.033308,0.032125
2,oneirokritis.blogspot.gr,0.000925,0.002458,0.297639,0.469088,0.002997,0.143155,0.014475,0.001979,0.067284
3,kali-ellada.blogspot.gr,0.001152,0.026348,0.03069,0.822514,0.006005,0.077818,0.002295,0.001578,0.0316
4,ote.gr,0.024025,0.039029,0.078024,0.184253,0.075699,0.260233,0.042483,0.087692,0.208562


In [None]:
# Keep the rows in which at least one column after the first exceeds 0.7.
confident_rows2 = df2.iloc[:, 1:].gt(0.7).any(axis=1)
filtered_df2 = df2[confident_rows2]

print(filtered_df2.shape)


(59, 10)


In [None]:
# Position of the largest value in each row, ignoring the first column.
class_probs2 = filtered_df2.iloc[:, 1:]
argmax_result2 = class_probs2.apply(pd.Series.argmax, axis=1)

# How often each position wins among the filtered rows.
argmax_result2.value_counts()

3    52
8     2
1     2
4     1
5     1
2     1
Name: count, dtype: int64

In [None]:
# Read predictions_7.csv and preview the first rows.
df = pd.read_csv('predictions_7.csv')
df.head()

Unnamed: 0,domain_name,class_0,class_1,class_2,class_3,class_4,class_5,class_6,class_7,class_8
0,startupper.gr,0.00953,0.001214,0.002311,0.332654,0.029293,0.613631,0.004534,0.001761,0.005072
1,artware.gr,0.049594,0.025334,0.113291,0.264153,0.016433,0.313139,0.087447,0.06636,0.064248
2,oneirokritis.blogspot.gr,0.001839,0.004274,0.585436,0.153168,0.003775,0.089922,0.026745,0.00184,0.133
3,kali-ellada.blogspot.gr,0.002301,0.052638,0.059873,0.681441,0.010007,0.123799,0.004516,0.002278,0.063146
4,ote.gr,0.044281,0.07792,0.137659,0.295297,0.054534,0.160431,0.058446,0.059834,0.111598


In [None]:
# Keep the rows in which at least one column after the first exceeds 0.7.
confident_rows = df.iloc[:, 1:].gt(0.7).any(axis=1)
filtered_df = df.loc[confident_rows]

# print(filtered_df[1:].head())

Unnamed: 0,domain_name,class_0,class_1,class_2,class_3,class_4,class_5,class_6,class_7,class_8
6,fytro.com.gr,0.004253,0.017430,0.013221,0.105677,0.004245,0.017671,0.733748,0.006617,0.097139
8,toeidesauto.blogspot.gr,0.002170,0.001013,0.018246,0.954319,0.002279,0.017855,0.001831,0.001549,0.000738
11,tripadvisor.com.gr,0.007670,0.003149,0.032912,0.001648,0.008928,0.077176,0.009603,0.841468,0.017447
15,airtickets.gr,0.002302,0.003842,0.033590,0.018956,0.004771,0.004917,0.003866,0.922793,0.004964
17,eap.gr,0.003114,0.002355,0.003659,0.041943,0.928660,0.014009,0.001622,0.002893,0.001744
...,...,...,...,...,...,...,...,...,...,...
594,proinanea.gr,0.001020,0.004277,0.001229,0.972236,0.012628,0.005331,0.000872,0.001498,0.000911
598,michanikos.gr,0.005392,0.003599,0.002245,0.723049,0.025496,0.223318,0.004178,0.007846,0.004877
601,relaxstrom.gr,0.013782,0.021333,0.023250,0.013037,0.007350,0.027111,0.703298,0.014164,0.176675
603,palema.gr,0.008778,0.110982,0.005919,0.801812,0.010995,0.034001,0.002915,0.018268,0.006330


In [None]:
# max_index = filtered_df.iloc[:, 1:].idxmax(axis=1)

# print(max_index)
# Position (not column label) of the largest value in each row, ignoring the first column.
class_probs = filtered_df.iloc[:, 1:]
argmax_result = class_probs.apply(pd.Series.argmax, axis=1)


5      3
6      6
8      3
11     7
15     7
      ..
594    3
598    3
601    6
603    3
604    3
Length: 275, dtype: int64


In [None]:
# Count how often each argmax position occurs among the filtered rows.
argmax_result.value_counts()

3    135
2     30
5     30
8     21
4     18
1     16
7     14
6      7
0      4
Name: count, dtype: int64