In [1]:
import pandas as pd
import torch
import numpy as np
from torch_geometric.data import Data
import networkx as nx
import os

from torch_geometric.utils import from_scipy_sparse_matrix

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import codecs
from os import path

import torch.nn.functional as F
from torch_geometric.nn import GCNConv

# With the tfidf vectorizer

In [297]:
# Read training data
with open("train.csv", 'r') as f:
    train_data = f.read().splitlines()

train_hosts = list()
y_train = list()
for row in train_data:
    host, label = row.split(",")
    train_hosts.append(host)
    y_train.append(label.lower())

# Read test data
with open("test.csv", 'r') as f:
    test_hosts = f.read().splitlines()

# Load the textual content of a set of webpages for each host into the dictionary "text". 
# The encoding parameter is required since the majority of our text is french.
text = dict()
filenames = os.listdir('text/text')
for filename in filenames:
    with codecs.open(path.join('text/text/', filename), encoding='latin-1') as f: 
        text[filename] = f.read().replace("\n", "").lower()

train_data = list()
for host in train_hosts:
    if host in text:
        train_data.append(text[host])
    else:
        train_data.append('')

# Create the training matrix. Each row corresponds to a web host and each column to a word present in at least 10 web
# hosts and at most 1000 web hosts. The value of each entry in a row is equal to the tf-idf weight of that word in the 
# corresponding web host       

vec = TfidfVectorizer(decode_error='ignore', strip_accents='unicode', encoding='latin-1', min_df=10, max_df=1000)
X_train = vec.fit_transform(train_data)

# Get textual content of web hosts of the test set
test_data = list()
for host in test_hosts:
    if host in text:
        test_data.append(text[host])
    else:
        test_data.append('')

# Create the test matrix following the same approach as in the case of the training matrix
X_test = vec.transform(test_data)

In [298]:
X_test = np.array(X_test.todense())
X_train = np.array(X_train.todense())

In [278]:
print(X_test.shape, X_train.shape)

(560, 21384) (2125, 21384)


In [321]:
classes = ['business/finance','education/research','entertainment','health/medical','news/press','politics/government/law','sports','tech/science']

classes_to_label = {'business/finance' : 0,'education/research':1,'entertainment':2,'health/medical':3,'news/press':4,
'politics/government/law':5,'sports':6,'tech/science' : 7}

emb = [X_train[i] for i in range(len(X_train))] + [X_train[i] for i in range(len(X_test))]
hosts = train_hosts + test_hosts
labels = [classes_to_label[label] for label in y_train] + [-1 for i in test_hosts]
mask = [0 for i in y_train[:-200]] + [1 for i in range(200)] + [-1 for i in test_hosts]

In [322]:
df_graph_method = pd.DataFrame({'emb': emb, 'label': labels,'host' :hosts, 'mask' : mask })
df_graph_method.to_pickle('emb_tfidf_for_graph_method')

In [323]:
X_test[0]

array([0., 0., 0., ..., 0., 0., 0.])

# Load the data and put it in a torch geometric format

## transformer lstm emb

In [16]:
df_emb = pd.read_pickle('emb_lstm_for_graph_method')
df_emb.drop_duplicates('host',inplace = True) # delete duplicate
df_emb.reset_index(inplace = True)
df_emb.drop('index',axis = 1, inplace =True)
num_features = len(df_emb['emb'][0])
print(num_features)

200


## tfidf emb

In [324]:
df_emb = pd.read_pickle('emb_tfidf_for_graph_method')
df_emb.drop_duplicates('host',inplace = True) # delete duplicate
df_emb.reset_index(inplace = True)
df_emb.drop('index',axis = 1, inplace =True)
num_features = len(df_emb['emb'][0])
print(num_features)

21384


# put it in a torch geometric format

In [17]:
df_emb

Unnamed: 0,emb,label,host,mask
0,"[0.00012147931, -0.7683938, -0.09987722, 0.002...",0,7587,0
1,"[-0.026015002, 0.53278214, 0.0006760344, 0.001...",2,16150,0
2,"[-0.0029351865, -0.01593118, -0.002883664, -0....",0,9841,0
3,"[0.00035103768, 0.5212322, 0.0019309241, 0.013...",0,6441,0
4,"[0.0009134803, 0.508965, 0.010666804, 0.098105...",2,8533,0
...,...,...,...,...
2549,"[-0.006148623, 0.7345553, 0.02454278, 0.091676...",-1,23783,-1
2550,"[0.0001064896, 0.6425214, -0.016132945, 0.0028...",-1,16792,-1
2551,"[-0.041136276, 0.34874263, 0.0041652224, 0.060...",-1,6584,-1
2552,"[7.500023e-05, 0.574721, -0.0022576374, 0.0076...",-1,13527,-1


In [34]:
x = torch.tensor(np.vstack(df_emb['emb']), dtype = torch.float)

In [35]:
nodelist = [str(host) for host in df_emb['host']]

In [36]:
labels = torch.tensor(list(df_emb['label']),dtype = torch.long)

In [37]:
G = nx.read_weighted_edgelist('edgelist.txt', create_using=nx.DiGraph())

In [38]:
edge_index = nx.to_scipy_sparse_matrix(G, nodelist=nodelist, dtype=None, weight='weight', format='csr')

In [39]:
edge_index = from_scipy_sparse_matrix(edge_index)

In [40]:
edge_index, edge_attribute = edge_index[0], torch.tensor(edge_index[1],dtype = torch.float64)

  """Entry point for launching an IPython kernel.


In [41]:
dataset = Data(x=x, edge_index=edge_index,y = labels, edge_attr = edge_attribute)

train_idx = [index for index in range(len(df_emb)) if df_emb['mask'][index] == 0]
dataset.train_idx = torch.tensor(train_idx, dtype= torch.long)

test_mask = [df_emb['mask'][index] == 1 for index in range(len(df_emb)) ]
dataset.test_mask = torch.tensor(test_mask, dtype=torch.bool)

test_idx = [index for index in range(len(df_emb)) if df_emb['mask'][index] == 1]
dataset.test_idx = torch.tensor(test_idx, dtype= torch.long)


In [42]:
dataset.train_idx.shape

torch.Size([1812])

In [43]:
data.edge_attr

tensor([  2.,   2.,   2.,  ...,   2.,   1., 908.], device='cuda:0',
       dtype=torch.float64)

# GCN

In [46]:
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(num_features, 16)
        self.conv2 = GCNConv(16, 16)
        self.conv3 = GCNConv(16, 8)
        

    def forward(self, data):
        x, edge_index, edge_attribute= data.x, data.edge_index, data.edge_attr

        x = self.conv1(x, edge_index)#, edge_attribute)
        x = F.relu(x)
        x = F.dropout(x, p=0.,training=self.training)
        x = self.conv2(x, edge_index)#, edge_attribute)
        x = F.relu(x)
        x = F.dropout(x,p=0., training=self.training)
        x = self.conv3(x, edge_index)#,edge_attribute)
        


        return F.softmax(x, dim=1)

In [47]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)
data = dataset.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

model.train()
for epoch in range(100000):
    optimizer.zero_grad()
    out = model(data)
    loss = F.cross_entropy(out[data.train_idx], data.y[data.train_idx])
    loss.backward()
    optimizer.step()
    
    if epoch%1000 == 0:
        print(loss.data)
        val_loss = F.cross_entropy(out[data.test_idx], data.y[data.test_idx])
        print(val_loss.data)
    
    

tensor(2.0936, device='cuda:0')
tensor(2.0992, device='cuda:0')
tensor(1.6020, device='cuda:0')
tensor(1.7427, device='cuda:0')
tensor(1.6064, device='cuda:0')
tensor(1.7366, device='cuda:0')
tensor(1.5973, device='cuda:0')
tensor(1.7446, device='cuda:0')
tensor(1.5961, device='cuda:0')
tensor(1.7463, device='cuda:0')
tensor(1.5960, device='cuda:0')
tensor(1.7456, device='cuda:0')
tensor(1.5959, device='cuda:0')
tensor(1.7445, device='cuda:0')
tensor(1.5957, device='cuda:0')
tensor(1.7448, device='cuda:0')
tensor(1.5956, device='cuda:0')
tensor(1.7450, device='cuda:0')
tensor(1.6012, device='cuda:0')
tensor(1.7409, device='cuda:0')
tensor(1.5951, device='cuda:0')
tensor(1.7427, device='cuda:0')
tensor(1.5951, device='cuda:0')
tensor(1.7436, device='cuda:0')
tensor(1.5933, device='cuda:0')
tensor(1.7433, device='cuda:0')
tensor(1.5922, device='cuda:0')
tensor(1.7463, device='cuda:0')
tensor(1.5919, device='cuda:0')
tensor(1.7463, device='cuda:0')
tensor(1.5921, device='cuda:0')
tensor(1

In [211]:
dataset.y[dataset.train_idx]

tensor([0, 2, 0,  ..., 2, 1, 0], dtype=torch.int32)

In [49]:
model.eval()
_, pred = model(data).max(dim=1)
correct = float (pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
acc = correct / data.test_mask.sum().item()
print('Accuracy: {:.4f}'.format(acc))

Accuracy: 0.5220
