# Graph Neural Network Topic Classifier

In the following we will focus on building a model for topic classification based on a Graph Neural Network approach.

In particular in the following we will show you how to:

* Create a TF-IDF representation of the corpus, that will be used as node features in the Graph Neural Network model 
* Build, train a Graph Neural Network model and identify the best threshold for classifying documents 
* Test the performance of the model in a out-of-sample tests, following a truly inductive approach 

**NOTE: This Notebook can only be run after the 01_nlp_graph_creation notebook, as some of the results computed in the first notebook will be here reused.**

### Load Dataset

In [1]:
import nltk 

In [2]:
import numpy as np
import pandas as pd
import networkx as nx

In [3]:
corpus = pd.read_pickle("corpus.p")

In [4]:
corpus.head()

Unnamed: 0_level_0,clean_text,label,language,parsed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
test/14826,ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...,[trade],en,"(ASIAN, EXPORTERS, FEAR, DAMAGE, FROM, U.S.-JA..."
test/14828,CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...,[grain],en,"(CHINA, DAILY, SAYS, VERMIN, EAT, 7, -, 12, PC..."
test/14829,JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...,"[crude, nat-gas]",en,"(JAPAN, TO, REVISE, LONG, -, TERM, ENERGY, DEM..."
test/14832,THAI TRADE DEFICIT WIDENS IN FIRST QUARTER Th...,"[corn, grain, rice, rubber, sugar, tin, trade]",en,"(THAI, TRADE, DEFICIT, WIDENS, IN, FIRST, QUAR..."
test/14833,INDONESIA SEES CPO PRICE RISING SHARPLY Indon...,"[palm-oil, veg-oil]",en,"(INDONESIA, SEES, CPO, PRICE, RISING, SHARPLY,..."


In [5]:
from collections import Counter
topics = Counter([label for document_labels in corpus["label"] for label in document_labels]).most_common(10)

In [6]:
topics

[('earn', 3964),
 ('acq', 2369),
 ('money-fx', 717),
 ('grain', 582),
 ('crude', 578),
 ('trade', 485),
 ('interest', 478),
 ('ship', 286),
 ('wheat', 283),
 ('corn', 237)]

In [7]:
topicsList = [topic[0] for topic in topics]
topicsSet = set(topicsList)
dataset = corpus[corpus["label"].apply(lambda x: len(topicsSet.intersection(x))>0)]

In [8]:
def get_labels(corpus, topicsList=topicsList):
    return corpus["label"].apply(
        lambda labels: pd.Series({label: 1 for label in labels}).reindex(topicsList).fillna(0)
    )[topicsList]

In [9]:
labels = get_labels(dataset)

In [10]:
labels.head()

Unnamed: 0_level_0,earn,acq,money-fx,grain,crude,trade,interest,ship,wheat,corn
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
test/14826,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
test/14828,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
test/14829,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
test/14832,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
test/14839,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [11]:
def get_features(corpus):
    return corpus["parsed"]

In [12]:
def get_features_and_labels(corpus):
    return get_features(corpus), get_labels(corpus)

In [13]:
def train_test_split(corpus):
    train_idx = [idx for idx in corpus.index if "training/" in idx]
    test_idx = [idx for idx in corpus.index if "test/" in idx]
    return corpus.loc[train_idx], corpus.loc[test_idx]

In [14]:
train, test = train_test_split(dataset)

In [15]:
def my_spacy_tokenizer(pos_filter=["NOUN", "VERB", "PROPN"]):
    def tokenizer(doc):
        return [token.lemma_ for token in doc if (pos_filter is None) or (token.pos_ in pos_filter)] 
    return tokenizer

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
cntVectorizer = TfidfVectorizer(
    analyzer=my_spacy_tokenizer(),
    max_df = 0.25, min_df = 2, max_features = 10000
)

In [18]:
trainFeatures, _ = get_features_and_labels(train)
testFeatures, _ = get_features_and_labels(test)


In [19]:
trainedTransformed = cntVectorizer.fit_transform(trainFeatures)
testTransformed = cntVectorizer.transform(testFeatures)

In [20]:
features = pd.concat([
    pd.DataFrame.sparse.from_spmatrix(trainedTransformed, index=trainFeatures.index), 
    pd.DataFrame.sparse.from_spmatrix(testTransformed, index=testFeatures.index)
])

In [21]:
features.shape

(9034, 10000)

Creating the Graph

In [22]:
import stellargraph as sg
from stellargraph import StellarGraph, IndexedArray
from stellargraph.mapper import GraphSAGENodeGenerator
from stellargraph.layer import GraphSAGE

from tensorflow.keras import layers, optimizers, losses, metrics, Model

2024-11-16 22:50:43.187158: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-11-16 22:50:43.187173: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-11-16 22:50:46.819716: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2024-11-16 22:50:46.819732: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2024-11-16 22:50:46.819742: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (pelican): /proc/driver/nvidia/version does not exist
2024-11-16 22:50:46.820221: I tensorflow/core/platform/cpu_feature_gu

In [23]:
edges = pd.read_pickle("bipartiteEdges.p")

In [24]:
entityTypes = {entity: ith for ith, entity in enumerate(edges["type"].unique())}

In [25]:
entityTypes

{'keywords': 0, 'GPE': 1, 'ORG': 2, 'PERSON': 3}

In [26]:
documentFeatures = features.loc[list(set(corpus.index).intersection(features.index))] #.assign(document=1, entity=0)

In [27]:
documentFeatures.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
training/9850,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
training/6208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
test/18325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
training/859,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
training/128,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
entities = edges.groupby(["target", "type"])["source"].count().groupby(level=0).apply(
    lambda s: s.droplevel(0).reindex(entityTypes.keys()).fillna(0)
).unstack(level=1)

In [29]:
entityFeatures = (entities.T / entities.sum(axis=1)).T.assign(document=0, entity=1)

In [30]:
nodes = {"entity": entityFeatures, 
         "document": documentFeatures}

In [31]:
stellarGraph = StellarGraph(nodes, 
                            edges[edges["source"].isin(documentFeatures.index)], 
                            edge_type_column="type")

In [32]:
print(stellarGraph.info())

StellarGraph: Undirected multigraph
 Nodes: 24177, Edges: 87658

 Node types:
  entity: [15143]
    Features: float32 vector, length 6
    Edge types: entity-GPE->document, entity-ORG->document, entity-PERSON->document, entity-keywords->document
  document: [9034]
    Features: float32 vector, length 10000
    Edge types: document-GPE->entity, document-ORG->entity, document-PERSON->entity, document-keywords->entity

 Edge types:
    document-keywords->entity: [78828]
        Weights: range=[0.0827011, 1], mean=0.258479, std=0.0898449
        Features: none
    document-ORG->entity: [4275]
        Weights: range=[2, 24], mean=3.33427, std=2.38695
        Features: none
    document-GPE->entity: [3141]
        Weights: range=[2, 26], mean=3.1958, std=2.03227
        Features: none
    document-PERSON->entity: [1414]
        Weights: range=[2, 18], mean=3.17327, std=1.97911
        Features: none


In [33]:
from stellargraph.data import EdgeSplitter

In [34]:
splitter = EdgeSplitter(stellarGraph)

In [35]:
graphTest, samplesTest, labelsTest = splitter.train_test_split(p=0.2)

** Sampled 17531 positive and 17531 negative edges. **


In [36]:
print(stellarGraph.info())

StellarGraph: Undirected multigraph
 Nodes: 24177, Edges: 87658

 Node types:
  entity: [15143]
    Features: float32 vector, length 6
    Edge types: entity-GPE->document, entity-ORG->document, entity-PERSON->document, entity-keywords->document
  document: [9034]
    Features: float32 vector, length 10000
    Edge types: document-GPE->entity, document-ORG->entity, document-PERSON->entity, document-keywords->entity

 Edge types:
    document-keywords->entity: [78828]
        Weights: range=[0.0827011, 1], mean=0.258479, std=0.0898449
        Features: none
    document-ORG->entity: [4275]
        Weights: range=[2, 24], mean=3.33427, std=2.38695
        Features: none
    document-GPE->entity: [3141]
        Weights: range=[2, 26], mean=3.1958, std=2.03227
        Features: none
    document-PERSON->entity: [1414]
        Weights: range=[2, 18], mean=3.17327, std=1.97911
        Features: none


In [37]:
print(graphTest.info())

StellarGraph: Undirected multigraph
 Nodes: 24177, Edges: 70127

 Node types:
  entity: [15143]
    Features: float32 vector, length 6
    Edge types: entity-GPE->document, entity-ORG->document, entity-PERSON->document, entity-keywords->document
  document: [9034]
    Features: float32 vector, length 10000
    Edge types: document-GPE->entity, document-ORG->entity, document-PERSON->entity, document-keywords->entity

 Edge types:
    document-keywords->entity: [63078]
        Weights: range=[0.0827011, 1], mean=0.258399, std=0.0897861
        Features: none
    document-ORG->entity: [3404]
        Weights: range=[2, 22], mean=3.31463, std=2.35368
        Features: none
    document-GPE->entity: [2529]
        Weights: range=[2, 26], mean=3.21669, std=2.04549
        Features: none
    document-PERSON->entity: [1116]
        Weights: range=[2, 18], mean=3.18907, std=2.03272
        Features: none


## Creating a Topic Classification Model 

We start by splitting the data into train, validation and test

In [38]:
targets = labels.reindex(documentFeatures.index).fillna(0)
#documentFeatures.drop(["entity", "document"], axis=1)

In [39]:
targets.head()

Unnamed: 0_level_0,earn,acq,money-fx,grain,crude,trade,interest,ship,wheat,corn
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
training/9850,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
training/6208,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
test/18325,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
training/859,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
training/128,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
def train_test_split(corpus):
    graphIndex = [index for index in corpus.index]
    
    train_idx = [idx for idx in graphIndex if "training/" in idx]
    test_idx = [idx for idx in graphIndex if "test/" in idx]
    return corpus.loc[train_idx], corpus.loc[test_idx]

In [41]:
sampled, hold_out = train_test_split(targets)

In [42]:
allNeighbors = np.unique([n for node in sampled.index for n in stellarGraph.neighbors(node)])

In [43]:
subgraph = stellarGraph.subgraph(set(sampled.index).union(allNeighbors))

In [44]:
print(subgraph.info())

StellarGraph: Undirected multigraph
 Nodes: 17075, Edges: 63031

 Node types:
  entity: [10586]
    Features: float32 vector, length 6
    Edge types: entity-GPE->document, entity-ORG->document, entity-PERSON->document, entity-keywords->document
  document: [6489]
    Features: float32 vector, length 10000
    Edge types: document-GPE->entity, document-ORG->entity, document-PERSON->entity, document-keywords->entity

 Edge types:
    document-keywords->entity: [56639]
        Weights: range=[0.0918226, 1], mean=0.257404, std=0.0887759
        Features: none
    document-ORG->entity: [3126]
        Weights: range=[2, 22], mean=3.30742, std=2.29417
        Features: none
    document-GPE->entity: [2230]
        Weights: range=[2, 26], mean=3.23767, std=2.07487
        Features: none
    document-PERSON->entity: [1036]
        Weights: range=[2, 18], mean=3.17664, std=2.04459
        Features: none


In [45]:
from sklearn.model_selection import train_test_split

train, leftOut = train_test_split(
    sampled,
    train_size=0.1,
    test_size=None,
    random_state=42,
)

validation, test = train_test_split(
    leftOut, train_size=0.2, test_size=None, random_state=100,
)

In [46]:
validation = validation[validation.sum(axis=1) > 0]
test = test[test.sum(axis=1) > 0]

In [47]:
print(f"Validation: {validation.shape}")
print(f"Test: {test.shape}")

Validation: (1168, 10)
Test: (4673, 10)


### Training the Model

We start  by creating the model 

In [48]:
batch_size = 50
num_samples = [10, 5]

In [49]:
from stellargraph.mapper import HinSAGENodeGenerator

generator = HinSAGENodeGenerator(subgraph, batch_size, num_samples, head_node_type="document")

In [50]:
from stellargraph.layer import HinSAGE

graphsage_model = HinSAGE(
    layer_sizes=[32, 32], generator=generator, bias=True, dropout=0.5,
)

In [51]:
x_inp, x_out = graphsage_model.in_out_tensors()
prediction = layers.Dense(units=train.shape[1], activation="sigmoid")(x_out)

In [52]:
prediction.shape

TensorShape([None, 10])

In [53]:
model = Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=optimizers.Adam(learning_rate=0.005),
    loss=losses.binary_crossentropy,
    metrics=["acc"],
)


We now train the model 

In [54]:
train_gen = generator.flow(train.index, train, shuffle=True)

In [55]:
val_gen = generator.flow(validation.index, validation)

In [None]:
history = model.fit(
    train_gen, epochs=50, validation_data=val_gen, verbose=1, shuffle=False
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50


In [None]:
sg.utils.plot_history(history)

In [None]:
history = model.fit(
    train_gen, epochs=50, validation_data=val_gen, verbose=1, shuffle=False
)

In [None]:
sg.utils.plot_history(history)

### Threshold identification

In [None]:
test_gen = generator.flow(test.index, test)

In [None]:
test_metrics = model.evaluate(test_gen)
print("\nTest Set Metrics:")
for name, val in zip(model.metrics_names, test_metrics):
    print("\t{}: {:0.4f}".format(name, val))

In [None]:
test_predictions = pd.DataFrame(model.predict(test_gen), index=test.index, columns=test.columns)

In [None]:
test_results = pd.concat({
    "target": test, 
    "preds": test_predictions
}, axis=1)

In [None]:
from sklearn.metrics import f1_score, classification_report

In [None]:
f1s = {}

for th in [0.01,0.05,0.1,0.2,0.3,0.4,0.5]:
    f1s[th] = f1_score(test_results["target"], 1.0*(test_results["preds"]>th), average="macro")
    
pd.Series(f1s).plot()

As it can be seen, with a threshold of about 0.2 we obtain the best performances. We thus use this value for producing the classification report

In [None]:
print(classification_report(test_results["target"], 1.0*(test_results["preds"]>0.2)))

### Inductive Prediction

We now provide a prediction truly inductive, thus we will be using the full graph and we will also use the threshold of 0.2 we have identified above as the one providing the top f1-score.  

In [None]:
generator = HinSAGENodeGenerator(stellarGraph, batch_size, num_samples, head_node_type="document")

In [None]:
hold_out = hold_out[hold_out.sum(axis=1) > 0]

In [None]:
hold_out_gen = generator.flow(hold_out.index, hold_out)

In [None]:
hold_out_predictions = model.predict(hold_out_gen)

In [None]:
preds = pd.DataFrame(1.0*(hold_out_predictions > 0.2), index=hold_out.index, columns=hold_out.columns)

In [None]:
results = pd.concat({
    "target": hold_out, 
    "preds": preds
}, axis=1)

In [None]:
print(classification_report(results["target"], results["preds"]))