# Graph Neural Network Topic Classifier

In the following we will focus on building a model for topic classification based on a Graph Neural Network approach.

In particular, we will show you how to:

* Create a TF-IDF representation of the corpus, which will be used as node features in the Graph Neural Network model
* Build and train a Graph Neural Network model, and identify the best threshold for classifying documents

**NOTE: This notebook can only be run after the 01_nlp_graph_creation notebook, because some of the results computed there are reused here.**

### Load Dataset

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the corpus from a pickle file in the working directory.
# NOTE: unpickling can execute arbitrary code; only load files you produced
# yourself.
corpus = pd.read_pickle("corpus.p")

In [None]:
corpus.head()

In [None]:
from collections import Counter
# Count how often each label occurs across all documents and keep the ten
# most frequent (label, count) pairs.
topics = Counter(
    label
    for document_labels in corpus["label"]
    for label in document_labels
).most_common(10)

In [None]:
topics

In [None]:
# Names of the selected topics, in the order returned by most_common().
topicsList = [name for name, _count in topics]
topicsSet = set(topicsList)
# Keep only the documents tagged with at least one of the selected topics.
dataset = corpus[corpus["label"].apply(lambda doc_labels: not topicsSet.isdisjoint(doc_labels))]

In [None]:
def get_labels(corpus, topicsList=topicsList):
    """Build the multi-hot topic matrix for the documents in ``corpus``.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Must contain a ``"label"`` column whose cells are iterables of labels.
    topicsList : list, optional
        Topics to encode, one output column each, in this order. Defaults to
        the module-level ``topicsList`` as bound when this cell was executed.

    Returns
    -------
    pandas.DataFrame
        Float frame indexed like ``corpus`` with one column per topic:
        1.0 when the document carries the topic, 0.0 otherwise. Labels that
        are not in ``topicsList`` are ignored. An empty ``corpus`` yields an
        empty frame that still has the topic columns.
    """
    topic_order = list(topicsList)
    # Build the whole matrix in one pass instead of creating and reindexing
    # a pandas Series for every document.
    rows = [
        [float(topic in present) for topic in topic_order]
        for present in map(set, corpus["label"])
    ]
    return pd.DataFrame(rows, index=corpus.index, columns=topic_order, dtype=float)

In [None]:
labels = get_labels(dataset)

In [None]:
labels.head()

In [None]:
def get_features(corpus):
    """Return the ``"parsed"`` column of ``corpus``.

    Presumably this column holds the parsed documents produced by the first
    notebook -- TODO confirm against 01_nlp_graph_creation.
    """
    return corpus["parsed"]

In [None]:
def get_features_and_labels(corpus):
    """Return the ``(features, labels)`` pair for ``corpus``.

    ``features`` comes from ``get_features`` and ``labels`` from
    ``get_labels`` (called with its default topic list).
    """
    parsed_documents = get_features(corpus)
    topic_matrix = get_labels(corpus)
    return parsed_documents, topic_matrix

In [None]:
def train_test_split(corpus):
    """Split ``corpus`` into ``(train, test)`` according to its index labels.

    A row goes to the training part when its index label contains
    ``"training/"`` and to the test part when it contains ``"test/"``.
    Rows matching neither substring are dropped.
    """
    train_idx, test_idx = [], []
    for idx in corpus.index:
        if "training/" in idx:
            train_idx.append(idx)
        if "test/" in idx:
            test_idx.append(idx)
    return corpus.loc[train_idx], corpus.loc[test_idx]

In [None]:
train, test = train_test_split(dataset)

In [None]:
def my_spacy_tokenizer(pos_filter=("NOUN", "VERB", "PROPN")):
    """Create a tokenizer that returns the lemmas of the tokens in a document.

    Parameters
    ----------
    pos_filter : collection of str or None, optional
        Part-of-speech tags to keep (compared against ``token.pos_``).
        ``None`` disables filtering and keeps every token. The default is an
        immutable tuple so it cannot be altered between calls.

    Returns
    -------
    callable
        ``tokenizer(doc)`` iterates over ``doc`` and returns a list with
        ``token.lemma_`` for every token that passes the filter.
    """
    def tokenizer(doc):
        # Decide once per document whether filtering applies, rather than
        # re-checking for every token.
        if pos_filter is None:
            return [token.lemma_ for token in doc]
        return [token.lemma_ for token in doc if token.pos_ in pos_filter]
    return tokenizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Despite the ``cnt`` prefix this is a TF-IDF vectorizer. The analyzer is the
# callable built by my_spacy_tokenizer() with its default POS filter, so the
# vectorizer receives the lemma lists it returns.
cntVectorizer = TfidfVectorizer(
    analyzer=my_spacy_tokenizer(),
    max_df = 0.25, min_df = 2, max_features = 10000
)

In [None]:
# Only the parsed documents are needed here; calling get_features directly
# avoids building (and immediately discarding) the label matrix per split.
trainFeatures = get_features(train)
testFeatures = get_features(test)


In [None]:
# Fit the vocabulary/IDF weights on the training documents only, then apply
# the fitted vectorizer to the test documents.
trainedTransformed = cntVectorizer.fit_transform(trainFeatures)
testTransformed = cntVectorizer.transform(testFeatures)

In [None]:
# Stack the train and test TF-IDF matrices into one sparse DataFrame whose
# index carries the document identifiers of both splits (train rows first).
features = pd.concat([
    pd.DataFrame.sparse.from_spmatrix(trainedTransformed, index=trainFeatures.index), 
    pd.DataFrame.sparse.from_spmatrix(testTransformed, index=testFeatures.index)
])

In [None]:
features.shape

### Creating the Graph

In [None]:
from torch_geometric.data import HeteroData

In [None]:
# Load the edge list from a pickle file in the working directory. The cells
# below read its "source", "target" and "type" columns.
# NOTE: unpickling can execute arbitrary code; only load files you produced
# yourself.
edges = pd.read_pickle("bipartiteEdges.p")

In [None]:
# Map every distinct edge type to an integer code, in order of first appearance.
entityTypes = {
    type_name: code
    for code, type_name in enumerate(edges["type"].unique())
}

In [None]:
entityTypes

In [None]:
# Keep the feature rows whose document id also appears in the corpus.
# A boolean mask preserves the row order of ``features``; going through a
# Python set would make the order depend on string-hash iteration order, which
# changes between interpreter runs and makes the seeded splits below
# non-reproducible.
documentFeatures = features.loc[features.index.isin(corpus.index)] #.assign(document=1, entity=0)

In [None]:
documentFeatures.head()

In [None]:
# Step 1: for every (target, type) pair count the edge rows with a non-null
#         "source".
# Step 2: for every target, reindex those counts over all known entity types,
#         filling the missing types with 0.
# Step 3: unstack the type level into columns.
# Presumably the result has one row per target and one column per entity
# type -- TODO confirm the exact layout on the pandas version in use.
entities = edges.groupby(["target", "type"])["source"].count().groupby(level=0).apply(
    lambda s: s.droplevel(0).reindex(entityTypes.keys()).fillna(0)
).unstack(level=1)

In [None]:
# Normalise every row by its own total, then append two constant indicator
# columns (document=0, entity=1).
entityFeatures = entities.div(entities.sum(axis=1), axis=0).assign(document=0, entity=1)

In [None]:
# Feature tables grouped by node type. This dictionary is not referenced
# again in the visible part of the notebook.
nodes = {"entity": entityFeatures, 
         "document": documentFeatures}

In [None]:
# Align the label matrix with the document node order; documents without a
# row in ``labels`` get an all-zero target vector.
targets = labels.reindex(documentFeatures.index).fillna(0)

In [None]:
def train_test_split(corpus):
    """Split ``corpus`` into ``(train, test)`` according to its index labels.

    Index labels containing ``"training/"`` select the first part, labels
    containing ``"test/"`` select the second part; all other rows are dropped.
    """
    graphIndex = list(corpus.index)

    def labels_containing(marker):
        # Preserve the original index order of the matching labels.
        return [idx for idx in graphIndex if marker in idx]

    return corpus.loc[labels_containing("training/")], corpus.loc[labels_containing("test/")]

In [None]:
# Split the targets by index label ("training/" vs "test/"). ``hold_out`` is
# not referenced again in the visible part of the notebook.
sampled, hold_out = train_test_split(targets)

In [None]:
# NOTE: this import rebinds the name ``train_test_split``. From here on it
# refers to scikit-learn's function, not to the helper defined earlier in
# this notebook.
from sklearn.model_selection import train_test_split

# Use 10% of the sampled documents for training; the rest is set aside.
train, leftOut = train_test_split(
    sampled,
    train_size=0.1,
    random_state=42,
)

# Of the remainder, 20% becomes the validation set and the rest the test set.
validation, test = train_test_split(
    leftOut, train_size=0.2, test_size=None, random_state=100,
)

In [None]:
# Drop, in each split, the documents that carry none of the selected topics.
train, validation, test = (
    split[split.sum(axis=1) > 0] for split in (train, validation, test)
)

In [None]:
# Report the (rows, columns) shape of each split after filtering.
print(f"Train: {train.shape}")
print(f"Validation: {validation.shape}")
print(f"Test: {test.shape}")

In [None]:
# Document id -> row position in ``documentFeatures`` (used as the node index).
docs_maps = dict(zip(documentFeatures.index, range(len(documentFeatures.index))))

In [None]:
# Entity id -> row position in ``entityFeatures`` (used as the node index).
ents_maps = dict(zip(entityFeatures.index, range(len(entityFeatures.index))))

In [None]:
# Topic name -> column position in ``labels``.
labs_maps = dict(zip(labels.columns, range(len(labels.columns))))

In [None]:
# Translate edge endpoints into node positions; endpoints that are not part
# of the graph are marked with -1 so they can be filtered out later.
edges["source_id"] = edges["source"].map(lambda document: docs_maps.get(document, -1))
edges["target_id"] = edges["target"].map(lambda entity: ents_maps.get(entity, -1))

In [None]:
import torch
import torch_sparse

def df_to_torch(df: pd.DataFrame):
    """Convert a pandas object into a dense ``torch.Tensor``.

    The previous implementation first tried a sparse conversion through
    ``df.sparse_to_coo()``. pandas has no such attribute (the accessor is
    ``df.sparse.to_coo()``), so that branch always raised ``AttributeError``
    and the function always fell back to the dense path. The dense conversion
    is now explicit, so the notebook behaves exactly as before without
    relying on a swallowed exception.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Object whose ``.values`` is a NumPy array with a dtype supported by
        ``torch.from_numpy``.

    Returns
    -------
    torch.Tensor
        Dense tensor with the same shape and dtype as ``df.values``.
    """
    return torch.from_numpy(df.values)

In [None]:
# Heterogeneous graph container with two node types: "document" and "entity".
data = HeteroData()

# Node feature matrices, one row per node, in the order used by
# docs_maps / ents_maps.
data["document"].x = df_to_torch(documentFeatures)
data["entity"].x = df_to_torch(entityFeatures)

In [None]:
# Register one ("document", <type>, "entity") relation per edge type, using
# only the edges whose two endpoints were both resolved (id != -1).
for _type, group in edges[(edges["source_id"] != -1) & (edges["target_id"] != -1)].groupby("type"):
    data[("document", _type, "entity")].edge_index = df_to_torch(
        group[["source_id", "target_id"]].transpose()
    )

In [None]:
# Multi-label targets for the document nodes, cast to float as required by
# the binary cross-entropy loss used in training().
data["document"].y = df_to_torch(targets).to(torch.float)

In [None]:
# Boolean node masks: a document belongs to a split when it has a row in that
# split's label frame (every remaining row has a positive label sum).
for _mask_name, _split in (("train_mask", train), ("val_mask", validation), ("test_mask", test)):
    _membership = _split.sum(axis=1).reindex(documentFeatures.index).fillna(0)
    data["document"][_mask_name] = df_to_torch(_membership).to(torch.bool)

In [None]:
data

In [None]:
import torch_geometric.transforms as T

In [None]:
# Apply PyG's ToUndirected transform so that entity nodes can also send
# messages back to document nodes.
data = T.ToUndirected()(data)

In [None]:
from torch_geometric.nn import SAGEConv, to_hetero
import torch.nn.functional as F

class GNN(torch.nn.Module):
    """Two-layer GraphSAGE network with sigmoid outputs.

    Parameters
    ----------
    hidden_channels : int
        Output size of the first convolution.
    out_channels : int
        Output size of the second convolution (one unit per label).
    """

    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        # (-1, -1) leaves the input sizes unspecified; they are inferred on
        # the first forward pass (lazy initialisation).
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        """Return per-node probabilities in [0, 1] for every output channel."""
        # Features may arrive as float64 or integer tensors; the layers
        # operate on float32.
        hidden = self.conv1(x.float(), edge_index).relu()
        logits = self.conv2(hidden, edge_index)
        return logits.sigmoid()


# One output unit per topic; to_hetero replicates the homogeneous model over
# the node/edge types in ``data.metadata()`` and sums messages across
# relations.
model = GNN(hidden_channels=64, out_channels=len(labs_maps))
model = to_hetero(model, data.metadata(), aggr='sum')

In [None]:
# A single forward pass without gradient tracking lets the lazily-sized
# layers infer their input dimensions before the optimizer is created.
with torch.no_grad():  # Initialize lazy modules.
    out = model(data.x_dict, data.edge_index_dict)

In [None]:
# Everything runs on the CPU; ``data`` is never moved to a device in this
# notebook, so the model stays on the CPU as well.
device = torch.device("cpu")

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [None]:
from dataclasses import dataclass

@dataclass
class Accuracy:
    """Running accuracy counter that can be summed across batches."""

    # Number of correct predictions seen so far.
    correct: int
    # Number of predictions evaluated so far.
    total: int

    @property
    def score(self):
        """Fraction of correct predictions.

        Returns ``nan`` when nothing has been evaluated yet (``total == 0``),
        e.g. for the ``Accuracy(0, 0)`` seed of an empty loader, instead of
        raising ``ZeroDivisionError``.
        """
        if self.total == 0:
            return float("nan")
        return float(self.correct) / self.total

    def __add__(self, other: 'Accuracy'):
        """Return a new ``Accuracy`` with both counters summed.

        Raises
        ------
        ValueError
            If ``other`` is not an ``Accuracy`` instance.
        """
        if not isinstance(other, Accuracy):
            raise ValueError("Cannot add objects other than Accuracy")

        return Accuracy(self.correct+other.correct, self.total+other.total)

In [None]:
def score(correct):
    """Summarise an array of per-element hit flags as an ``Accuracy``.

    ``correct`` must expose ``.sum()`` and ``.shape``; the number of hits is
    its sum and the number of evaluated elements is the product of its shape.
    """
    n_hits = int(correct.sum())
    n_elements = int(np.prod(correct.shape))
    return Accuracy(n_hits, n_elements)

In [None]:
def training(data, train_mask):
    """Run one optimisation step on ``data`` and return the loss as a float.

    Uses the module-level ``model`` and ``optimizer``. The binary
    cross-entropy is computed only on the document nodes selected by
    ``train_mask``.
    """
    model.train()

    # Gradients accumulate by default, so clear them before the new step.
    optimizer.zero_grad()

    document_out = model(data.x_dict, data.edge_index_dict)['document']
    predicted = document_out[train_mask]
    expected = data['document'].y[train_mask]

    loss = F.binary_cross_entropy(predicted, expected)
    loss.backward()
    optimizer.step()

    return float(loss)

@torch.no_grad()
def eval(data, mask):
    """Score the module-level ``model`` on the document nodes picked by ``mask``.

    Probabilities above 0.5 count as positive predictions; every
    (document, label) cell is compared with the target and the hits are
    summarised through ``score``.

    NOTE: the name shadows Python's built-in ``eval`` inside this notebook.
    """
    model.eval()

    probabilities = model(data.x_dict, data.edge_index_dict)["document"][mask]
    predicted = 1.0 * (probabilities > 0.5)
    hits = predicted == data["document"].y[mask]

    return score(hits)

In [None]:
train_mask = data['document'].train_mask
val_mask = data['document'].val_mask

# Full-batch training: every epoch performs one optimisation step on the
# whole graph and then measures accuracy on the train and validation nodes.
for epoch in range(10):
    loss = training(data, train_mask)

    train_score = eval(data, train_mask)
    val_score = eval(data, val_mask)

    print(f"Epoch {epoch} => Training: {train_score.score} Validation: {val_score.score}")

### With batches

In [None]:
from torch_geometric.loader import NeighborLoader

In [None]:
# Seed nodes for mini-batch sampling: (node type, boolean mask) pairs.
train_input_nodes = ('document', data['document'].train_mask)
val_input_nodes = ('document', data['document'].val_mask)
# Keyword arguments shared by the loaders below.
kwargs = {'batch_size': 128}

In [None]:
# Two sampling hops with up to 10 neighbours each, matching the two
# convolution layers of the model. Only the training loader shuffles.
train_loader = NeighborLoader(data, num_neighbors=[10] * 2, shuffle=True,
                                  input_nodes=train_input_nodes, **kwargs)
val_loader = NeighborLoader(data, num_neighbors=[10] * 2,
                                input_nodes=val_input_nodes, **kwargs)

In [None]:
# Accuracy of the current model, accumulated batch by batch. Only the first
# ``batch_size`` document nodes of a sampled batch are scored.
train_score = Accuracy(0, 0)
for batch in train_loader:
    batch_size = batch['document'].batch_size
    train_mask = range(batch_size)
    train_score = train_score + eval(batch, train_mask)

val_score = Accuracy(0, 0)
for batch in val_loader:
    batch_size = batch['document'].batch_size
    train_mask = range(batch_size)
    val_score = val_score + eval(batch, train_mask)

In [None]:
# Mini-batch training. In every epoch:
#   1. run one optimisation step per training batch,
#   2. measure accuracy on the training loader,
#   3. measure accuracy on the validation loader.
# Only the first ``batch_size`` document nodes of a batch are used for the
# loss and for scoring.
for epoch in range(1):
    loss = 0
    n_seen = 0
    for nth, batch in enumerate(train_loader):
        batch_size = batch['document'].batch_size
        train_mask = range(batch_size)
        # training() returns the batch mean, so weight it by the batch size...
        loss += training(batch, train_mask)*batch_size
        n_seen += batch_size
    # ...and divide by the number of samples to report the mean loss per
    # sample. Without this division the value grows with the dataset size.
    loss = loss / n_seen if n_seen else float("nan")

    # Training error
    train_score = Accuracy(0, 0)
    for nth, batch in enumerate(train_loader):
        batch_size = batch['document'].batch_size
        train_mask = range(batch_size)
        train_score += eval(batch, train_mask)

    # Validation error
    val_score = Accuracy(0, 0)
    for nth, batch in enumerate(val_loader):
        batch_size = batch['document'].batch_size
        train_mask = range(batch_size)
        val_score += eval(batch, train_mask)
    
    print(f"Epoch {epoch} => Loss: {loss} Train: {train_score.score} Val: {val_score.score}")

### Threshold identification

In [None]:
# Seed nodes for the test loader: the document nodes flagged by test_mask.
test_input_nodes = ('document', data['document'].test_mask)
kwargs = {'batch_size': 128}

In [None]:
# Same neighbourhood sampling as for training/validation, without shuffling.
test_loader = NeighborLoader(data, num_neighbors=[10] * 2, input_nodes=test_input_nodes, **kwargs)

In [None]:
@torch.no_grad()
def get_output(data, mask):
    """Return the model outputs for the selected document nodes as a DataFrame.

    Uses the module-level ``model`` in evaluation mode.

    Parameters
    ----------
    data : graph object exposing ``x_dict`` and ``edge_index_dict``
    mask : index or mask selecting rows of the "document" output

    Returns
    -------
    pandas.DataFrame
        One row per selected document and one column per output unit, with
        default integer row and column labels and plain numeric cells.
    """
    model.eval()

    out = model(data.x_dict, data.edge_index_dict)["document"][mask]

    # Convert to NumPy first: building a DataFrame straight from a torch
    # tensor can (depending on the pandas/torch versions) produce object cells
    # holding 0-d tensors, which breaks the later threshold comparisons.
    return pd.DataFrame(out.detach().cpu().numpy())

def reindex(df, indices):
    """Replace the row labels of ``df`` with ``indices`` and return ``df``.

    The frame is modified in place; the return value is the same object, which
    allows the call to be nested inside other expressions.
    """
    df.index = indices
    return df

In [None]:
def remap_index(df, docs_maps, labs_maps):
    """Translate positional row/column labels of ``df`` back to their names.

    ``docs_maps`` and ``labs_maps`` map names to positions; both are inverted
    and applied to ``df.index`` and ``df.columns`` respectively. ``df`` is
    modified in place and returned. A position missing from either mapping
    raises ``KeyError``.
    """
    position_to_doc = dict(zip(docs_maps.values(), docs_maps.keys()))
    position_to_label = dict(zip(labs_maps.values(), labs_maps.keys()))

    new_index = [position_to_doc[position] for position in df.index]
    df.index = new_index
    new_columns = [position_to_label[position] for position in df.columns]
    df.columns = new_columns
    return df

In [None]:
# Collect the predictions for every test batch as DataFrames labelled with
# document ids (rows) and topic names (columns).
preds = []
for nth, batch in enumerate(test_loader):
    batch_size = batch['document'].batch_size
    # The first ``batch_size`` document nodes of a batch are the ones scored.
    train_mask = range(batch_size)
    preds.append(
        remap_index(
            reindex(
                get_output(batch, train_mask), 
                # NOTE(review): this assumes ``input_id`` holds positions that
                # docs_maps can translate back to document ids -- confirm for
                # the installed PyG version.
                batch["document"].input_id.tolist()
            ),
            docs_maps,
            labs_maps
        )
    )

In [None]:
# Stack the per-batch prediction frames into a single frame.
test_predictions = pd.concat(preds)

In [None]:
# Put targets and predictions side by side, aligned on the document id; the
# dictionary keys become the outer level of the column index.
test_results = pd.concat({
    "target": test, 
    "preds": test_predictions
}, axis=1)

In [None]:
from sklearn.metrics import f1_score, classification_report

In [None]:
# Macro-averaged F1 for a range of decision thresholds.
# NOTE(review): the threshold is selected on the same test split that is used
# for the final report below, which makes that report optimistic; consider
# selecting it on the validation split instead.
f1s = {}

for th in [0.01,0.05,0.1,0.2,0.3,0.4,0.5]:
    f1s[th] = f1_score(test_results["target"], 1.0*(test_results["preds"]>th), average="macro")
    
pd.Series(f1s).plot()

In [None]:
# Per-topic precision/recall/F1 at the hard-coded threshold 0.2.
print(classification_report(test_results["target"], 1.0*(test_results["preds"]>0.2)))