# Preparation

In [54]:
# import dependencies
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import networkx as nx
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow.keras import layers, Model

In [41]:
# Read the two Cora files; both are tab-separated and have no header row.
# cora.cites supplies the citation pairs; its two columns are named "start" and "end".
cites_df = pd.read_csv(
    './cora/cora.cites',
    sep='\t',
    header=None,
    names=["start", "end"],
)
# cora.content: the first column is the paper ID, the last column is the class
# label, and every column in between is a feature.
content_df = pd.read_csv(
    './cora/cora.content',
    sep='\t',
    header=None,
)

In [49]:
# Directed citation graph whose nodes carry each paper's features and label.
graph = nx.DiGraph()

# Add nodes with features.
# The columns are sliced straight from the frame instead of looping with
# iterrows(): iterrows() packs each row into a single Series, and on a frame that
# mixes numeric and label columns that Series is object-dtype, so the feature
# vectors would be stored as object arrays instead of numeric ones.
node_ids = content_df.iloc[:, 0].tolist()            # first column: paper ID
feature_rows = content_df.iloc[:, 1:-1].to_numpy()   # middle columns: features
node_labels = content_df.iloc[:, -1].tolist()        # last column: class label
graph.add_nodes_from(
    (node_id, {"feature": features, "label": label})
    for node_id, features, label in zip(node_ids, feature_rows, node_labels)
)

# Add edges from citations (one directed edge start -> end per row of cites_df).
graph.add_edges_from(zip(cites_df['start'].tolist(), cites_df['end'].tolist()))

In [55]:
# Two-step target encoding: class labels -> integer codes -> one-hot rows.
label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder(sparse_output=False)

# Integer code per paper, derived from the last column of the content table.
y_encoded = label_encoder.fit_transform(content_df.iloc[:, -1])
# The one-hot encoder needs a 2-D input, hence the added trailing axis.
y_onehot = onehot_encoder.fit_transform(y_encoded[:, np.newaxis])

# The width of the one-hot matrix is the number of classes.
num_classes = y_onehot.shape[-1]

In [24]:
# Paper identifiers, in content_df row order (first column of the content table).
paper_ids = content_df.iloc[:, 0].values
num_nodes = paper_ids.shape[0]

# Sorted distinct class labels (last column of the content table).
subjects = np.unique(content_df.iloc[:, -1].values)

In [None]:
# Create the directed graph from the cites data.
# Nodes are inserted in content_df row order *before* the edges, so the node order
# of G -- and with it the row/column order of adj_matrix -- matches the row order
# of content_df (and of anything derived from it row by row). Building G from the
# edge list alone would instead order nodes by first appearance in cites_df.
node_order = content_df.iloc[:, 0].tolist()
G = nx.DiGraph()
G.add_nodes_from(node_order)
G.add_edges_from(zip(cites_df['start'].tolist(), cites_df['end'].tolist()))
# adjacency matrix: entry [i, j] is non-zero when G has an edge
# node_order[i] -> node_order[j]; rows/columns follow node_order explicitly.
adj_matrix = nx.to_numpy_array(G, nodelist=node_order)


# Modelling

## GCN

3. Develop a machine learning approach to predict the subjects of papers.
4. Train your approach on 9 folds, evaluate it on 1 fold, repeat this process 10 times, and concatenate your predictions such that you have a prediction for every data point in the end.
5. Store your predictions in a file as tab-separated values (TSV) in the format `<paper_id> <class_label>`, where class_label is a string.
6. Evaluate your approach in terms of accuracy indicating the percentage of nodes that were predicted correctly.

1. Create a GitHub repository, grant access to the GitHub user heindorf and send an email with the name of the repository to heindorf@uni-paderborn.de.
2. Upload your code (Python preferred) to the repository.
3. Upload your predictions (TSV file) to the repository.
4. Document your approach in the README file. \
Describe what dependencies are required and how to run your code — ideally with a single command. \
Moreover, briefly explain the core idea of your approach in the README file.