### Nikolaos Giannopoulos AM 5199
### Team: Trump Tariffed My Datasets

In [None]:
import os
import pickle
import re

import networkx as nx
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from stellargraph import StellarGraph
from stellargraph import data

# Preprocess Authors

In [17]:
#Read the file, dropping blank lines (they cannot be split into node/author
#and would break the two-way unpacking below)
with open("authors.txt", "r", encoding="utf-8") as f:
    raw_lines = [line for line in f if line.strip()]

#Parse into node and author.
#maxsplit=1 guarantees at most two fields per line: without it, a line whose
#author text itself contains '|--|' yields three fields, and zip(*...) then
#either silently drops the extra field or makes the unpacking fail.
nodes, authors = zip(*[line.strip().split('|--|', 1) for line in raw_lines])
df = pd.DataFrame({'node': nodes, 'author': authors})

#Clean function
def clean_author_list(author_string):
    """Split a comma-separated author string into normalized author names.

    Each name is lowercased, stripped of every character that is neither a
    word character nor whitespace, and has its whitespace collapsed to single
    spaces. Entries that end up empty (e.g. from ``"a,,b"``, a trailing comma
    or an empty input string) are dropped so that they cannot later be
    mistaken for a shared author.

    Args:
        author_string: Author names separated by commas.

    Returns:
        A list of cleaned author names, in their original order.
    """
    cleaned = []
    for raw_name in author_string.split(','):  #Split by commas first
        #Remove punctuation only AFTER the comma split, otherwise the
        #separators themselves would be deleted
        name = re.sub(r"[^\w\s]", "", raw_name.lower())
        #Normalize whitespace after punctuation removal: deleting e.g. a
        #free-standing "-" can leave leading, trailing or doubled spaces
        name = " ".join(name.split())
        if name:
            cleaned.append(name)
    return cleaned

#Apply cleaning: every row gets a list of normalized author names
df['cleaned_author'] = df['author'].apply(clean_author_list)

#Convert to dict {node id (int): [author1, author2, ...]}
authors_dict = dict(zip(df['node'].astype(int), df['cleaned_author']))

#Make sure the output folder exists so the dump does not fail on a fresh checkout
os.makedirs("Data", exist_ok=True)

#Save to pickle
with open("Data/authors_preprocessing.pkl", "wb") as f:
    pickle.dump(authors_dict, f)

print("Authors preprocessing was saved to Data/authors_preprocessing.pkl")
df.head()

Authors preprocessing was saved to Data/authors_preprocessing.pkl


Unnamed: 0,node,author,cleaned_author
0,0,"James H. Niblock,Jian-Xun Peng,Karen R. McMene...","[james h niblock, jianxun peng, karen r mcmene..."
1,1,"Jian-Xun Peng,Kang Li,De-Shuang Huang","[jianxun peng, kang li, deshuang huang]"
2,2,J. Heikkila,[j heikkila]
3,3,"L. Teslic,B. Hartmann,O. Nelles,I. Skrjanc","[l teslic, b hartmann, o nelles, i skrjanc]"
4,4,"Long Zhang,Kang Li,Er-Wei Bai,George W. Irwin","[long zhang, kang li, erwei bai, george w irwin]"


# Preprocess Abstracts

In [18]:
#Set up tools shared by the abstract preprocessing below
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

#Read abstracts.txt, dropping blank lines (they cannot be split into
#node/abstract and would break the two-way unpacking below)
with open("abstracts.txt", "r", encoding="utf-8") as f:
    raw_lines = [line for line in f if line.strip()]

#Parse node and abstract.
#maxsplit=1 keeps any later '|--|' inside the abstract text: without it, such
#a line yields three fields, and zip(*...) then either silently drops the
#extra field or makes the unpacking fail.
nodes, abstracts = zip(*[line.strip().split('|--|', 1) for line in raw_lines])
df = pd.DataFrame({'node': nodes, 'abstract': abstracts})

#Preprocessing function that returns list of tokens
def preprocess_abstract_to_list(text):
    """Turn a raw abstract into a list of lemmatized, stopword-free tokens.

    The text is lowercased, every character that is neither a word character
    nor whitespace is deleted, and the result is tokenized with
    ``word_tokenize``. Tokens contained in the module-level ``stop_words`` set
    are discarded; the remaining ones are lemmatized with the module-level
    ``lemmatizer``.

    Args:
        text: The raw abstract.

    Returns:
        The list of processed tokens, in their original order.
    """
    #Lowercase and remove punctuation in one step
    normalized = re.sub(r'[^\w\s]', '', text.lower())

    kept = []
    for token in word_tokenize(normalized):
        if token in stop_words:
            continue  #Stopwords are filtered out before lemmatization
        kept.append(lemmatizer.lemmatize(token))
    return kept

#Apply it: every row gets the token list of its abstract
df['tokenized_abstract'] = df['abstract'].apply(preprocess_abstract_to_list)

#Convert to dict {node: [token1, token2, ...]}
abstracts_tokenized = dict(zip(df['node'].astype(int), df['tokenized_abstract']))

#Make sure the output folder exists so the dump does not fail on a fresh checkout
os.makedirs("Data", exist_ok=True)

#Save to pickle file
with open("Data/abstracts_tokenized.pkl", "wb") as f:
    pickle.dump(abstracts_tokenized, f)

#Testing: show the tokens stored for node 0
print("Tokenized abstracts saved to Data/abstracts_tokenized.pkl")
abstracts_tokenized[0]

Tokenized abstracts saved to Data/abstracts_tokenized.pkl


['development',
 'automated',
 'system',
 'quality',
 'assessment',
 'aerodrome',
 'ground',
 'lighting',
 'agl',
 'accordance',
 'associated',
 'standard',
 'recommendation',
 'presented',
 'system',
 'composed',
 'image',
 'sensor',
 'placed',
 'inside',
 'cockpit',
 'aircraft',
 'record',
 'image',
 'agl',
 'normal',
 'descent',
 'aerodrome',
 'modelbased',
 'methodology',
 'used',
 'ascertain',
 'optimum',
 'match',
 'template',
 'agl',
 'actual',
 'image',
 'data',
 'order',
 'calculate',
 'position',
 'orientation',
 'camera',
 'instant',
 'image',
 'acquired',
 'camera',
 'position',
 'orientation',
 'data',
 'used',
 'along',
 'pixel',
 'grey',
 'level',
 'imaged',
 'luminaire',
 'estimate',
 'value',
 'luminous',
 'intensity',
 'given',
 'luminaire',
 'compared',
 'expected',
 'brightness',
 'luminaire',
 'ensure',
 'operating',
 'required',
 'standard',
 'metric',
 'quality',
 'agl',
 'pattern',
 'determined',
 'experiment',
 'real',
 'image',
 'data',
 'presented',
 'demonst

In [6]:
#Report the installed PyTorch build and whether a CUDA device can be used
print(torch.__version__)
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print(torch.cuda.get_device_name(0))
else:
    #get_device_name(0) raises when no CUDA device is usable, so report the
    #situation instead of crashing the cell on CPU-only machines
    print("No CUDA device available")

2.8.0.dev20250526+cu128
True
NVIDIA GeForce RTX 5070 Ti


In [19]:
#Preview: the first five entries of the tokenized-abstract dict, in insertion order
{node: abstracts_tokenized[node] for node in list(abstracts_tokenized)[:5]}

{0: ['development',
  'automated',
  'system',
  'quality',
  'assessment',
  'aerodrome',
  'ground',
  'lighting',
  'agl',
  'accordance',
  'associated',
  'standard',
  'recommendation',
  'presented',
  'system',
  'composed',
  'image',
  'sensor',
  'placed',
  'inside',
  'cockpit',
  'aircraft',
  'record',
  'image',
  'agl',
  'normal',
  'descent',
  'aerodrome',
  'modelbased',
  'methodology',
  'used',
  'ascertain',
  'optimum',
  'match',
  'template',
  'agl',
  'actual',
  'image',
  'data',
  'order',
  'calculate',
  'position',
  'orientation',
  'camera',
  'instant',
  'image',
  'acquired',
  'camera',
  'position',
  'orientation',
  'data',
  'used',
  'along',
  'pixel',
  'grey',
  'level',
  'imaged',
  'luminaire',
  'estimate',
  'value',
  'luminous',
  'intensity',
  'given',
  'luminaire',
  'compared',
  'expected',
  'brightness',
  'luminaire',
  'ensure',
  'operating',
  'required',
  'standard',
  'metric',
  'quality',
  'agl',
  'pattern',
  

# StellarGraph to split the edges for the embeddings

In [22]:
#Load the comma-separated edge list into a networkx Graph with integer node ids
nx_G = nx.read_edgelist(
    'edgelist.txt',
    create_using=nx.Graph(),
    nodetype=int,
    delimiter=',',
)

#Wrap it in a StellarGraph, using "paper" / "cites" as the default node / edge types
G = StellarGraph.from_networkx(
    nx_G,
    edge_type_default="cites",
    node_type_default="paper",
)

In [27]:
#Make sure the output folder exists BEFORE the expensive split, so a missing
#folder cannot waste the computation when the results are written
os.makedirs("Data", exist_ok=True)

#Sample edge pairs from G; the fixed seed keeps the split reproducible
edge_splitter_test = data.EdgeSplitter(G)
graph_train_test, train_pairs, train_labels = edge_splitter_test.train_test_split(
    p=0.25, method="global", keep_connected=True, seed=42
)  #Pick 25% as samples

#Save the files with the data: the reduced graph as a plain "u,v" edge list,
#the sampled pairs and their labels as header-less CSV files
nx.write_edgelist(graph_train_test.to_networkx(),'Data/edgelist_train.txt',delimiter=',',data=False)
pd.DataFrame(train_pairs).to_csv('Data/train_pairs.csv', index=False, header=False)
pd.DataFrame(train_labels).to_csv('Data/train_labels.csv', index=False, header=False)

** Sampled 272988 positive and 272988 negative edges. **
