In [1]:
import json
import numpy as np
from numpy import mean
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from skmultilearn.problem_transform import LabelPowerset, BinaryRelevance, ClassifierChain
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, CategoricalNB
from sklearn.svm import SVC
import collections
from math import log10
import keras_tuner
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.layers import Dense
from keras.models import Sequential
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, hamming_loss, f1_score
from SMLP2_Func import generate_coauthors, onehot_func, coauthors_onehot, decode_func, convert_to_csv
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Read in Training and Test Data as Dataframe

In [2]:
f = open('train.json')
train_data = json.load(f)
train_X = pd.DataFrame(columns=["year", "venue", "title", "abstract"])
train_y = pd.DataFrame(columns=["authors"])
authors_list = []
year_list = []
venue_list = []
title_list = []
abstract_list = []
for i in range(len(train_data)): 
    authors_list.append(np.array(train_data[i]['authors']))
    year_list.append(np.array(train_data[i]['year']))
    venue_list.append(np.array(train_data[i]['venue']))
    title_list.append(np.array(train_data[i]['title']))
    abstract_list.append(np.array(train_data[i]['abstract']))
train_y["authors"] = authors_list
train_X["year"] = year_list
train_X["venue"] = venue_list
train_X["title"] = title_list
train_X["abstract"] = abstract_list

In [3]:
f = open('test.json')
test_data = json.load(f)
test_X = pd.DataFrame(columns=["year", "venue", "title", "abstract"])
coauthors_list = []
year_list = []
venue_list = []
title_list = []
abstract_list = []
for i in range(len(test_data)): 
    coauthors_list.append(np.array(test_data[i]['coauthors']))
    year_list.append(np.array(test_data[i]['year']))
    venue_list.append(np.array(test_data[i]['venue']))
    title_list.append(np.array(test_data[i]['title']))
    abstract_list.append(np.array(test_data[i]['abstract']))
test_X["year"] = year_list
test_X["venue"] = venue_list
test_X["title"] = title_list
test_X["abstract"] = abstract_list
test_X["coauthors"] = coauthors_list

## Define Variables

In [4]:
wordbag_len = 4999
authors_num = 21245
prolific_num = 100
venue_num = 465

## Splitting Authors' list into Response (Profilic Authors) & Predictors (Co-Authors)

In [5]:
prolific_authors_list = []
coauthors_list = []
for authors in authors_list:
    prolific_authors, coauthors = generate_coauthors(authors)
    prolific_authors_list.append(prolific_authors)
    coauthors_list.append(coauthors)

train_y["authors"] = prolific_authors_list
train_X["coauthors"] = coauthors_list

## Convert Features 'title', 'abstract', 'coauthors' & 'authors' into One-hot Encoding Style

In [6]:
# train_X = onehot_func(wordbag_len, train_X, "title")
# train_X = onehot_func(wordbag_len, train_X, "abstract")
train_X = onehot_func(authors_num, train_X, "coauthors")
train_X = coauthors_onehot(prolific_num, train_X, "coauthors")
# train_X = onehot_func(venue_num, train_X, "venue")
train_y = onehot_func(prolific_num, train_y, "authors")

# test_X = onehot_func(wordbag_len, test_X, "title")
# test_X = onehot_func(wordbag_len, test_X, "abstract")
test_X = onehot_func(authors_num, test_X, "coauthors")
test_X = coauthors_onehot(prolific_num, test_X, "coauthors")
# test_X = onehot_func(venue_num, test_X, "venue")

In [7]:
'''Function for Capturing the maximum length of the selected feature's element'''
def feature_max_len(feature, train_data, test_data):
    max_length = 0
    for i in range(train_X.shape[0]):
        if len(train_X[feature][i]) >= max_length:
            max_length = len(train_X[feature][i])
    for i in range(test_X.shape[0]):
        if len(test_X[feature][i]) >= max_length:
            max_length = len(test_X[feature][i])
    return max_length
title_len = feature_max_len("title", train_X, test_X)
abstract_len = feature_max_len("abstract", train_X, test_X)

In [8]:
'''Function for padding feature's element with maximum length value'''
def feature_pad(feature, train_data, test_data, max_len):
    for i in range(train_data.shape[0]):
        A = train_data[feature][i]
        train_data[feature][i] = np.pad(A, (0, max_len - len(A)), 'constant')
    for i in range(test_data.shape[0]):
        A = test_data[feature][i]
        test_data[feature][i] = np.pad(A, (0, max_len - len(A)), 'constant')
    return train_data, test_data   
train_X, test_X = feature_pad("title", train_X, test_X, title_len)
train_X, test_X = feature_pad("abstract", train_X, test_X, abstract_len)

In [9]:
'''Function for converting the dataframe into a 2D np.array for model fitting'''
def fit_structure(data, feature):
    struct_list = []
    for title in np.array(data[feature]): 
        struct_list.append(list(title))
    out_list = np.array(struct_list)
    return out_list
train_title_list = fit_structure(train_X, "title")
train_abstract_list = fit_structure(train_X, "abstract")
train_coauthor_list = fit_structure(train_X, "coauthors")
train_author_list = fit_structure(train_y, "authors")
test_title_list = fit_structure(test_X, "title")
test_abstract_list = fit_structure(test_X, "abstract")
test_coauthor_list = fit_structure(test_X, "coauthors")

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(train_title_list, train_author_list, test_size = 0.1, random_state = 42)
print(train_title_list.shape,train_author_list.shape)
print(X_test.shape,Y_test.shape)

(25793, 127) (25793, 100)
(2580, 127) (2580, 100)


In [11]:
model = Sequential()
model.add(Embedding(127, 100, input_length=train_title_list.shape[1]))
model.add(SpatialDropout1D(0.1))
model.add(LSTM(200, dropout=0.1, recurrent_dropout=0.2))
model.add(Dense(100, activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['AUC'])
print(model.summary())

Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 127, 100)          12700     
                                                                 
 spatial_dropout1d (SpatialD  (None, 127, 100)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 200)               240800    
                                                                 
 dense (Dense)               (None, 100)               20100     
                                                                 
Total params: 273,600
Trainable params: 273,600
Non-trainable params: 0
_________________________________________________________________
None


2022-10-04 21:28:03.359109: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-10-04 21:28:03.359440: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [13]:
batch_size = 64
epochs = 2
history = model.fit(train_title_list, train_author_list, epochs=epochs, batch_size = batch_size, validation_split=0.1)

Epoch 1/2
  9/363 [..............................] - ETA: 3:24:08 - loss: 1.4578 - auc: 0.5000