# MLP Approach with Additional Implemented Algorithms

In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
import os
from SMLP2_Func import generate_coauthors, onehot_func, coauthors_onehot
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Activation, Dense
from sklearn.metrics import accuracy_score, hamming_loss, f1_score
from numpy import mean
from numpy import std
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import RepeatedKFold
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score
from SMLP2_Func import decode_func
import tensorflow as tf


## Read in Training and Test Data as Dataframe

In [None]:
# Report how many GPU devices TensorFlow lists on this machine.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
# Load the training records. The context manager closes the file handle as
# soon as parsing finishes (a bare open() would leave it open).
with open('train.json') as f:
    train_data = json.load(f)

train_X = pd.DataFrame(columns=["year", "venue", "title", "abstract"])
train_y = pd.DataFrame(columns=["authors"])

# Gather every field into its own list, one np.array-wrapped entry per record.
authors_list = []
year_list = []
venue_list = []
title_list = []
abstract_list = []
for record in train_data:
    authors_list.append(np.array(record['authors']))
    year_list.append(np.array(record['year']))
    venue_list.append(np.array(record['venue']))
    title_list.append(np.array(record['title']))
    abstract_list.append(np.array(record['abstract']))

# "authors" is the response; the remaining fields are predictors.
train_y["authors"] = authors_list
train_X["year"] = year_list
train_X["venue"] = venue_list
train_X["title"] = title_list
train_X["abstract"] = abstract_list

In [None]:
# Load the test records. The context manager closes the file handle as soon
# as parsing finishes (a bare open() would leave it open).
with open('test.json') as f:
    test_data = json.load(f)

test_X = pd.DataFrame(columns=["year", "venue", "title", "abstract"])

# Gather every field into its own list, one np.array-wrapped entry per record.
# Test records carry a "coauthors" field in place of "authors".
coauthors_list = []
year_list = []
venue_list = []
title_list = []
abstract_list = []
for record in test_data:
    coauthors_list.append(np.array(record['coauthors']))
    year_list.append(np.array(record['year']))
    venue_list.append(np.array(record['venue']))
    title_list.append(np.array(record['title']))
    abstract_list.append(np.array(record['abstract']))

test_X["year"] = year_list
test_X["venue"] = venue_list
test_X["title"] = title_list
test_X["abstract"] = abstract_list
test_X["coauthors"] = coauthors_list


## Define Variables

In [None]:
# Size arguments handed to the one-hot encoding helpers in this notebook.
wordbag_len = 4999    # used for the "title" and "abstract" columns
authors_num = 21245   # used for the "coauthors" column
venue_num = 465       # used for the "venue" column
profilic_num = 100    # used for the "authors" (response) column
# Passed to coauthors_onehot; presumably the index of the last prolific
# author (profilic_num - 1 == 99) -- TODO confirm against SMLP2_Func.
profilic_end = profilic_num - 1

## Splitting the Authors List into Response (Prolific Authors) & Predictors (Co-Authors)

In [None]:
# Run generate_coauthors once per paper and keep both halves of every result:
# the first half replaces the response column, the second becomes a predictor.
author_splits = [generate_coauthors(paper_authors) for paper_authors in authors_list]
prolific_authors_list = [prolific for prolific, _ in author_splits]
coauthors_list = [others for _, others in author_splits]

train_y["authors"] = prolific_authors_list
train_X["coauthors"] = coauthors_list

### Potential Method (Single Label) Accompanied with the MLP

In [None]:
# for i in range(len(train_y)): 
#     if len(train_y["authors"][i]) >= 1: 
#         train_y["authors"][i] = [train_y["authors"][i][0]]

## Convert Features 'title', 'abstract', 'coauthors' & 'authors' into One-hot Encoding Style

In [None]:
# Encoding steps shared by the training and test predictors, applied in this
# exact order ("coauthors" goes through onehot_func first, then
# coauthors_onehot).
feature_encoding_steps = (
    (onehot_func, wordbag_len, "title"),
    (onehot_func, wordbag_len, "abstract"),
    (onehot_func, authors_num, "coauthors"),
    (coauthors_onehot, profilic_end, "coauthors"),
    (onehot_func, venue_num, "venue"),
)

for encode, size, column in feature_encoding_steps:
    train_X = encode(size, train_X, column)
# Only the training data has an "authors" response column to encode.
train_y = onehot_func(profilic_num, train_y, "authors")

for encode, size, column in feature_encoding_steps:
    test_X = encode(size, test_X, column)

### Potential Method (Removing Non-Prolific Authors Instance) Accompanied with MLP

In [None]:
# empty_row = []
# for i in range(len(train_y['authors'])):
#     if sum(train_y['authors'][i]) == 0:
#         empty_row.append(i)
# train_y_empty = train_y.iloc[empty_row]
# train_y_empty = train_y_empty.sample(n = 6000)
# indexdrop = train_y_empty.index
# train_y.drop(indexdrop, axis = 0, inplace = True)
# train_X.drop(indexdrop, axis = 0, inplace = True)

### Extract X, Y Variable(s)

In [None]:
# Turn every per-row encoded vector into a plain list so np.array can build
# one array out of them. Only the "coauthors" column is used as model input.
X_lst = [list(row) for row in np.array(train_X['coauthors'])]
X_array = np.array(X_lst)

y_lst = [list(row) for row in np.array(train_y['authors'])]

x_list_test = [list(row) for row in np.array(test_X['coauthors'])]
x_test_array = np.array(x_list_test)

y_array = np.array(y_lst)

In [None]:
# mlp for multi-label classification
# Probability cut-off later passed to predict_model for the test-set predictions.
threhold = 0.29
# Collects one F1 score per threshold tried in the sweep further down this cell.
f1s = []
# get the model
def get_model(n_inputs, n_outputs):
    """Build and compile the MLP used for multi-label classification.

    Args:
        n_inputs: Width of the input feature vector.
        n_outputs: Number of sigmoid output units.

    Returns:
        A compiled ``Sequential`` model: Dense(20, relu, he_uniform init) ->
        Dense(200, relu) -> Dense(n_outputs, sigmoid), with binary
        cross-entropy loss and the Adam optimizer.
    """
    layers = [
        Dense(20, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'),
        Dense(200, activation='relu'),
        Dense(n_outputs, activation='sigmoid'),
    ]
    network = Sequential(layers)
    # A compile variant using a custom f1_loss/f1 metric was left disabled:
    # network.compile(optimizer='adam', loss=f1_loss, metrics=['accuracy', f1])
    network.compile(loss='binary_crossentropy', optimizer='adam')
    return network
 
# evaluate a model on a single hold-out split (train_test_split) at a given threshold
def evaluate_model(X, y, threhold):
    """Train a fresh model on a hold-out split and score it at one threshold.

    Args:
        X: Feature array; ``X.shape[1]`` sets the model's input width.
        y: Label array; ``y.shape[1]`` sets the number of output units.
        threhold: Predicted probabilities strictly above this value become 1.

    Returns:
        Tuple ``(yhat, f1)``: the thresholded 0/1 predictions for the held-out
        33% of the rows, and the unweighted mean of the per-label F1 scores.
    """
    # Fixed seed, so every call is scored on the same held-out rows.
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, y, test_size=0.33, random_state=1
    )

    network = get_model(X.shape[1], y.shape[1])
    network.fit(X_fit, y_fit, verbose=0, epochs=30)

    # Cut the predicted probabilities at the supplied threshold instead of
    # rounding at 0.5.
    probabilities = network.predict(X_holdout)
    yhat = (probabilities > threhold).astype(int)

    per_label_f1 = f1_score(y_holdout, yhat, average=None)
    f1 = np.mean(per_label_f1)
    print('>%.3f' % f1)

    return yhat, f1

# evaluate model: retrain and score once per candidate threshold
candidate_thresholds = np.arange(0.01, 0.5, 0.02)
for i in candidate_thresholds:
    print(i)
    train_y_pred, f1 = evaluate_model(X_array, y_array, i)
    f1s.append(f1)

# Plot the F1 score against the thresholds that were tried
plt.plot(candidate_thresholds, f1s)

In [None]:
def predict_model(X_test, X_train, y_train, threhold):
    """Fit a fresh model on the training data and predict labels for X_test.

    Args:
        X_test: Feature array to predict on; ``X_test.shape[1]`` sets the
            model's input width, so X_train must have the same width.
        X_train: Feature array used for fitting.
        y_train: Label array used for fitting; its second dimension sets the
            number of output units.
        threhold: Predicted probabilities strictly above this value become 1.

    Returns:
        Integer 0/1 array of thresholded predictions, one row per X_test row.
    """
    # Size the output layer from the labels rather than a hard-coded 100, so
    # the function keeps working when the number of label columns changes.
    n_inputs, n_outputs = X_test.shape[1], np.shape(y_train)[1]
    model = get_model(n_inputs, n_outputs)
    model.fit(X_train, y_train, verbose=0, epochs=30)
    return (model.predict(X_test) > threhold).astype(int)

In [None]:
def decode_func(y_pred):
    """Convert one-hot style prediction rows into author-ID output values.

    Args:
        y_pred: Iterable of prediction rows, each a sequence of 0/1 flags.

    Returns:
        A list with one entry per row: the integer ``-1`` when the row sums
        to zero, otherwise a space-separated string of the 1-based positions
        whose value equals 1 (for example ``"1 3"``).
    """
    output = []
    for y_hat in y_pred:
        if sum(y_hat) == 0:
            # Nothing was predicted for this row.
            output.append(-1)
            continue
        author_ids = [str(pos) for pos, flag in enumerate(y_hat, start=1) if flag == 1]
        output.append(' '.join(author_ids))
    return output

In [None]:
# Train one model per random seed, each on a different 67% subsample of the
# training arrays, and collect every run's decoded prediction per test row.
random_state = 100  # exclusive upper bound: seeds 1..99 are used
result_dict = {row_id: [] for row_id in range(len(test_X))}
# A separate loop variable keeps the bound above from being overwritten.
for seed in range(1, random_state):
    print(seed)
    # Only the training part of the split is used; the held-out part is discarded.
    X_train, _, y_train, _ = train_test_split(X_array, y_array, test_size=0.33, random_state=seed)
    y_test_pred = predict_model(x_test_array, X_train, y_train, threhold)
    y_pred_ids = decode_func(y_test_pred)
    # Append straight from the list; no intermediate DataFrame is needed
    # just to pair each prediction with its row position.
    for row_id, prediction in enumerate(y_pred_ids):
        result_dict[row_id].append(prediction)

In [None]:
# Final predictions: fit on the full training arrays and predict the test set at the chosen threshold.
y_pred = predict_model(x_test_array, X_array, y_array, threhold)

In [None]:
def decode_func(y_pred):
    """Convert one-hot style prediction rows into author-ID output values.

    Args:
        y_pred: Iterable of prediction rows, each a sequence of 0/1 flags.

    Returns:
        A list with one entry per row: the integer ``-1`` when the row sums
        to zero, otherwise a space-separated string of the 1-based positions
        whose value equals 1 (for example ``"1 3"``).
    """
    output = []
    for y_hat in y_pred:
        if sum(y_hat) == 0:
            # Nothing was predicted for this row.
            output.append(-1)
            continue
        author_ids = [str(pos) for pos, flag in enumerate(y_hat, start=1) if flag == 1]
        output.append(' '.join(author_ids))
    return output

def convert_to_csv(y_pred_ids):
    """Write the predictions to ``output.csv`` with columns ``Id`` and ``Predict``.

    ``Id`` is the 0-based position of each prediction in ``y_pred_ids``. The
    file is written to the current working directory without an index column.
    """
    submission = pd.DataFrame({"Id": range(len(y_pred_ids)), "Predict": y_pred_ids})
    submission.to_csv('output.csv', index=False)

## Convert the output to CSV

In [None]:
# Decode the final 0/1 predictions and write them to output.csv.
y_pred_ids = decode_func(y_pred)
convert_to_csv(y_pred_ids)