In [1]:
import json
import numpy as np
from numpy import mean
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from skmultilearn.problem_transform import LabelPowerset, BinaryRelevance, ClassifierChain
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, CategoricalNB
from sklearn.svm import SVC
import collections
from math import log10
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, hamming_loss, f1_score
from SMLP2_Func_Redundant import generate_coauthors, onehot_func, coauthors_onehot, decode_func, convert_to_csv, train_grouped_df, sub_split_authors, sub_onehot
import warnings
warnings.filterwarnings("ignore")

## Read in Training and Test Data as DataFrames

In [2]:
# Load the training records from train.json and split them into the predictor
# frame (train_X: year, venue, title, abstract) and the response frame
# (train_y: authors).
# The context manager closes the file handle as soon as parsing is done; the
# original bare open() left it open for the lifetime of the kernel.
with open('train.json') as f:
    train_data = json.load(f)

train_X = pd.DataFrame(columns=["year", "venue", "title", "abstract"])
train_y = pd.DataFrame(columns=["authors"])

# Every field is wrapped in np.array, exactly as before, so the downstream
# helper functions receive the same value types.
authors_list = [np.array(record['authors']) for record in train_data]
year_list = [np.array(record['year']) for record in train_data]
venue_list = [np.array(record['venue']) for record in train_data]
title_list = [np.array(record['title']) for record in train_data]
abstract_list = [np.array(record['abstract']) for record in train_data]

train_y["authors"] = authors_list
train_X["year"] = year_list
train_X["venue"] = venue_list
train_X["title"] = title_list
train_X["abstract"] = abstract_list

In [3]:
# Load the test records from test.json into test_X. Unlike the training data,
# each test record carries a 'coauthors' field and no 'authors' field.
# The context manager closes the file handle as soon as parsing is done; the
# original bare open() left it open for the lifetime of the kernel.
with open('test.json') as f:
    test_data = json.load(f)

test_X = pd.DataFrame(columns=["year", "venue", "title", "abstract"])

# Every field is wrapped in np.array, exactly as before, so the downstream
# helper functions receive the same value types.
coauthors_list = [np.array(record['coauthors']) for record in test_data]
year_list = [np.array(record['year']) for record in test_data]
venue_list = [np.array(record['venue']) for record in test_data]
title_list = [np.array(record['title']) for record in test_data]
abstract_list = [np.array(record['abstract']) for record in test_data]

test_X["year"] = year_list
test_X["venue"] = venue_list
test_X["title"] = title_list
test_X["abstract"] = abstract_list
test_X["coauthors"] = coauthors_list

## Define Variables

In [4]:
# Size parameters handed to the encoding helpers in the cells below.
# NOTE(review): the exact meaning of each value is defined by the helpers in
# SMLP2_Func_Redundant -- confirm there before changing any of them.
wordbag_len = 4_999     # passed to onehot_func for the 'title' and 'abstract' columns
authors_num = 21_245    # passed to onehot_func for the 'coauthors' column
prolific_num = 10 ** 2  # passed to coauthors_onehot and sub_onehot
venue_num = 93 * 5      # passed to train_grouped_df and sub_onehot (= 465)

## Splitting the Authors List into Response (Prolific Authors) & Predictors (Co-Authors)
## Converting Features 'title', 'abstract', 'coauthors' & 'authors' into One-hot Encoding Style

In [5]:
# --- Training side -----------------------------------------------------------
# Run the three project helpers in sequence; each takes the pair of lists
# produced by the previous step and returns an updated pair.
# NOTE(review): presumably this groups rows per venue, splits each author list
# into prolific authors / co-authors, and one-hot encodes -- confirm against
# SMLP2_Func_Redundant.
grouped_pair = train_grouped_df(train_y, train_X, venue_num)
split_pair = sub_split_authors(grouped_pair[0], grouped_pair[1])
train_group_dflist, train_y_dflist = sub_onehot(
    split_pair[0], split_pair[1], wordbag_len, authors_num, prolific_num, venue_num
)

# --- Test side ---------------------------------------------------------------
# The two text columns share the same width, so encode them in one loop.
for text_column in ("title", "abstract"):
    test_X = onehot_func(wordbag_len, test_X, text_column)
test_X = onehot_func(authors_num, test_X, "coauthors")
test_X = coauthors_onehot(prolific_num, test_X, "coauthors")

# Stack the per-row 'coauthors' vectors into a single array for prediction.
coauthors_list_test = [list(row) for row in np.array(test_X['coauthors'])]
x_list_test = np.array(coauthors_list_test)

In [6]:
# Fit one LabelPowerset(GaussianNB()) classifier per training group.
# model_list[k] holds the fitted model for group k, or the string sentinel
# "NO_MODEL" when that group has no training rows.
model_list = []
for group_idx, group_df in enumerate(train_group_dflist):
    # Predictors: the per-row 'coauthors' vectors stacked into one array.
    x_list = np.array([list(row) for row in np.array(group_df['coauthors'])])

    # Response: the per-row 'authors' vectors stacked the same way.
    y_list = np.array(
        [list(row) for row in np.array(train_y_dflist[group_idx]['authors'])]
    )

    classifier = LabelPowerset(GaussianNB())
    if group_df.shape[0] == 0:
        # Nothing to fit on; leave a placeholder so list positions stay aligned.
        model_list.append("NO_MODEL")
        continue
    model_list.append(classifier.fit(x_list, y_list))

In [None]:
# Predict each test row with the model stored at that row's venue position.
# y_pred_list[k] is either the classifier's prediction for row k or the
# integer -1 when no fitted model is available for it.
y_pred_list = []
for row_idx in range(len(x_list_test)):
    # Use a dedicated local name. The original assigned to `venue_num`, which
    # overwrote the configuration constant of the same name and left it
    # holding the last row's venue value for any cell re-run afterwards.
    venue_id = test_X.iloc[row_idx]["venue"]

    # Rows with an empty venue fall back to the last entry of model_list.
    # NOTE(review): presumably the last group collects the venue-less
    # training rows -- confirm against train_grouped_df.
    model = model_list[venue_id] if venue_id != "" else model_list[-1]

    # The only strings stored in model_list are the "NO_MODEL" sentinels.
    # The check now covers the fallback model too, which previously would
    # have raised AttributeError on `.predict` if it was the sentinel.
    if isinstance(model, str):
        y_pred_list.append(-1)
    else:
        # predict() is given a batch, so wrap the single row in an outer list.
        y_pred_list.append(model.predict(np.array([list(x_list_test[row_idx])])))

In [None]:
# Turn every prediction into its submission value and write the result.
# A row becomes -1 when it had no model (integer placeholder) or when no label
# was predicted; otherwise it becomes a space-separated string of label ids.
output = []
for prediction in y_pred_list:
    if isinstance(prediction, int):
        # Placeholder stored by the prediction step when no model was available.
        output.append(-1)
        continue

    # Densify once; the original called toarray() twice per prediction.
    dense = prediction.toarray()
    if dense.sum() == 0:
        output.append(-1)
        continue

    # Positions of the entries equal to 1 in the first (only) row, shifted by
    # one because the emitted ids are 1-based. A separate variable is used here;
    # the original inner loop reused the outer loop's index `i`.
    label_ids = np.flatnonzero(dense[0] == 1) + 1
    output.append(' '.join(map(str, label_ids)))
convert_to_csv(output)