# GNB with Majority Voting

In [1]:
import json
import numpy as np
from numpy import mean
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from skmultilearn.problem_transform import LabelPowerset, BinaryRelevance, ClassifierChain
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, CategoricalNB
from sklearn.svm import SVC
import collections
from math import log10
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, hamming_loss, f1_score
from SMLP2_Func import generate_coauthors, onehot_func, coauthors_onehot, decode_func, convert_to_csv

## Read in Training and Test Data as DataFrames

In [2]:
# Load the training records from train.json and spread their fields over two
# frames: train_X (year, venue, title, abstract) and train_y (authors).
# The file is opened in a `with` block so the handle is closed as soon as the
# JSON has been parsed; the encoding is pinned to UTF-8 so parsing does not
# depend on the platform's default locale.
with open('train.json', encoding="utf-8") as f:
    train_data = json.load(f)

# One entry per record, each field value wrapped in np.array.
# `authors_list` is kept as a module-level name because a later cell splits it
# into prolific authors and co-authors.
authors_list = [np.array(record['authors']) for record in train_data]
year_list = [np.array(record['year']) for record in train_data]
venue_list = [np.array(record['venue']) for record in train_data]
title_list = [np.array(record['title']) for record in train_data]
abstract_list = [np.array(record['abstract']) for record in train_data]

train_X = pd.DataFrame(columns=["year", "venue", "title", "abstract"])
train_y = pd.DataFrame(columns=["authors"])

train_y["authors"] = authors_list
train_X["year"] = year_list
train_X["venue"] = venue_list
train_X["title"] = title_list
train_X["abstract"] = abstract_list

In [3]:
# Load the test records from test.json into test_X. Unlike the training data,
# each test record is read for a 'coauthors' field instead of 'authors'.
# The file is opened in a `with` block so the handle is closed as soon as the
# JSON has been parsed; the encoding is pinned to UTF-8 so parsing does not
# depend on the platform's default locale.
with open('test.json', encoding="utf-8") as f:
    test_data = json.load(f)

# One entry per record, each field value wrapped in np.array.
coauthors_list = [np.array(record['coauthors']) for record in test_data]
year_list = [np.array(record['year']) for record in test_data]
venue_list = [np.array(record['venue']) for record in test_data]
title_list = [np.array(record['title']) for record in test_data]
abstract_list = [np.array(record['abstract']) for record in test_data]

test_X = pd.DataFrame(columns=["year", "venue", "title", "abstract"])

test_X["year"] = year_list
test_X["venue"] = venue_list
test_X["title"] = title_list
test_X["abstract"] = abstract_list
# 'coauthors' is not in the initial column list, so it is appended as a new column.
test_X["coauthors"] = coauthors_list

## Define Variables

In [4]:
# Size arguments handed to the encoding helpers in the one-hot cell:
#   wordbag_len  -> onehot_func for the "title" and "abstract" columns
#   authors_num  -> onehot_func for the "coauthors" column
#   venue_num    -> onehot_func for the "venue" column
#   profilic_num -> onehot_func for the "authors" (response) column
#   profilic_end -> coauthors_onehot for the "coauthors" column
# NOTE(review): presumably these are vocabulary / id-space sizes and
# profilic_end is the last prolific-author id -- confirm against SMLP2_Func.
wordbag_len, authors_num, venue_num = 4999, 21245, 465
profilic_num, profilic_end = 100, 99

## Splitting Authors' list into Response (Prolific Authors) & Predictors (Co-Authors)

In [5]:
# Run generate_coauthors over every training record's author array. Each call
# yields a (prolific authors, co-authors) pair; the first halves become the
# response column and the second halves a new predictor column.
split_pairs = [
    (prolific, others)
    for prolific, others in map(generate_coauthors, authors_list)
]
prolific_authors_list = [prolific for prolific, _others in split_pairs]
coauthors_list = [others for _prolific, others in split_pairs]

# Overwrite the raw author arrays with the prolific-only subset and attach the
# remaining authors to the predictors.
train_y["authors"] = prolific_authors_list
train_X["coauthors"] = coauthors_list
# for i in range(len(train_y)): 
#     if len(train_y["authors"][i]) >= 1: 
#         train_y["authors"][i] = [train_y["authors"][i][0]]

## Convert Features 'title', 'abstract', 'coauthors' & 'authors' into One-hot Encoding Style

In [6]:
# Encoding recipe applied, in this exact order, to both predictor frames.
# Each step is (helper, size argument, column name); the helper is called as
# helper(size, frame, column) and its return value replaces the frame.
encoding_steps = (
    (onehot_func, wordbag_len, "title"),
    (onehot_func, wordbag_len, "abstract"),
    (onehot_func, authors_num, "coauthors"),
    (coauthors_onehot, profilic_end, "coauthors"),
    (onehot_func, venue_num, "venue"),
)

for encode, size, column in encoding_steps:
    train_X = encode(size, train_X, column)

# The response frame only has the "authors" column to encode.
train_y = onehot_func(profilic_num, train_y, "authors")

for encode, size, column in encoding_steps:
    test_X = encode(size, test_X, column)

In [7]:
# Pull the encoded columns out of the DataFrames and stack them into NumPy
# arrays: every cell value is converted to a plain list, then the list of
# lists is handed to np.array.
coauthors_list = [list(row) for row in np.array(train_X['coauthors'])]
x_list = np.array(coauthors_list)

# venue_list = []
# for title in np.array(train_X['venue']): 
#     venue_list.append(list(title))
# venue_array = np.array(venue_list)

# x_list = np.concatenate((coauthors_array, venue_array), axis = 1)

# Response matrix built the same way from the encoded "authors" column.
y_list = np.array([list(row) for row in np.array(train_y['authors'])])

# Test-set predictors, mirroring x_list.
coauthors_list_test = [list(row) for row in np.array(test_X['coauthors'])]
x_list_test = np.array(coauthors_list_test)

# venue_list_test = []
# for title in np.array(test_X['venue']): 
#     venue_list_test.append(list(title))
# venue_test_array = np.array(venue_list_test)

# x_list_test = np.concatenate((coauthors_test_array, venue_test_array), axis = 1)

In [8]:
# Ensemble step 1: train one LabelPowerset(GaussianNB) model per seed, each on
# a different random 67% subset of the training data, and record every model's
# decoded prediction for every test row. The votes are tallied in the next cell.
#
# `random_state` is the exclusive upper bound on the seeds (seeds 1..99 are
# used). The loop variable is named `seed` so it no longer shadows and
# overwrites this bound.
random_state = 100

# result_dict[i] collects the per-model predictions for test row i.
result_dict = {i: [] for i in range(len(test_X))}

for seed in range(1, random_state):
    # Only the training part of the split is used; X_test / y_test are the
    # held-out remainder and are not evaluated here.
    X_train, X_test, y_train, y_test = train_test_split(
        x_list, y_list, test_size=0.33, random_state=seed
    )
    classifier = LabelPowerset(GaussianNB())
    classifier.fit(X_train, y_train)

    # Predict on the real test set (x_list_test), not on the held-out split.
    y_test_pred = classifier.predict(x_list_test)
    y_pred_ids = decode_func(y_test_pred)

    # Append each prediction directly instead of round-tripping through a
    # temporary DataFrame and iterrows().
    # NOTE(review): assumes decode_func returns one hashable value per test
    # row, in test-row order -- confirm against SMLP2_Func.
    for index, prediction in enumerate(y_pred_ids):
        result_dict[index].append(prediction)

In [9]:
# Ensemble step 2: for every test row keep the prediction that occurred most
# often across the trained models, then hand the winners to convert_to_csv.
# Counter.most_common(1) returns a one-element list of (value, count); the
# value is taken from it.
output_list = [
    collections.Counter(result_dict[row_id]).most_common(1)[0][0]
    for row_id in range(len(result_dict))
]
convert_to_csv(output_list)