## Project 2 Multi-Label GNB Classification Method

In [1]:
import json
import numpy as np
from numpy import mean
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from skmultilearn.problem_transform import LabelPowerset, BinaryRelevance, ClassifierChain
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, CategoricalNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import collections
from sklearn.svm import LinearSVC
from math import log10
from time import time
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, hamming_loss, f1_score
from SMLP2_Func import generate_coauthors, onehot_func, coauthors_onehot, decode_func, convert_to_csv



## Read in Training and Test Data as DataFrames

In [2]:
# Load the training records. The context manager closes the file handle as
# soon as parsing finishes; a bare open() kept it open for the kernel's lifetime.
with open('train.json') as f:
    train_data = json.load(f)

train_X = pd.DataFrame(columns=["year", "venue", "title", "abstract"])
train_y = pd.DataFrame(columns=["authors"])

# One np.array-wrapped entry per record, in file order, for every field.
authors_list = [np.array(record['authors']) for record in train_data]
year_list = [np.array(record['year']) for record in train_data]
venue_list = [np.array(record['venue']) for record in train_data]
title_list = [np.array(record['title']) for record in train_data]
abstract_list = [np.array(record['abstract']) for record in train_data]

# Response frame holds the author lists; predictor frame holds the rest.
train_y["authors"] = authors_list
train_X["year"] = year_list
train_X["venue"] = venue_list
train_X["title"] = title_list
train_X["abstract"] = abstract_list

In [3]:
# Load the test records. The context manager closes the file handle as soon
# as parsing finishes; a bare open() kept it open for the kernel's lifetime.
with open('test.json') as f:
    test_data = json.load(f)

test_X = pd.DataFrame(columns=["year", "venue", "title", "abstract"])

# One np.array-wrapped entry per record, in file order, for every field.
# Test records carry a 'coauthors' field rather than 'authors'.
coauthors_list = [np.array(record['coauthors']) for record in test_data]
year_list = [np.array(record['year']) for record in test_data]
venue_list = [np.array(record['venue']) for record in test_data]
title_list = [np.array(record['title']) for record in test_data]
abstract_list = [np.array(record['abstract']) for record in test_data]

test_X["year"] = year_list
test_X["venue"] = venue_list
test_X["title"] = title_list
test_X["abstract"] = abstract_list
test_X["coauthors"] = coauthors_list

## Define Variables

In [4]:
# Size arguments handed to the one-hot encoding helpers further down.
# presumably each is the vocabulary / ID-space size of its field — TODO confirm
# against the dataset description.

# Used when encoding the 'authors' response and by coauthors_onehot.
prolific_num = 100
# Used when encoding 'coauthors'.
authors_num = 21245
# Used when encoding 'venue'.
venue_num = 465
# Used when encoding 'title' and 'abstract'.
wordbag_len = 4999

## Splitting Authors List into Response (Prolific Authors) & Predictors (Co-Authors)

In [5]:
# generate_coauthors returns a (prolific_authors, coauthors) pair per record;
# call it once per author list, then separate the two halves.
author_splits = [generate_coauthors(record_authors) for record_authors in authors_list]
prolific_authors_list = [prolific for prolific, _ in author_splits]
coauthors_list = [others for _, others in author_splits]

# The first half replaces the response column; the second becomes a predictor.
train_y["authors"] = prolific_authors_list
train_X["coauthors"] = coauthors_list

## Convert Features 'title', 'abstract', 'coauthors', 'venue' & Response 'authors' into One-hot Encoding

In [6]:
def _encode_predictors(frame):
    """Apply the predictor encoding steps to ``frame`` in their fixed order.

    The 'coauthors' column is passed through ``onehot_func`` and then
    ``coauthors_onehot``; the order of all five steps is kept as-is.
    """
    frame = onehot_func(wordbag_len, frame, "title")
    frame = onehot_func(wordbag_len, frame, "abstract")
    frame = onehot_func(authors_num, frame, "coauthors")
    frame = coauthors_onehot(prolific_num, frame, "coauthors")
    frame = onehot_func(venue_num, frame, "venue")
    return frame


# Same call order as before: training predictors, training response, test predictors.
train_X = _encode_predictors(train_X)
train_y = onehot_func(prolific_num, train_y, "authors")

test_X = _encode_predictors(test_X)

## Generate Training & Test Data

In [7]:
# Turn each per-row encoded entry into a plain list and stack the rows into
# NumPy arrays for the estimators (presumably 2-D — TODO confirm the row
# lengths produced by the encoding helpers are uniform).
coauthors_list = [list(row) for row in np.array(train_X['coauthors'])]
x_list = np.array(coauthors_list)

y_list = np.array([list(row) for row in np.array(train_y['authors'])])

coauthors_list_test = [list(row) for row in np.array(test_X['coauthors'])]
x_list_test = np.array(coauthors_list_test)

In [8]:
# Hold out 33% of the training rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x_list,
    y_list,
    test_size=0.33,
    random_state=1,
)

In [17]:
# Label-powerset wrapper around Gaussian naive Bayes with default settings;
# this is the estimator tuned by the grid search.
classifier = LabelPowerset(classifier=GaussianNB())

In [18]:
# Single-step pipeline so the grid below can address the wrapped estimator
# through the 'clf__classifier__...' parameter path.
pipeline = Pipeline(steps=[('clf', classifier)])

# Candidate var_smoothing values for the GaussianNB inside the wrapper.
parameters = {'clf__classifier__var_smoothing': (1, 1e-1, 1e-2, 1e-3)}

def grid_search(pipeline, parameter, X, y):
    """Run a 3-fold grid search over ``parameter`` and print the top candidates.

    Args:
        pipeline: Estimator passed to ``GridSearchCV``; must expose ``steps``
            as (name, estimator) pairs, whose names are printed.
        parameter: Parameter grid passed to ``GridSearchCV``.
        X: Training features passed to ``fit``.
        y: Training targets passed to ``fit``.

    Returns:
        The fitted ``GridSearchCV`` instance.
    """
    gs = GridSearchCV(pipeline, parameter, cv=3, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameter:")
    # Print the grid that was actually passed in. The previous code printed the
    # notebook-level `parameters` global, which is wrong (or a NameError) for
    # any other grid.
    print(parameter)
    t0 = time()
    gs.fit(X, y)
    print("done in %0.3fs" % (time() - t0))
    print()

    mean_score = gs.cv_results_['mean_test_score']
    param_set = gs.cv_results_['params']
    # argsort is ascending, so the last five indices are the highest-scoring
    # candidates, printed worst-to-best.
    for idx in mean_score.argsort()[-5:]:
        print(param_set[idx])
        print(mean_score[idx])
        print("=" * 30)
    return gs

In [19]:
# Tune var_smoothing on the training split only; the held-out rows stay unseen.
result = grid_search(
    pipeline=pipeline,
    parameter=parameters,
    X=X_train,
    y=y_train,
)

Performing grid search...
pipeline: ['clf']
parameter:
{'clf__classifier__var_smoothing': (1, 0.1, 0.01, 0.001)}
Fitting 3 folds for each of 4 candidates, totalling 12 fits


## Fit the Final Model & Run Multi-Label Classification on the Test Set

In [None]:
# Final model: refit on the full training arrays (x_list / y_list, not the
# X_train / y_train split) and predict labels for the encoded test features.
# NOTE(review): var_smoothing=0.0015 is not one of the grid-search candidates
# (1, 1e-1, 1e-2, 1e-3) and is not read from `result` — confirm where this
# value came from.
# This also rebinds the name `classifier` that was used to build `pipeline`.
classifier = LabelPowerset(GaussianNB(var_smoothing=0.0015))
classifier.fit(x_list, y_list)
y_test_pred = classifier.predict(x_list_test)
# decode_func and convert_to_csv are imported from SMLP2_Func; presumably they
# map the predicted label matrix back to author IDs and write the submission
# file — verify against that module.
y_pred_ids = decode_func(y_test_pred)
convert_to_csv(y_pred_ids)