In [22]:
import json
import copy
import random
import pickle
import math
import pandas as pd
import numpy as np

from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC, LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.calibration import CalibratedClassifierCV
from BaseSVDD import BaseSVDD
from multiprocessing import Pool, cpu_count
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from scipy import sparse
from sklearn.multioutput import ClassifierChain

# Data R/W

In [2]:
# Openinng the JSON file
openAllData = open('data/train.json');
openTestData = open('data/test.json');

# returns JSON object as a dictionary
allData = json.load(openAllData);
testData = json.load(openTestData);

openAllData.close();
openTestData.close();

Treat year and venue also as keywords:

In [3]:
allData_str_list = []
for k1, v1 in allData.items():
    templist = []
    templist.append("year_" + str(v1["year"]))
    templist.append("venue_" + str(v1["venue"]))
    templist.extend(["keywords_" + str(keyword) for keyword in v1["keywords"]])
#     templist.extend(["author_" + str(author) for author in v1["author"]])
#     templist.extend(v1["author"])
    tempstr = ','.join(templist)
    allData_str_list.append([tempstr, v1["author"]]) 

# Train test split

In [4]:
x_allData_str = [row[0] for row in allData_str_list]
y_allData_str = [row[1] for row in allData_str_list]

X_train, X_dev, y_train, y_dev = train_test_split(x_allData_str, y_allData_str, 
                                                    test_size=2000, random_state=90051, shuffle=True)

# Data processing

### TFIDF

In [5]:
NUM_YEAR = 20
START_YEAR = 2000
NUM_VENUE = 470
NUM_KEYWORDS = 500
NUM_AUTHOR = 2302
vocabulary = {}
vocab_index = 0
for i in range(NUM_YEAR):
    vocabulary["year_"+str(START_YEAR+i)] = vocab_index
    vocab_index+=1
for i in range(NUM_VENUE):
    vocabulary["venue_"+str(i)] = vocab_index
    vocab_index+=1
for i in range(NUM_KEYWORDS):
    vocabulary["keywords_"+str(i)] = vocab_index
    vocab_index+=1
    
def string_comma_split(string):
    return string.split(",")

tfidf_vectorizer = TfidfVectorizer(tokenizer=string_comma_split, vocabulary=vocabulary)

tfidf_vectorizer.fit(X_train)



TfidfVectorizer(tokenizer=<function string_comma_split at 0x00000253874BA8B0>,
                vocabulary={'keywords_0': 490, 'keywords_1': 491,
                            'keywords_10': 500, 'keywords_100': 590,
                            'keywords_101': 591, 'keywords_102': 592,
                            'keywords_103': 593, 'keywords_104': 594,
                            'keywords_105': 595, 'keywords_106': 596,
                            'keywords_107': 597, 'keywords_108': 598,
                            'keywords_109': 599, 'keywords_11': 501,
                            'keywords_110': 600, 'keywords_111': 601,
                            'keywords_112': 602, 'keywords_113': 603,
                            'keywords_114': 604, 'keywords_115': 605,
                            'keywords_116': 606, 'keywords_117': 607,
                            'keywords_118': 608, 'keywords_119': 609,
                            'keywords_12': 502, 'keywords_120': 610,
                  

In [6]:
X_train_tfidf = tfidf_vectorizer.transform(X_train)

In [7]:
def y_transform(y_list):
    y_transform = np.zeros((len(y_list), 2302))
    for i in range(len(y_list)):
        for j in y_list[i]:
            y_transform[i][j] = 1
    y_transform = sparse.csr_matrix(y_transform)
    return y_transform

y_train_transform = y_transform(y_train)

### Author PMI

In [8]:
def co_occurance(author_list):
    tempdict = defaultdict(int)
    
    for authors in author_list:
        for index1 in range(len(authors)):
            for index2 in range(index1, len(authors)):
                key = tuple(sorted([authors[index1], authors[index2]]))
                tempdict[key] += 1
    
    vocab = list(range(2302))
    df = pd.DataFrame(data=np.zeros((2302, 2302), dtype=np.int16),index=vocab,columns=vocab)
    for key, value in tempdict.items():
        df.at[key[0], key[1]] = value
        df.at[key[1], key[0]] = value
    return df

def pmi(df, positive=True):
    col_totals = df.sum(axis=0)
#     total = col_totals.sum()
    row_totals = df.sum(axis=1)
    total = np.zeros((len(row_totals), len(col_totals)))
    
    for i in range(len(row_totals)):
        for j in range(len(col_totals)):
            total[i][j] = row_totals[i] + col_totals[j] + 0.1
            
    for i in range(len(total)):
        for j in range(len(total[i])):
            if total[i][j] == 0:
                total[i][j] = math.inf
    expected = np.outer(row_totals, col_totals) / total
    
    for i in range(len(expected)):
        for j in range(len(expected[i])):
            if expected[i][j] == 0:
                expected[i][j] = math.inf
    df = df / expected
    # Silence distracting warnings about log(0):
    with np.errstate(divide='ignore'):
        df = np.log(df)
    df[np.isinf(df)] = -1  # log(0) = 0
    if positive:
        df[df < 0] = 0.0
#     print(total)
    return df

def freq_proba(df):
    author_totals = df.sum(axis=1)
    freq_df = copy.deepcopy(df)
    freq_df.to_numpy()
    for row in range(len(freq_df)):
        for col in range(len(freq_df[row])):
            freq_df[row][col] == freq_df[row][col]/author_totals[col]
    return freq_df

def scale_to_05_1(num):
    return (num**2 + 1)/2

def sigmoid_my(num):
    return 1/(1 + np.exp(-num))    

In [9]:
df = co_occurance(y_train)
ppmi = pmi(df, positive=True)
ppmi_sigmoid = sigmoid_my(ppmi)

### Author post proba

In [10]:
df = co_occurance(y_allData_str)
freq_df = freq_proba(df)
freq_df_scale = scale_to_05_1(freq_df)

# Traininig stage

### Logistic regression

In [11]:
clf_LR = OneVsRestClassifier(LogisticRegression(), n_jobs=-1)
clf_LR.fit(X_train_tfidf, y_train_transform)

OneVsRestClassifier(estimator=LogisticRegression(), n_jobs=-1)

### LinearSVC

In [12]:
clf_LSVC = OneVsRestClassifier(LinearSVC(), n_jobs=-1)
clf_LSVC.fit(X_train_tfidf, y_train_transform)

OneVsRestClassifier(estimator=LinearSVC(), n_jobs=-1)

# Evaluation stage

### Preparing dev data

In [13]:
X_dev_tfidf = tfidf_vectorizer.transform(X_dev)

# LR
y_dev_decision_function_LR = clf_LR.decision_function(X_dev_tfidf)
y_dev_proba_raw_LR = sigmoid_my(y_dev_decision_function_LR)

# LSVC
y_dev_decision_function_LSVC = clf_LSVC.decision_function(X_dev_tfidf)
y_dev_proba_raw_LSVC = sigmoid_my(y_dev_decision_function_LSVC)

In [14]:
def dev_get_author_target(y_dev):
    target_list = []
    coauthor = []
    for author_list in y_dev:
        templist = copy.deepcopy(author_list)
        [target] = random.sample(author_list, 1)
        target_list.append(target)
        templist.remove(target)
        coauthor.append(templist)
    with_false = copy.deepcopy(target_list)
    for i in range(0, 2000, 2):
        all = set(list(range(2032)))
        temp = set(y_dev[i])
        [with_false[i]] = random.sample(list(all - temp), 1)
    return target_list, coauthor, with_false

true_target_list, coauthor_list, target_list = dev_get_author_target(y_dev)

In [15]:
def max_freq(freq_df_scale, coauthor_list, target):
    max_freq = 0.5
    for coauthor in coauthor_list:
        max_freq = max(max_freq, freq_df_scale[target][coauthor])
#     if max_ppmi>0.5:
#         print(max_ppmi)
    return max_freq

def max_ppmi(data_ppmi_sigmoid, coauthor_list, target):
    max_ppmi = 0.5
    for coauthor in coauthor_list:
        max_ppmi = max(max_ppmi, data_ppmi_sigmoid[target][coauthor])
#     if max_ppmi>0.5:
#         print(max_ppmi)
    return max_ppmi
    
def raw_to_final(target_list, coauthor_list, data_proba_row, data_ppmi_sigmoid):
    results = []
    for i in range(len(target_list)):
        target = target_list[i]
        results.append(data_proba_row[i][target] * max_freq(freq_df_scale, coauthor_list[i], target)) 
        # if u want to use pmi to adjust the proba, use func max_ppmi instead of max_freq
    return results     

### Compute AUC

### LR

In [16]:
results_LR = []
y_result_LR = []
for i in range(len(target_list)):
    results_LR.append(y_dev_proba_raw_LR[i][target_list[i]])
    if target_list[i] == true_target_list[i]:
        y_result_LR.append(1)
    else:
        y_result_LR.append(0)

results_LR = raw_to_final(target_list, coauthor_list, y_dev_proba_raw_LR, freq_df_scale)
        
roc_auc_score(y_result_LR, results_LR)

0.9140430000000002

### LSVC

In [17]:
results_LSVC = []
y_result_LSVC = []
for i in range(len(target_list)):
    results_LSVC.append(y_dev_proba_raw_LSVC[i][target_list[i]])
    if target_list[i] == true_target_list[i]:
        y_result_LSVC.append(1)
    else:
        y_result_LSVC.append(0)

results_LSVC = raw_to_final(target_list, coauthor_list, y_dev_proba_raw_LSVC, freq_df_scale)    
        
roc_auc_score(y_result_LSVC, results_LSVC)

0.9388099999999999

The LinearSVC model performs better than the LogisticRegression model. Therefore, we select LSVC as the base model for further optimization

# Parameter tuning

TBA

# Experiment - Classifier chain

In [31]:
chains = [ClassifierChain(LinearSVC(), order="random", random_state=i) for i in range(5)]

In [37]:
chain_order = 1 
for chain in chains:
    chain.fit(X_train_tfidf, y_train_transform.toarray())
    y_dev_decision_function = chain.decision_function(X_dev_tfidf)
    y_dev_proba_raw = sigmoid_my(y_dev_decision_function)
    
    results = []
    y_result = []
    for i in range(len(target_list)):
        results.append(y_dev_proba_raw[i][target_list[i]])
        if target_list[i] == true_target_list[i]:
            y_result.append(1)
        else:
            y_result.append(0)

    results = raw_to_final(target_list, coauthor_list, y_dev_proba_raw, freq_df_scale)    

    auc = roc_auc_score(y_result, results)
    
    print("Chaining order " + str(chain_order) + ": AUC = " + str(auc))
    chain_order += 1

Chaining order 1: AUC = 0.9350169999999999
Chaining order 2: AUC = 0.9357959999999999
Chaining order 3: AUC = 0.936123
Chaining order 4: AUC = 0.9379749999999999
Chaining order 5: AUC = 0.935178


Classifier chaining approach performs no better than the much easier and much faster binary relevance one vs rest approach. In addition, there is also very small variations regardless of different orders of chaining.

# Test set prediction

In [42]:
clf = clf_LR # best classifier

In [43]:
testData_str_list = []
for k1, v1 in testData.items():
    templist = []
    templist.append("year_" + str(v1["year"]))
    templist.append("venue_" + str(v1["venue"]))
    templist.extend(["keywords_" + str(keyword) for keyword in v1["keywords"]])
    tempstr = ','.join(templist)
    testData_str_list.append([tempstr,v1["target"], v1["coauthor"]])  


In [44]:
x_testData_str = [row[0] for row in testData_str_list]
y_testData = [row[1] for row in testData_str_list]
coauthor_testData = [row[2] for row in testData_str_list]

In [45]:
X_test_tfidf = tfidf_vectorizer.transform(x_testData_str)
y_test_decision_function = clf.decision_function(X_test_tfidf)
y_test_proba_raw = sigmoid_my(y_test_decision_function)
results = raw_to_final(y_testData, coauthor_testData, y_test_proba_raw, freq_df_scale)

In [47]:
import csv

header = ['Id','Predicted']
data = []
idNumber = 0
for i in results:
    data.append([idNumber,i])
    idNumber = idNumber+1

filename = 'multi_label_LinearSVM_FREQ.csv'
with open(filename, 'w', newline="") as file:
    csvwriter = csv.writer(file)
    csvwriter.writerow(header)
    csvwriter.writerows(data)