In [78]:
import json
import copy
import random
import pickle
import math
import pandas as pd
import numpy as np

from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC, LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.calibration import CalibratedClassifierCV
from BaseSVDD import BaseSVDD
from multiprocessing import Pool, cpu_count
from sklearn.decomposition import KernelPCA
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from scipy import sparse

# Data R/W

In [79]:
# Openinng the JSON file
openAllData = open('data/train.json');
openTestData = open('data/test.json');

# returns JSON object as
# a dictionary
allData = json.load(openAllData);
testData = json.load(openTestData);

openAllData.close();
openTestData.close();

In [80]:
allData_str_list = []
for k1, v1 in allData.items():
    templist = []
    templist.append("year_" + str(v1["year"]))
    templist.append("venue_" + str(v1["venue"]))
    templist.extend(["keywords_" + str(keyword) for keyword in v1["keywords"]])
#     templist.extend(["author_" + str(author) for author in v1["author"]])
#     templist.extend(v1["author"])
    tempstr = ','.join(templist)
    allData_str_list.append([tempstr, v1["author"]])  

# pre processing

## train test split

In [81]:
x_allData_str = [row[0] for row in allData_str_list]
y_allData_str = [row[1] for row in allData_str_list]

X_train, X_dev, y_train, y_dev = train_test_split(x_allData_str, y_allData_str, 
                                                    test_size=2000, random_state=90051, shuffle=True)

## TFIDF 

In [82]:
NUM_YEAR = 20
START_YEAR = 2000
NUM_VENUE = 470
NUM_KEYWORDS = 500
NUM_AUTHOR = 2302
vocabulary = {}
vocab_index = 0
for i in range(NUM_YEAR):
    vocabulary["year_"+str(START_YEAR+i)] = vocab_index
    vocab_index+=1
for i in range(NUM_VENUE):
    vocabulary["venue_"+str(i)] = vocab_index
    vocab_index+=1
for i in range(NUM_KEYWORDS):
    vocabulary["keywords_"+str(i)] = vocab_index
    vocab_index+=1
# for i in range(NUM_AUTHOR):
#     vocabulary["author_"+str(i)] = vocab_index
#     vocab_index+=1

In [83]:
def string_comma_split(string):
    return string.split(",")
tfidf_vectorizer = TfidfVectorizer(tokenizer=string_comma_split, vocabulary=vocabulary)

In [84]:
tfidf_vectorizer.fit(X_train)



TfidfVectorizer(tokenizer=<function string_comma_split at 0x000002972431D3A0>,
                vocabulary={'keywords_0': 490, 'keywords_1': 491,
                            'keywords_10': 500, 'keywords_100': 590,
                            'keywords_101': 591, 'keywords_102': 592,
                            'keywords_103': 593, 'keywords_104': 594,
                            'keywords_105': 595, 'keywords_106': 596,
                            'keywords_107': 597, 'keywords_108': 598,
                            'keywords_109': 599, 'keywords_11': 501,
                            'keywords_110': 600, 'keywords_111': 601,
                            'keywords_112': 602, 'keywords_113': 603,
                            'keywords_114': 604, 'keywords_115': 605,
                            'keywords_116': 606, 'keywords_117': 607,
                            'keywords_118': 608, 'keywords_119': 609,
                            'keywords_12': 502, 'keywords_120': 610,
                  

In [87]:
X_train_tfidf = tfidf_vectorizer.transform(X_train)

In [88]:
def y_transform(y_list):
    y_transform = np.zeros((len(y_list), 2302))
    for i in range(len(y_list)):
        for j in y_list[i]:
            y_transform[i][j] = 1
    y_transform = sparse.csr_matrix(y_transform)
    return y_transform

In [89]:
y_train_transform = y_transform(y_train)

## author PMI

In [91]:
def co_occurance(author_list):
    tempdict = defaultdict(int)
    
    for authors in author_list:
        for index1 in range(len(authors)):
            for index2 in range(index1, len(authors)):
                key = tuple(sorted([authors[index1], authors[index2]]))
                tempdict[key] += 1
    
    vocab = list(range(2302))
    df = pd.DataFrame(data=np.zeros((2302, 2302), dtype=np.int16),index=vocab,columns=vocab)
    for key, value in tempdict.items():
        df.at[key[0], key[1]] = value
        df.at[key[1], key[0]] = value
    return df

def pmi(df, positive=True):
    col_totals = df.sum(axis=0)
#     total = col_totals.sum()
    row_totals = df.sum(axis=1)
    total = np.zeros((len(row_totals), len(col_totals)))
    
    for i in range(len(row_totals)):
        for j in range(len(col_totals)):
            total[i][j] = row_totals[i] + col_totals[j] + 0.1
            
    for i in range(len(total)):
        for j in range(len(total[i])):
            if total[i][j] == 0:
                total[i][j] = math.inf
    expected = np.outer(row_totals, col_totals) / total
    
    for i in range(len(expected)):
        for j in range(len(expected[i])):
            if expected[i][j] == 0:
                expected[i][j] = math.inf
    df = df / expected
    # Silence distracting warnings about log(0):
    with np.errstate(divide='ignore'):
        df = np.log(df)
    df[np.isinf(df)] = -1  # log(0) = 0
    if positive:
        df[df < 0] = 0.0
#     print(total)
    return df

def freq_proba(df):
    author_totals = df.sum(axis=1)
    freq_df = copy.deepcopy(df)
    freq_df.to_numpy()
    for row in range(len(freq_df)):
        for col in range(len(freq_df[row])):
            freq_df[row][col] == freq_df[row][col]/author_totals[col]
    return freq_df

def scale_to_05_1(num):
    return (num**2 + 1)/2

def sigmoid_my(num):
    return 1/(1 + np.exp(-num))    

In [93]:
df = co_occurance(y_train)
ppmi = pmi(df, positive=True)
ppmi_sigmoid = sigmoid_my(ppmi)

## author post proba
use the co-occurence freq to fix the proba

In [94]:
df = co_occurance(y_allData_str)
freq_df = freq_proba(df)
freq_df_scale = scale_to_05_1(freq_df)

# train

In [95]:
from sklearn.ensemble import RandomForestClassifier
model = LinearSVC()
clf = OneVsRestClassifier(model, n_jobs=-1)

In [96]:
clf.fit(X_train_tfidf, y_train_transform.toarray())

OneVsRestClassifier(estimator=LinearSVC(), n_jobs=-1)

## processing dev data

In [22]:
X_dev_tfidf = tfidf_vectorizer.transform(X_dev)

In [23]:
y_dev_decision_function = clf.decision_function(X_dev_tfidf)

In [28]:
y_dev_proba_raw = sigmoid_my(y_dev_decision_function)

## result dev

In [29]:
def dev_get_author_target(y_dev):
    target_list = []
    coauthor = []
    for author_list in y_dev:
        templist = copy.deepcopy(author_list)
        [target] = random.sample(author_list, 1)
        target_list.append(target)
        templist.remove(target)
        coauthor.append(templist)
    with_false = copy.deepcopy(target_list)
    for i in range(0, 2000, 2):
        all = set(list(range(2032)))
        temp = set(y_dev[i])
        [with_false[i]] = random.sample(list(all - temp), 1)
    return target_list, coauthor, with_false
true_target_list, coauthor_list, target_list = dev_get_author_target(y_dev)

In [48]:
def max_freq(freq_df_scale, coauthor_list, target):
    max_freq = 0.5
    for coauthor in coauthor_list:
        max_freq = max(max_freq, freq_df_scale[target][coauthor])
#     if max_ppmi>0.5:
#         print(max_ppmi)
    return max_freq

def max_ppmi(data_ppmi_sigmoid, coauthor_list, target):
    max_ppmi = 0.5
    for coauthor in coauthor_list:
        max_ppmi = max(max_ppmi, data_ppmi_sigmoid[target][coauthor])
#     if max_ppmi>0.5:
#         print(max_ppmi)
    return max_ppmi
    
def raw_to_final(target_list, coauthor_list, data_proba_row, data_ppmi_sigmoid):
    results = []
    for i in range(len(target_list)):
        target = target_list[i]
        results.append(data_proba_row[i][target] * max_freq(freq_df_scale, coauthor_list[i], target)) 
        # if u want to use pmi to adjust the proba, use func max_ppmi instead of max_freq
    return results       

In [54]:
results = []
y_result = []
for i in range(len(target_list)):
    results.append(y_dev_proba_raw[i][target_list[i]])
    if target_list[i] == true_target_list[i]:
        y_result.append(1)
    else:
        y_result.append(0)
        
results = raw_to_final(target_list, coauthor_list, y_dev_proba_raw, freq_df_scale)

In [55]:
roc_auc_score(y_result, results)

0.9390999999999999

# 参数优化

In [34]:
# from sklearn.preprocessing import MinMaxScaler #if its a dense matrix else use MaxAbsScaler in case of sparse matrix

# C_range = np.logspace(-2, 5, 8)
# gamma_range = np.logspace(-6, 1, 1)
# auc_C_gamma = defaultdict(float)

# import multiprocessing as mp

# for C in C_range:
#     svc = LinearSVC()
#     clf_c_gamma = OneVsRestClassifier(svc, n_jobs=-1)
#     clf_c_gamma.fit(X_train_tfidf, y_train_transform)
#     y_dev_decision_function = clf_c_gamma.decision_function(X_dev_tfidf)
#     y_dev_proba_raw = sigmoid_my(y_dev_decision_function)
#     results = []
#     for i in range(len(target_list)):
#         results.append(y_dev_proba_raw[i][target_list[i]])
#     auc_C_gamma[C] = roc_auc_score(y_result, results)
#     print(str(C) + ": "+ str(auc_C_gamma[C]))

0.01: 0.882266
0.1: 0.882263
1.0: 0.882262
10.0: 0.882261
100.0: 0.882265
1000.0: 0.882266
10000.0: 0.882263
100000.0: 0.882263


In [None]:
# for C in C_range:
#     for gamma in gamma_range:
#         svc = LinearSVC(C=C, kernel="rbf", gamma=gamma)
#         clf_c_gamma = OneVsRestClassifier(svc, n_jobs=-1)
#         clf_c_gamma.fit(X_train_tfidf, y_train_transform)
#         y_dev_decision_function = clf.decision_function(X_dev_tfidf)
#         y_dev_proba_raw = sigmoid_my(y_dev_decision_function)
#         results = []
#         for i in range(len(target_list)):
#             results.append(y_dev_proba_raw[i][target_list[i]])
#         auc_C_gamma[(C, gamma)] = roc_auc_score(y_result, results)
#         print(auc_C_gamma[(C, gamma)])
# def multi_run(C):
#     for gamma in gamma_range:
#         X_dev_tfidf_copy = copy.deepcopy(X_dev_tfidf)
#         X_train_tfidf_copy = copy.deepcopy(X_train_tfidf)
#         y_train_transform_cpoy = copy.deepcopy(y_train_transform)
#         target_list_copy = copy.deepcopy(target_list)
#         print("C:"+ str(C) +", gamma:" + str(gamma))
#         svc = SVC(C=C, kernel="rbf", gamma=gamma)
#         clf_c_gamma = OneVsRestClassifier(svc, n_jobs=1)
#         clf_c_gamma.fit(X_train_tfidf_copy, y_train_transform_cpoy)
#         y_dev_decision_function = clf_c_gamma.decision_function(X_dev_tfidf_copy)
#         y_dev_proba_raw = sigmoid_my(y_dev_decision_function)
#         results = []
#         for i in range(len(target_list_copy)):
#             results.append(y_dev_proba_raw[i][target_list_copy[i]])
#         print({(C, gamma): roc_auc_score(y_result, results)})
#         return {(C, gamma): roc_auc_score(y_result, results)}

In [25]:
# multi_run(1)

C:1, gamma:1e-06
{(1, 1e-06): 0.8851839999999999}


{(1, 1e-06): 0.8851839999999999}

In [None]:
# import multiprocessing as mp

# num_cores = int(mp.cpu_count())
# print("This computer have " + str(num_cores) + " cores")
# pool = mp.Pool(num_cores)
# all_roc = [pool.apply_async(multi_run, args=C) for C in C_range]
# all_roc_result = [p.get() for p in all_roc]

This computer have 8 cores


# test and save

In [71]:
testData_str_list = []
for k1, v1 in testData.items():
    templist = []
    templist.append("year_" + str(v1["year"]))
    templist.append("venue_" + str(v1["venue"]))
    templist.extend(["keywords_" + str(keyword) for keyword in v1["keywords"]])
    tempstr = ','.join(templist)
    testData_str_list.append([tempstr,v1["target"], v1["coauthor"]])  

In [72]:
x_testData_str = [row[0] for row in testData_str_list]
y_testData = [row[1] for row in testData_str_list]
coauthor_testData = [row[2] for row in testData_str_list]

In [73]:
X_test_tfidf = tfidf_vectorizer.transform(x_testData_str)

In [74]:
y_test_decision_function = clf.decision_function(X_test_tfidf)

In [75]:
y_test_proba_raw = sigmoid_my(y_test_decision_function)

In [76]:
results = raw_to_final(y_testData, coauthor_testData, y_test_proba_raw, freq_df_scale)

In [77]:
import csv

header = ['Id','Predicted']
data = []
idNumber = 0
for i in results:
    data.append([idNumber,i])
    idNumber = idNumber+1

filename = 'multi_label_LinearSVM_FREQ.csv'
with open(filename, 'w', newline="") as file:
    csvwriter = csv.writer(file)
    csvwriter.writerow(header)
    csvwriter.writerows(data)

In [175]:
ppmi_sigmoid

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2292,2293,2294,2295,2296,2297,2298,2299,2300,2301
0,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
1,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
3,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
4,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2297,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2298,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2299,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2300,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
