In [36]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

# Load the training set. The encoding is pinned so that reading the file does
# not depend on the platform's default locale encoding.
with open('data/train.json', encoding='utf-8') as f_train:
    train_data = json.load(f_train)

# with open('data/test.json') as f_test:
#     test_data = json.load(f_test)

# Transpose index and columns.
# Presumably train_data is a dict keyed by paper id, in which case
# pd.DataFrame puts those ids on the columns; .T turns them into rows
# (one row per paper, as shown in the output below).
df = pd.DataFrame(train_data).T

# View the dataframe
df

Unnamed: 0,venue,keywords,year,author
0,,"[64, 1, 322, 134, 136, 396, 270, 144, 476, 481...",2017,"[1605, 759]"
1,0,"[258, 260, 389, 261, 390, 396, 400, 17, 146, 2...",2013,[2182]
2,1,"[320, 454, 266, 462, 17, 339, 404, 342, 407, 2...",2007,[2176]
3,2,"[260, 132, 333, 15, 400, 272, 146, 401, 278, 3...",2013,[1107]
4,3,"[64, 385, 449, 450, 71, 73, 268, 80, 216, 25, ...",2009,[1414]
...,...,...,...,...
26103,252,"[384, 320, 136, 457, 75, 17, 146, 465, 468, 21...",2011,"[656, 595]"
26104,50,"[318, 70, 457, 459, 396, 77, 146, 404, 468, 40...",2008,[876]
26105,6,"[320, 260, 69, 9, 265, 461, 156, 476, 166, 425...",2008,[535]
26106,138,"[450, 70, 198, 233, 394, 300, 492, 368, 246, 4...",2015,[1954]


In [37]:
# Notice that the keywords and author columns store lists of integers.
# Convert each list into a single comma-separated string for use with CountVectorizer later,
# because the tokenizer function is designed to process strings only.

def join_list(l):
    """Join the items of ``l`` into one comma-separated string.

    Each item is converted with ``str`` first, e.g. ``[64, 1, 322]`` becomes
    ``"64,1,322"``. An empty list gives the empty string.

    A value that is already a string is returned unchanged, so re-running the
    ``apply`` calls on an already-converted column does not split the string
    into single characters.
    """
    if isinstance(l, str):
        return l
    return ','.join(str(x) for x in l)

df['keywords'] = df['keywords'].apply(join_list)
df['author'] = df['author'].apply(join_list)

# The venue column stores the empty string "" for a missing venue; replace it with NaN.
# Only venue is touched: an empty keywords/author string must stay a string,
# because the scikit-learn text vectorizers reject NaN documents.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df['venue'] = df['venue'].replace('', np.nan)

# View updated dataframe
df



Unnamed: 0,venue,keywords,year,author
0,,"64,1,322,134,136,396,270,144,476,481,165,39,36...",2017,1605759
1,0.0,"258,260,389,261,390,396,400,17,146,274,21,283,...",2013,2182
2,1.0,"320,454,266,462,17,339,404,342,407,25,154,474,...",2007,2176
3,2.0,"260,132,333,15,400,272,146,401,278,342,25,346,...",2013,1107
4,3.0,"64,385,449,450,71,73,268,80,216,25,161,226,166...",2009,1414
...,...,...,...,...
26103,252.0,"384,320,136,457,75,17,146,465,468,212,342,151,...",2011,656595
26104,50.0,"318,70,457,459,396,77,146,404,468,407,87,474,9...",2008,876
26105,6.0,"320,260,69,9,265,461,156,476,166,425,300,301,3...",2008,535
26106,138.0,45070198233394300492368246406154156,2015,1954


If you want to read about CountVectorizer:  
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer.build_tokenizer

In [38]:
def string_comma_split(string):
    """Tokenizer for the vectorizers: break a comma-separated string into its fields."""
    fields = string.split(",")
    return fields

NUM_AUTHORS = 2302
# Fixed vocabulary: the token "i" always maps to column i, so the column index
# of the label matrix is the author id itself.
author_vocabulary = {str(i): i for i in range(NUM_AUTHORS)}

# binary=True makes every entry 0/1 even if an id is repeated within one row,
# so the result is a proper multilabel indicator matrix rather than counts.
author_vectorizer = CountVectorizer(tokenizer=string_comma_split, vocabulary=author_vocabulary, binary=True)
y_multilabel = author_vectorizer.fit_transform(df['author'])

API for scipy sparse matrix:  
https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html

In [39]:
# You may skip the following cell
# example check: author ids of paper 26103:
# Densify only the row of paper 26103 rather than the whole sparse label matrix.
instance = y_multilabel[26103].toarray().ravel()
# Print every author id whose label is set for this paper, in ascending order.
for i in np.flatnonzero(instance == 1):
    print(i)

595
656


In [61]:
# WARNING: should use StratifiedShuffleSplit to search for the best kernel parameters; for now, just use a single random split.
# NOTE(review): test_size=0.9 puts 90% of the rows in the validation set and trains on only 10% — confirm this is intentional.
X_train, X_valid, y_train, y_valid = train_test_split(df, y_multilabel, test_size=0.9, random_state=18)

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

NUM_KEYWORDS = 500
# Fixed vocabulary: the token "i" always maps to feature column i.
keyword_vocabulary = {str(i): i for i in range(NUM_KEYWORDS)}

keyword_vectorizer = TfidfVectorizer(tokenizer=string_comma_split, vocabulary=keyword_vocabulary)
# Learn the IDF weights from the training split only...
X_train_transform = keyword_vectorizer.fit_transform(X_train["keywords"])
# ...and reuse them for the validation split. Calling fit_transform here would
# re-learn the IDF weights from validation data, so the two feature matrices
# would be scaled differently and validation information would leak into the
# features.
X_valid_transform = keyword_vectorizer.transform(X_valid["keywords"])

In [63]:
# v = CountVectorizer(tokenizer=string_comma_split, vocabulary=keyword_vocabulary)
# test = v.fit_transform(df["keywords"])

# instance = test.toarray()[26106]
# for i in range(NUM_KEYWORDS):
#     if instance[i] == 1:
#         print(i)

For multilabel classification:
https://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html

In [64]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
# One-vs-rest multilabel model: a separate LogisticRegression (default settings)
# per label column; n_jobs=-1 runs the per-label work on all available CPU cores.
clf = OneVsRestClassifier(LogisticRegression(), n_jobs=-1)

In [65]:
# Fit on the TF-IDF keyword features of the training split against the author label matrix.
clf.fit(X_train_transform, y_train)

OneVsRestClassifier(estimator=LogisticRegression(), n_jobs=-1)

In [86]:
# Per-label probability estimates for the training and validation feature matrices.
y_train_pred_proba = clf.predict_proba(X_train_transform)
y_valid_pred_proba = clf.predict_proba(X_valid_transform)

In [87]:
from sklearn.metrics import roc_auc_score

In [84]:
# roc_auc_score_train = roc_auc_score(y_train.toarray(), y_train_pred_proba,average='weighted')
# roc_auc_score_test = roc_auc_score(y_valid.toarray(), y_valid_pred_proba,average='weighted')