# COMP90051 Project 2 Authorship Identification #

# Import library

In [1]:
# import json
import math
import pandas as pd
import numpy as np
from itertools import chain
#plot
import matplotlib
import matplotlib.pyplot as plt
#feature engineering
import random
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
#modelling 
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Conv1D, Dense, Dropout, LayerNormalization
from sklearn.metrics import f1_score

# Load data

In [2]:
with open('./data/train.json', 'r') as f:
    train_df = pd.read_json(f)
train_df = train_df.sample(frac=1).reset_index(drop=True)
train_df.head(5)

Unnamed: 0,authors,year,abstract,venue,title
0,"[19554, 5040, 12642, 2345, 55]",11,"[37, 1917, 33, 2448, 11, 1650, 1692, 1543, 293...",42,"[1746, 1541, 2245, 44, 47, 1730, 1710, 1708, 4..."
1,"[5095, 15585]",7,"[40, 1542, 1691, 2380, 1529, 3353, 2072, 1650,...",10,"[2123, 1548, 4094, 53, 34, 11, 1539, 4372, 153..."
2,"[2471, 1444, 18558]",6,"[1731, 1669, 2410, 3148, 1542, 3797, 33, 4943,...",9,"[1594, 1659, 3084, 1553, 1837, 11, 2086, 1669,..."
3,"[15117, 9385, 12354, 14377]",10,"[1716, 1528, 51, 1892, 2229, 3002, 1547, 1542,...",68,"[2174, 1584, 1854, 1875, 47, 1603, 51, 1525, 1..."
4,"[12591, 19998, 2353, 18117, 13073]",15,"[1999, 46, 1661, 1839, 1635, 1751, 1565, 1655,...",68,"[1731, 46, 1617, 3076, 1543, 1875, 11, 1551, 4..."


In [3]:
with open('./data/test.json', 'r') as f:
    test_df = pd.read_json(f)
test_df.head(5)

Unnamed: 0,identifier,coauthors,year,abstract,venue,title
0,0,"[16336, 1762, 4357, 12564]",19,"[37, 1662, 3207, 10, 33, 2037, 1738, 1642, 155...",223.0,"[3207, 24, 1798, 1738, 37, 2375, 1568, 11, 53,..."
1,1,"[21189, 14088]",19,"[1731, 2130, 3674, 1705, 1656, 3077, 1546, 367...",223.0,"[40, 1560, 1536, 1544, 1609, 1705, 1658, 1543,..."
2,2,"[3625, 1198, 19889, 794, 2749, 7801]",19,"[1551, 1728, 3920, 1542, 1535, 1656, 1543, 153...",7.0,"[47, 1574, 1729, 1641, 11, 37, 2533, 2015, 47,..."
3,3,"[19810, 15173, 5876, 111]",19,"[51, 1535, 2115, 1543, 1811, 1700, 1657, 1684,...",21.0,"[1770, 53, 2054, 1549, 1529, 1723, 2796, 1547,..."
4,4,"[10932, 7668, 11907, 19601, 15307, 10492, 1049...",19,"[1775, 1746, 1842, 1525, 33, 2551, 1882, 1542,...",,"[18, 1924, 23, 1544, 3927, 2686, 1543, 1535, 1..."


# Preprocessing data

To catch actual meaning of texts and writing style of every author
1. Term frequency — Inverse document frequency (TFIDF) 
2. extracts the relationship between words (by Part of speech tagging)

Since title and abstract was presented as a set of integer, it cannot reveal enough information. For example, it does not help remove useless words and tokenize, and even find word relationship using 'part of speech' tagging comparative with literate wording.

In [4]:
X_train_features = dict()
X_test_features = dict()

## Label - one hot encoding

In [5]:
def labelProlificAuthor(author): # hand-code
    profilic_ids = np.zeros(101)
    ids = [idx for idx in author if idx < 100]
    if ids == [] :
        profilic_ids[-1]=1
    else:
        profilic_ids[ids]=1
    return profilic_ids
y_train_label_hh = np.vstack(train_df['authors'].apply(labelProlificAuthor).tolist()) # high-hierarchical: (n_samples,n_feartues)

In [6]:
split_mutli_label_idx = []
y_train_label = []
for i, authors in enumerate(train_df['authors']):
    for a in authors:
        if a < 100:
            split_mutli_label_idx.append(i) 
            y_train_label.append(a) #labels :(n_samples,)

## Feature Engineering

### Co_author Vectorization

In [7]:
train_df['authors'].apply(lambda x:len(x)).describe()

count    25793.000000
mean         3.231109
std          2.481487
min          1.000000
25%          2.000000
50%          3.000000
75%          4.000000
max        114.000000
Name: authors, dtype: float64

In [8]:
# in-balanced dataset
labeled_idx = [i for i,e in enumerate(y_train_label_hh) if e[-1]!=1]
unlabeled_idx = [i for i in range(y_train_label_hh.shape[0]) if i not in labeled_idx]
print("The number of instances with Labeled prolific authors is {}, otherwise, the num of non-prolific info is {}.".format(len(labeled_idx), len(unlabeled_idx)))

The number of instances with Labeled prolific authors is 7460, otherwise, the num of non-prolific info is 18333.


In [9]:
ex_co_authors = list(set(chain(*train_df['authors'].loc[labeled_idx].apply(lambda x:[i for i in x if i>=100]).values.tolist())))
print("{} authors having collaboration relationship before with prolific authors, have been recorded among train dataset.".format(len(ex_co_authors)))

6575 authors having collaboration relationship before with prolific authors, have been recorded among train dataset.


In [10]:
author_network = dict()
for instance in train_df['authors'].loc[labeled_idx]:
    for a in instance:
        if a < 100:
            if a in author_network.keys():
                author_network[a] += [i for i in instance if i >= 100]
            else:
                author_network[a] = [i for i in instance if i >= 100]
                
            author_network[a] = list(set(author_network[a]))
#len(set(chain(*list(author_network.values()))))# : 6575
#train_df['authors'].loc[labeled_idx].apply(lambda x:[i for i in x if i>=100])

In [11]:
# if we conclude all co_author[100,21246] by simply using one-hot encoding, the dimension of features would too vast 

def coAuthor(author,author_network):
    """
    We might assume that it will be more likely to see people co-work again if they collaborated before, 
    compatative with those who has no relationship with each other.
    While the instances which are not labeled by prolific authors will be filtered since it won't give useful info.
    :param authors - author column
    :param author_network - a set of co-author that worked together before
    """   
    co_authors_reindexed = list(set(chain(*list(author_network.values()))))
    max_len = len(co_authors_reindexed)
    co_author_vec = np.zeros(max_len+1)

    for i in author:
        if i >= 100:
            if i in co_authors_reindexed:
                co_author_vec[co_authors_reindexed.index(i)] = 1
            else:
                co_author_vec[-1] = 1
#             co_author_vec[i] = 1
        else:
            for j in author_network[i]:
                if co_author_vec[co_authors_reindexed.index(j)] == 0 :
                    # try to add information about network with all collaborated authors
                    co_author_vec[co_authors_reindexed.index(j)] = 0.2
    if 1 not in Counter(co_author_vec):
        co_author_vec[-1] = 1
    return co_author_vec

def coAuthorVec(co_author,author_network):
    
    co_authors_reindexed = list(set(chain(*list(author_network.values()))))
    max_len = len(co_authors_reindexed)
    co_author_vec = np.zeros(max_len+1)
    for i in co_author:
        if i in co_authors_reindexed:
            co_author_vec[co_authors_reindexed.index(i)] = 1
        else:
            co_author_vec[-1] = 1
#         co_author_vec[i] = 1
    return co_author_vec
X_train_features['co_author'] = np.vstack(train_df['authors'].apply(lambda x: coAuthor(x,author_network)).tolist())
X_test_features['co_author'] = np.vstack(test_df['coauthors'].apply(lambda x: coAuthorVec(x,author_network)).tolist())

## Text Vectorization -- TFIDF vectorizer with PCA

In [12]:
train_df['abstract'].apply(lambda x:len(x)).describe()

count    25793.000000
mean       161.643934
std         95.190936
min         12.000000
25%        114.000000
50%        148.000000
75%        188.000000
max       2804.000000
Name: abstract, dtype: float64

In [13]:
train_df['title'].apply(lambda x:len(x)).describe()

count    25793.000000
mean        20.226418
std          8.079090
min          1.000000
25%         15.000000
50%         19.000000
75%         24.000000
max        127.000000
Name: title, dtype: float64

In [14]:
#padding will not affect the similarity between two points
# def padding(df,column):
#     max_len = df[column].apply(lambda x:len(x)).max() + 1
#     train_df[column+'_pad'] = [np.append(np.array(i),np.zeros(max_len-len(i),dtype=int)) for i in df[column]]
# padding(train_df,'title')

In [15]:
train_df['text'] = train_df['title']+train_df['abstract']
test_df['text'] = test_df['title']+test_df['abstract']

In [16]:
def tfidf(df, column):
    
    documents = df[column]
    N = df.shape[0]
    tfidf_array = np.zeros((N,4999))

    for i,doc in enumerate(documents):
        for idx,freq in Counter(df[column][i]).most_common():
            tfidf_array[i][idx-1] = freq
    tfidf_df = pd.DataFrame(tfidf_array,columns=[i for i in range(1,5000)])
    
    df_sum_by_row = tfidf_df.sum(axis=1)
    tfidf_df= tfidf_df.div(df_sum_by_row, axis = 'rows')
    
    idf = np.ones(4999)
    for doc in documents:
        for i in np.unique(doc):
            idf[i-1] += 1
    tfidf_df.loc[len(tfidf_df.index)] = np.log((N+1)/idf)

    for i, row in tfidf_df.iterrows():
        tfidf_df.loc[i] = tfidf_df.iloc[-1] * row
    tfidf_df.fillna(0.0, inplace= True)
    return tfidf_df.drop(N)

In [17]:
#train_text_tfidf = tfidf(train_df, 'text')
#test_text_tfidf = tfidf(test_df, 'text')
vectorizer = TfidfVectorizer() #sparse=False
train_text_tfidf = vectorizer.fit_transform(train_df['text'].apply(lambda x: str(x)))
train_text_tfidf.shape

(25793, 4913)

In [18]:
test_text_tfidf = vectorizer.transform(test_df['text'].apply(lambda x: str(x)))
test_text_tfidf.shape

(800, 4913)

In [19]:
X_train_features['text_vec_tfidf'] = train_text_tfidf
X_test_features['text_vec_tfidf'] = test_text_tfidf

In [None]:
# text_tfidf_pca = PCA(n_components= 256)
# text_tfidf_pca.fit(train_text_tfidf)

In [None]:
svd = TruncatedSVD(n_components=256)
svd.fit(train_text_tfidf)
train_text_vec = svd.transform(train_text_tfidf)
test_text_vec = svd.transform(test_text_tfidf)

#### Doc2Vec Vectorizer

In [20]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(train_df['text'].apply(lambda x: str(x)))]
d2v_model = Doc2Vec(documents, vector_size=256, window=2, min_count=1, workers=4)

In [21]:
def conv2strVec(text):
    return d2v_model.infer_vector([str(int) for int in text]).T
train_text_d2v = train_df['text'].apply(conv2strVec)
test_text_d2v = test_df['text'].apply(conv2strVec)

In [22]:
np.vstack(train_text_d2v.values.tolist()).shape

(25793, 256)

In [23]:
X_train_features['text_vec_d2v'] = np.vstack(train_text_d2v.values.tolist())
X_test_features['text_vec_d2v'] = np.vstack(test_text_d2v.values.tolist())

## Venue Vectorization

In [24]:
# discrete data representation
def onehotVenue(venue): # hand-code
    venue_vec = np.zeros(466)
    if venue == '':
        venue_vec[-1] = 1
    else:
        venue_vec[venue] = 1
    return venue_vec

In [25]:
X_train_features['venue_vec'] = np.vstack(train_df['venue'].apply(onehotVenue).tolist())
X_test_features['venue_vec'] = np.vstack(test_df['venue'].apply(onehotVenue).tolist())

## Concatenate All features

In [26]:
#if d2v:
X_train_features = np.hstack([X_train_features[key] for key in ['co_author','text_vec_d2v','venue_vec']])
X_test_features = np.hstack([X_test_features[key] for key in ['co_author','text_vec_d2v','venue_vec']])
#else:
# X_train_features = np.hstack([X_train_features[key] for key in ['co_author','text_vec_tfidf','venue_vec']])
# X_test_features = np.hstack([X_test_features[key] for key in ['co_author','text_vec_tfidf','venue_vec']])

In [27]:
X_train_features.shape

(25793, 7298)

In [28]:
X_test_features.shape

(800, 7298)

In [29]:
scaler = StandardScaler()
scaler.fit(X_train_features)
X_train_scaled = scaler.transform(X_train_features)
X_test_scaled = scaler.transform(X_test_features)

In [30]:
X_train_scaled_pca = PCA(n_components= 1000)
X_train_dim_re = X_train_scaled_pca.fit_transform(X_train_scaled)
X_test_dim_re = X_train_scaled_pca.transform(X_test_scaled)

# Classification Methods

## Build model

### MLPClassifier -- Multi-label

##### Semi-supervised machine learning to predict unlabeled datapoints with high confidence
Since many of papers don't include any prolific authors, 
1. FIRST train model using labeled instances
2. predict authorship based on prolific authors muted instances
3. Then re-train model using combined dataset 
4. predict test dataset
5. evaluate

In [31]:
# split dataset
X_train_sub, X_test_sub, y_train_sub, y_test_sub = train_test_split(X_train_dim_re[labeled_idx],y_train_label_hh[labeled_idx], test_size= .3, random_state = 54)

In [32]:
ssm_mlp_clf = MLPClassifier(hidden_layer_sizes=(1001,),
                    activation='relu',
                    solver='adam',
                    learning_rate='invscaling',
                    learning_rate_init=.001, 
                    batch_size=32,
                    shuffle=True,
                    random_state=54, 
                    max_iter=300,
                    early_stopping=True)
ssm_mlp_clf.fit(X_train_sub, y_train_sub)

MLPClassifier(batch_size=32, early_stopping=True, hidden_layer_sizes=(1001,),
              learning_rate='invscaling', max_iter=300, random_state=54)

In [33]:
f1_score(y_true= y_train_sub, y_pred=ssm_mlp_clf.predict(X_train_sub), average='samples')

0.9996808374824462

In [34]:
f1_score(y_true= y_test_sub, y_pred=ssm_mlp_clf.predict(X_test_sub), average='samples')

0.9952005049860277

In [35]:
sample_unlabeled = random.sample(unlabeled_idx,math.ceil(X_train_scaled.shape[0]*.75-len(labeled_idx)))
mlp_pred = ssm_mlp_clf.predict(X_train_dim_re[sample_unlabeled])

In [36]:
rest_unlabeled = list(set(unlabeled_idx).difference(set(sample_unlabeled)))

In [37]:
X_train_mlp_aug = np.vstack([X_train_dim_re[labeled_idx],X_train_dim_re[sample_unlabeled],X_train_dim_re[rest_unlabeled ]])
y_train_mlp_aug = np.vstack([y_train_label_hh[labeled_idx],mlp_pred,y_train_label_hh[rest_unlabeled ]])

In [38]:
shuffle_idx = random.shuffle([i for i in range(X_train_mlp_aug.shape[0])])
shuffle_idx

In [39]:
X_train_mlp_aug[shuffle_idx][0].shape

(25793, 1000)

In [40]:
X_train_ssm_aug, X_test_ssm_aug, y_train_ssm_aug, y_test_ssm_aug = train_test_split(X_train_mlp_aug[shuffle_idx][0],y_train_mlp_aug[shuffle_idx][0],train_size=.7, test_size= .3, random_state = 54)

In [41]:
ssm_mlp_clf.fit(X_train_ssm_aug, y_train_ssm_aug)

MLPClassifier(batch_size=32, early_stopping=True, hidden_layer_sizes=(1001,),
              learning_rate='invscaling', max_iter=300, random_state=54)

In [42]:
f1_score(y_true= y_train_ssm_aug, y_pred=ssm_mlp_clf.predict(X_train_ssm_aug), average='samples')

  average, "true nor predicted", 'F-score is', len(true_sum)


0.32660570113907134

In [43]:
f1_score(y_true= y_test_ssm_aug, y_pred=ssm_mlp_clf.predict(X_test_ssm_aug), average='samples')

  average, "true nor predicted", 'F-score is', len(true_sum)


0.3039715045013572

##### Data augmentation by dupilate same points to balance training dataset

In [44]:
duplicate_rate = math.ceil((len(unlabeled_idx) /.25 - len(labeled_idx + unlabeled_idx) )/len(labeled_idx))

In [45]:
dup_idx = labeled_idx *7+unlabeled_idx
random.shuffle(dup_idx)

In [46]:
y_train_label_hh[dup_idx].shape

(70553, 101)

In [47]:
X_train_dup, X_test_dup, y_train_dup, y_test_dup = train_test_split(X_train_dim_re[dup_idx],y_train_label_hh[dup_idx], test_size= .3, random_state = 54)

In [48]:
mlp_clf = MLPClassifier(hidden_layer_sizes=(1001,),
                    activation='relu',
                    solver='adam',
                    learning_rate='invscaling',
                    learning_rate_init=.001, 
                    batch_size=32,
                    shuffle=True,
                    random_state=54, 
                    max_iter=300,
                    early_stopping=True)
mlp_clf.fit(X_train_dup, y_train_dup)

MLPClassifier(batch_size=32, early_stopping=True, hidden_layer_sizes=(1001,),
              learning_rate='invscaling', max_iter=300, random_state=54)

In [49]:
f1_score(y_true= y_train_dup, y_pred=mlp_clf.predict(X_train_dup), average='samples')

0.9996827775190501

In [50]:
f1_score(y_true= y_test_dup, y_pred=mlp_clf.predict(X_test_dup), average='samples')

0.9992440706793915

In [51]:
pred_final = mlp_clf.predict(X_test_dim_re)

### SVMClassifier ( 'one-vs-one' ) -- Multi-class 

### SVD

In [None]:
final_svd = TruncatedSVD(n_components=500)
final_svd.fit(X_train_scaled)
X_train_reduced = final_svd.transform(X_train_scaled)
X_test_reduced  = final_svd.transform(X_test_scaled)

In [None]:
svm_dup_rate = math.ceil((len(unlabeled_idx) /.25 - len(split_mutli_label_idx + unlabeled_idx) )/len(split_mutli_label_idx))

In [None]:
X_train_reduced[split_mutli_label_idx+unlabeled_idx].shape

In [None]:
svm_dup_idx = [i for i in range(len(y_train_label))]*svm_dup_rate + unlabeled_idx
random.shuffle(svm_dup_idx)

In [None]:
X_train_svm_dup = X_train_reduced[split_mutli_label_idx+unlabeled_idx][svm_dup_idx]
y_train_svm_dup = np.ravel((np.array(y_train_label+[-1]*len(unlabeled_idx))[svm_dup_idx]).reshape(-1,1).T)

In [None]:
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(X_train_svm_dup,y_train_svm_dup, test_size= .3, random_state = 54)

In [None]:
svm_param_grid = {'C': [.001, .1, 1, 10], #0.1, 1, 10, 100
              'kernel': ['rbf']} 
svm_model = GridSearchCV(SVC(probability=True,
                             decision_function_shape='ovr',
                             class_weight = 'balanced',
                             degree=3, 
                             random_state= 54),
                         svm_param_grid, cv=5)
  
svm_model.fit(X_train_svm, y_train_svm)

In [None]:
# View the accuracy score
print('Best score for training data:', svm_model.best_score_,"\n") 

# View the best parameters for the model found using grid search
print('Best C:',svm_model.best_estimator_.C,"\n") 
print('Best Kernel:',svm_model.best_estimator_.kernel,"\n")
print('Best Gamma:',svm_model.best_estimator_.gamma,"\n")

final_svm_model = svm_model.best_estimator_

In [None]:
train_svm_pred = final_svm_model.predict_proba(X_train_svm)

In [None]:
train_svm_pred_m = final_svm_model.predict(X_train_svm)

In [None]:
f1_score(y_train_svm,train_svm_pred_m,average="micro")

In [None]:
f1_score(y_test_svm,final_svm_model.predict(X_test_svm),average="micro")

In [None]:
svm_pred = final_svm_model.predict(final_svd.transform(X_test_scaled))

## Save prediction

In [52]:
def scal(p):
    if list(np.unique(p)) == [0]:
        return -1
    elif p[-1]==1:
        return -1
    else:
        return list(np.where(p[:-1]==1)[0]) 
pred = [scal(i) for i in pred_final]

In [53]:
output = {'Id': [i for i in range(X_test_features.shape[0])],'Predict':pred}

In [54]:
pred_df = pd.DataFrame(output,index= None)

In [56]:
pred_df.to_csv('predictions.csv')