In [1]:
import copy
import json
import csv
import random
from time import time
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, ConcatDataset, DataLoader
from sklearn.utils import shuffle
from sklearn import metrics

In [2]:
# Wall-clock start of the run; subtracted from end_time in the final timing cell.
begin_time = time()

# Read file

In [3]:
def load_file(file_path):
    """Parse the UTF-8 encoded JSON document stored at ``file_path``.

    Returns whatever ``json.load`` produces for the file's contents.
    """
    with open(file_path, mode='r', encoding="utf-8") as handle:
        return json.load(handle)


# Locations of the raw train / test JSON files.
train_file_path = "Data/train.json"
test_file_path = "Data/test.json"

# Each file parses to a dict; only its values (one record each) are used below.
train_dataset = load_file(train_file_path)
test_dataset = load_file(test_file_path)

# Flatten the dict values into plain lists, keeping the dict's iteration order.
train_dataset_list = list(train_dataset.values())
test_dataset_list = list(test_dataset.values())

In [4]:
# Widths of the venue one-hot and keyword TF-IDF segments of the input vector.
venue_number = 471
keywords_number = 500

# Collect every author id and publication year that occurs in the training set.
train_author_ids = [author for value in train_dataset.values() for author in value["author"]]
train_years = [value["year"] for value in train_dataset.values()]

# The defaults reproduce the previous sentinel values for an empty training set.
author_max = max(train_author_ids, default=-1)
author_min = min(train_author_ids, default=100000)
year_earlier = min(train_years, default=3000)
year_latest = max(train_years, default=0)

year_number = year_latest - year_earlier + 1

# Author ids are used directly as list indices (multi-hot input segment) and as
# class labels, so the author dimension must cover every id in [0, author_max].
# The former `author_max - author_min + 1` only achieves that when
# author_min == 0; with any larger minimum the highest ids fall out of range.
author_number = author_max + 1


# Data Processing

## One-hot + TF-IDF

In [5]:
def get_dataset(dataset_list):
    """Expand each record into one leave-one-author-out sample per author.

    For every entry of a record's ``"author"`` list, a deep copy of the record
    is produced whose ``"author"`` list has the first occurrence of that entry
    removed; the removed entry becomes the matching target.

    Returns:
        (X, Y): two parallel lists -- the modified record copies and the
        held-out authors. The input records are not modified.
    """
    samples, targets = [], []
    for paper in dataset_list:
        author_ids = paper["author"]
        for held_out in author_ids:
            # Work on copies so neither the source record nor sibling samples are touched.
            remaining = copy.deepcopy(author_ids)
            remaining.remove(held_out)
            sample = copy.deepcopy(paper)
            sample["author"] = remaining
            samples.append(sample)
            targets.append(held_out)
    return samples, targets

In [6]:
class MyTrainData(Dataset):
    """Training dataset built from the leave-one-author-out expansion of ``get_dataset``."""

    def __init__(self, dataset_list):
        # get_dataset returns two parallel lists: samples and their target authors.
        samples, targets = get_dataset(dataset_list)
        self.X = samples
        self.Y = targets

    def __len__(self):
        """Number of (sample, target) pairs."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``(sample, target)`` pair stored at ``index``."""
        return self.X[index], self.Y[index]


class MyTestData(Dataset):
    """Test dataset: one sample per record, no expansion.

    Each sample keeps the record's ``"venue"``, ``"keywords"`` and ``"year"``
    and exposes the record's ``"coauthor"`` value under the key ``"author"``;
    the record's ``"target"`` value is stored as the matching entry of ``Y``.
    """

    def __init__(self, dataset_list):
        # Build (features, target) pairs record by record, then split them into
        # the two parallel lists the rest of the notebook expects.
        pairs = [
            (
                {
                    "venue": record["venue"],
                    "keywords": record["keywords"],
                    "year": record["year"],
                    "author": record["coauthor"],
                },
                record["target"],
            )
            for record in dataset_list
        ]
        self.X = [features for features, _ in pairs]
        self.Y = [target for _, target in pairs]

    def __len__(self):
        """Number of test records."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.X[index], self.Y[index]

In [7]:
# Wrap the raw record lists in Dataset objects: the training records are
# expanded by MyTrainData, the test records are mapped one-to-one by MyTestData.
train_dataset_mydata = MyTrainData(train_dataset_list)
test_dataset_mydata = MyTestData(test_dataset_list)

In [8]:
# Per-keyword weights, looked up by str(keyword id) inside collate_function.
# The context manager closes the file as soon as the JSON has been parsed
# (the handle used to stay open for the lifetime of the kernel); `tf` remains
# bound to the closed file object. The encoding is pinned to UTF-8 to match
# load_file instead of depending on the platform default.
with open("Data/TFIDF_ALL.json", "r", encoding="utf-8") as tf:
    word_bag_all = json.load(tf)

In [9]:
def collate_function(dataset_tuple):
    """Turn a batch of ``(sample, label)`` pairs into model-ready tensors.

    Every sample becomes one flat row made of three concatenated segments:

    * venue    -- one-hot of length ``venue_number``; an empty-string venue is
                  mapped to the last slot,
    * keywords -- length ``keywords_number``; each keyword id present in the
                  sample gets its ``word_bag_all`` weight times 50, or 0 when
                  the id has no entry,
    * authors  -- multi-hot of length ``author_number`` over the sample's
                  ``"author"`` ids (left all-zero when that value is falsy).

    Returns:
        dict with ``"X"`` (``torch.FloatTensor`` of the rows) and ``"labels"``
        (``torch.LongTensor`` of the labels).
    """
    rows = []
    labels = []

    for sample, label in dataset_tuple:
        # venue segment
        venue_vec = [0] * venue_number
        venue_id = sample["venue"]
        slot = venue_id if venue_id != "" else venue_number - 1
        venue_vec[slot] = 1

        # keyword segment
        keyword_vec = [0] * keywords_number
        for keyword_id in sample["keywords"]:
            weight = word_bag_all.get(str(keyword_id))
            # The slot is written even for unknown keywords, as before.
            keyword_vec[keyword_id] = 0 if weight is None else weight * 50

        # author segment
        author_vec = [0] * author_number
        for author_id in sample["author"] or []:
            author_vec[author_id] = 1

        rows.append(venue_vec + keyword_vec + author_vec)
        labels.append(label)

    return {
        "X": torch.FloatTensor(rows),
        "labels": torch.LongTensor(labels)
    }

In [10]:
# Training batches are reshuffled every epoch; test batches keep dataset order
# so that prediction i lines up with test record i.
train_dataloader = DataLoader(
    train_dataset_mydata,
    batch_size=16,
    shuffle=True,
    collate_fn=collate_function,
)
test_dataloader = DataLoader(
    test_dataset_mydata,
    batch_size=16,
    shuffle=False,
    collate_fn=collate_function,
)

# Model

In [11]:
class MyClassifier(nn.Module):
    """Three-layer feed-forward network that outputs log-probabilities over authors.

    Layer widths: ``input_dim`` -> ``keywords_number + author_number`` ->
    that width minus 256 -> ``author_number``, with ReLU and dropout (p=0.2)
    after each of the first two linear layers and a LogSoftmax over dim 1 at
    the end.
    """

    def __init__(self, input_dim):
        super().__init__()
        wide = keywords_number + author_number
        narrow = wide - 256
        layers = [
            nn.Linear(input_dim, wide),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(wide, narrow),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(narrow, author_number),
            nn.LogSoftmax(dim=1),
        ]
        self.module = nn.Sequential(*layers)

    def forward(self, data_batch):
        """Run the network on ``data_batch["X"]`` and return its output."""
        return self.module(data_batch["X"])

# Train

## Model

In [12]:
# Input width = venue + keyword + author segments, i.e. the length of one row
# produced by collate_function.
clf = MyClassifier(input_dim=venue_number+keywords_number+author_number)
# Move the model to the GPU when one is available.
if torch.cuda.is_available():
    clf = clf.cuda()

## Loss Function

In [13]:
# Negative log-likelihood loss; the classifier already ends in LogSoftmax, so
# its outputs are the log-probabilities NLLLoss expects.
loss_function = nn.NLLLoss()
if torch.cuda.is_available():
    loss_function = loss_function.cuda()

## Optimizer

In [14]:
# Adam over all classifier parameters with a fixed learning rate.
learning_rate = 1e-3
optimizer = torch.optim.Adam(clf.parameters(), lr=learning_rate)

## Start Training

In [None]:
epoch = 20
# Availability of CUDA does not change mid-run, so query it once.
use_cuda = torch.cuda.is_available()

# Select training mode explicitly so dropout is active no matter which mode an
# earlier (or re-run) cell left the model in.
clf.train()

for i in range(epoch):
    print("------ Epoch {} start ------".format(i + 1))
    for data in train_dataloader:
        if use_cuda:
            # collate_function builds CPU tensors; move the batch next to the model.
            data = {'X': data['X'].cuda(), 'labels': data['labels'].cuda()}
        outputs = clf(data)
        loss = loss_function(outputs, data["labels"])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Persist the trained weights (plain string: the former f-string had no placeholders).
torch.save(clf.state_dict(), "NN.mdl")

## Predictions

In [None]:
predicts = []

# Switch to evaluation mode: without this the Dropout layers stay active and
# randomly zero activations, making the predictions noisy and non-reproducible.
clf.eval()

# Batches come out of the DataLoader as CPU tensors, while the model may have
# been moved to the GPU; feed it tensors on its own device.
device = next(clf.parameters()).device

with torch.no_grad():
    for data in test_dataloader:
        data = {'X': data['X'].to(device), 'labels': data['labels'].to(device)}
        log_pre = clf(data)
        pre = torch.exp(log_pre)  # log-probabilities -> probabilities
        # For every row, pick the probability assigned to that row's own label.
        row_index = torch.arange(pre.size(0), device=pre.device)
        picked = pre[row_index, data['labels']]
        # .cpu() is required before converting when the model runs on the GPU.
        predicts.extend(picked.cpu().tolist())

In [None]:
# Wall-clock seconds elapsed since begin_time was recorded at the top of the notebook.
end_time = time()
run_time = end_time-begin_time
print(run_time)

# Save

In [None]:
# newline='' is what the csv module asks for on files passed to csv.writer:
# the writer emits "\r\n" itself, and without it Windows would translate that
# to "\r\r\n" and produce a blank line after every row. The context manager
# closes the file even if a write fails.
with open('Predictions/predict-NN-multi-10.csv', 'w', encoding='utf-8', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["ID", "Predicted"])
    # One row per prediction: (position in the test set, predicted probability).
    csv_writer.writerows(enumerate(predicts))