# MLG HW2

In [2]:
import pandas as pd
import numpy as np
import os, time, torch, json
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# from torch_geometric.utils import accuracy,sparse_mx_to_torch_sparse_tensor
import torch_geometric.utils 
# from models.GCN import GCN
import scipy.sparse as sp
from tqdm import tqdm, trange
from torch.autograd import Variable
import torchvision
from torch.utils.data import DataLoader, TensorDataset
from torchvision import datasets, transforms
import torch.utils.data as Data
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from scipy.stats import entropy

# test 
from sklearn.metrics import roc_auc_score, average_precision_score


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
content = []
test = []
train = []
upload = []
edge_index = [] 
for i in range(3):
    os.chdir('/home/rita/111/111-2MLG/HW2/dataset{}'.format(i + 1))
    temp = pd.read_csv('./content.csv', header = None, sep = '\t')
    temp.sort_values(by = [0], inplace = True)
    temp.set_index([0], inplace = True)
    temp = torch.Tensor(np.array(temp)).to(torch.float32)
    content.append(temp)
    test.append(pd.read_csv('./test.csv'))
    temp = pd.read_csv('./train.csv')
    train.append(temp)
    temp = temp[temp.label == 1]
    temp = temp[['to', 'from']]
    temp = temp.reset_index(drop = True)
    edge_index.append(temp)
    upload.append(pd.read_csv('./upload.csv'))
print(content[0][:2])
print(test[0].head(2))
print(train[0].head(2))
print(edge_index[0].head(2))
print(upload[0].head(2))
os.chdir('/home/rita/111/111-2MLG/HW2')

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
       id    to  from
0  E10559  2323  2673
1   E4849    81  1634
       id    to  from  label
0  E10311  2399  2339      0
1  E10255  2397  1144      1
     to  from
0  2397  1144
1  2450  1312
       id  prob
0  E10559   0.5
1   E4849   0.5


In [7]:
# content preprocessing
new_features = []
for i in range(len(content)) :
    t = content[i]
    features_entropy = []
    for i in range(t.shape[1]) :
        temp = t.T[i]
        t1 = torch.sum(temp == 0) / len(temp)
        t2 = torch.sum(temp == 1) / len(temp)
        temp = torch.tensor([t1, t2])
        temp = entropy(temp)
        if (temp == 0) :
            temp = 0
        else :
            temp = 1 / temp
        features_entropy.append(temp)
    features_entropy = torch.tensor(features_entropy).reshape(1, -1)
    t = t * features_entropy
    t = t.type(torch.float32)
    new_features.append(t)
print(new_features[0][:2])

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


In [8]:
class link_predict(nn.Module) :
    def __init__(self, features, emb_dim = 128) :
        super(link_predict, self).__init__()
        self.features = features
        self.edge_index = edge_index
        self.emb_dim = emb_dim
        self.mlp = nn.Sequential(
            nn.Linear(self.features.shape[1], self.emb_dim), 
            nn.ReLU(), 
            nn.Linear(self.emb_dim, self.emb_dim // 2), 
            # nn.ReLU(), 
            # nn.Linear(self.emb_dim // 2, self.emb_dim // 4)
        )
        # self.poten_edges = self.get_poten_edges(fearures)
        
    def forward(self, want_edge) :
        z = self.mlp(self.features)
        out = []
        for i in range(want_edge.shape[0]) :
            idx1 = want_edge[i, 0].type(torch.LongTensor)
            idx2 = want_edge[i, 1].type(torch.LongTensor)
            temp = (z[idx1] * z[idx2]).sum()
            # temp = torch.matmul(z[idx1], z[idx2].T)
            # temp = temp if temp > 0 else 0
            temp = temp - torch.mean(temp)
            sig = nn.Sigmoid()
            temp = sig(temp)
            out.append(temp)
        return torch.tensor(out).reshape(-1, 1).squeeze()

In [9]:
device = (torch.device('cuda') if torch.cuda.is_available()
          else torch.device('cpu'))
print(f"Training on device {device}.") #可以根據輸出結果知道是否有可用的GPU

Training on device cuda.


In [19]:
def training_loop(model, n_epochs, optimizer, loss_fn, features, train_loader, sigma, val_loader = None) :
    ls_train_loss = []
    ls_val_loss = []
    features = features.to(torch.float32).to(device = device)
    for epoch in range(1, n_epochs + 1) :  
        val_loss = 0
        loop = tqdm(enumerate(train_loader), total = len(train_loader))
        for i, (edge, labels) in loop :
            poten = get_poten_edges(features, edge)
            edge = edge.to(device)
            labels = labels.to(device)
            outputs = model(edge).to(device)            
            loss = 0
            for j in range(labels.shape[0]) :
                if(labels[j] == 1):
                    loss += torch.exp(-(poten[j]/sigma**2)) * loss_fn(outputs[j], labels[j])
                    loss /= labels.shape[0]
                else :
                    loss += torch.exp((poten[j]/sigma**2)) * loss_fn(outputs[j], labels[j])     
                    loss /= labels.shape[0]
                
            loss.requires_grad_()
            optimizer.zero_grad() 
            loss.backward() 
            optimizer.step()
            loop.set_description(f'Epoch[{epoch} / {n_epochs}]')
            if (val_loader != None):
                if (i+1 == len(train_loader)) :
                    val_loss = validate(model, loss_fn, val_loader)
                loop.set_postfix(loss = loss, val_loss = val_loss)
            else :
                loop.set_postfix(loss = loss)
        ls_train_loss.append(loss.item()) 
    
    return ls_train_loss
        
def get_poten_edges(features, edge) :
    poten_edges = []
    
    for i in range(edge.shape[0]) :
        idx1 = edge[i, 0].type(torch.LongTensor)
        idx2 = edge[i, 1].type(torch.LongTensor)
        temp = (features[idx1] - features[idx2]) ** 2
        temp = torch.sum((features[idx1] - features[idx2]) ** 2)
        # print(temp)
        # temp = (features[idx1] == features[idx2]).sum() 
        poten_edges.append(temp)
    temp -= temp.min()        
    return torch.tensor(poten_edges)    

def validate(model, loss_fn, loader):
    correct = 0
    total = 0
    loss = 0
    with torch.no_grad(): 
        for edges, labels in loader:
            poten = get_poten_edges(features, edges)
            edges = edges.to(device)
            labels = labels.to(device)
            outputs = model(edges).to(device)  
            total += labels.shape[0] 
            for i in range(labels.shape[0]) :
                if(labels[i] == 1):
                    loss += torch.exp(-(poten[i]/sigma**2)) * loss_fn(outputs[i], labels[i])
                else :
                    loss += torch.exp((poten[i]/sigma**2)) * loss_fn(outputs[i], labels[i])     
    loss /= total       
   
    return loss
    

## test

In [20]:
s = time.time()
model = link_predict(content[0].to(device)).to(device)
# L2 regularization
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay = 1e-5)  
loss_fn = nn.MSELoss()
n_epochs = 5
features = new_features[0]
train_data = train[0]
train_label = torch.Tensor(np.array(train_data.iloc[::, -1])).to(torch.float32)
train_data = torch.Tensor(np.array(train_data.iloc[::, [1, 2]])).to(torch.float32)
train_data, val_data, train_label, val_label = train_test_split(train_data, train_label, test_size=0.33, random_state=42)
train_dataset = Data.TensorDataset(train_data, train_label)
val_dataset = Data.TensorDataset(val_data, val_label)
batch_size = 64
train_loader = DataLoader(
    dataset = train_dataset,
    batch_size = batch_size,
    shuffle = True,
    num_workers = 4
)
val_loader = DataLoader(
    dataset = val_dataset,
    batch_size = batch_size,
    shuffle = True,
    num_workers = 4
)
sigma = 100

ls_loss = training_loop( 
    model = model, 
    n_epochs = n_epochs,
    optimizer = optimizer,
    loss_fn = loss_fn,
    features = features, 
    train_loader = train_loader, 
    val_loader = val_loader, 
    # val_loader = None, 
    sigma = sigma
)
print(time.time() - s)

Epoch[1 / 5]: 100%|██████████| 91/91 [00:04<00:00, 18.88it/s, loss=tensor(0.0048, device='cuda:0', requires_grad=True), val_loss=tensor(7.2523, device='cuda:0')]
Epoch[2 / 5]: 100%|██████████| 91/91 [00:04<00:00, 19.10it/s, loss=tensor(0.0043, device='cuda:0', requires_grad=True), val_loss=tensor(7.2568, device='cuda:0')]
Epoch[3 / 5]: 100%|██████████| 91/91 [00:05<00:00, 17.72it/s, loss=tensor(0.0044, device='cuda:0', requires_grad=True), val_loss=tensor(7.2530, device='cuda:0')]
Epoch[4 / 5]: 100%|██████████| 91/91 [00:05<00:00, 17.24it/s, loss=tensor(0.0043, device='cuda:0', requires_grad=True), val_loss=tensor(7.2537, device='cuda:0')]
Epoch[5 / 5]: 100%|██████████| 91/91 [00:04<00:00, 20.00it/s, loss=tensor(0.0043, device='cuda:0', requires_grad=True), val_loss=tensor(7.2543, device='cuda:0')]

26.027231216430664





In [21]:
def save(features, train, n_epochs = 100, batch_size = 64, sigma = 100, val = True) :
    model = link_predict(features.to(device)).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5, weight_decay = 1e-5)  
    loss_fn = nn.MSELoss()
    train_y = torch.Tensor(np.array(train.iloc[::, -1])).to(torch.float32)
    train_x = torch.Tensor(np.array(train.iloc[::, [1, 2]])).to(torch.float32)
    if val :
        train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.33, random_state=42)
        train_dataset = Data.TensorDataset(train_x, train_y)
        val_dataset = Data.TensorDataset(val_x, val_y)
        val_loader = DataLoader(
            dataset = val_dataset,
            batch_size = batch_size,
            shuffle = True,
            num_workers = 4
        )
    train_loader = DataLoader(
        dataset = train_dataset,
        batch_size = batch_size,
        shuffle = True,
        num_workers = 4
    )
    
    ls_loss = training_loop( 
        model = model, 
        n_epochs = n_epochs,
        optimizer = optimizer,
        loss_fn = loss_fn,
        features = features, 
        train_loader = train_loader, 
        val_loader = val, 
        sigma = sigma
    )
    return model, ls_loss

def predict(model, test) :
    test_x = torch.Tensor(np.array(test.iloc[::, 1:])).to(device)
    test_y = model(test_x)
    test_y = pd.DataFrame(test_y)
    pred = pd.concat([test, test_y], axis = 1)
    pred = pred.drop(['to', 'from'], axis = 1)
    pred.columns = ['id', 'prob']
    return pred

In [None]:
# train
n_epochs = 50
batch_size = 64
for i in range(3) :
    features = new_features[i]
    train_x = train[i]
    test_x = test[i]
    model = link_predict(features.to(device)).to(device)\
    model, ls_loss = save(
        features = features, 
        train = train_x, 
        n_epochs = n_epochs, 
        batch_size = batch_size, 
        sigma = 100
        val = True
    )

In [14]:
# mAUC: sklearn.metrics.roc_auc_score
# mAP: sklearn.metrics.average_precision_score
# Final = (mAUC+mAP)/2

# example1
y_true = np.array([0, 0, 1, 1])
print(y_true.shape)
y_predprob = np.array([[0.9, 0.1], [0.6, 0.4], [0.65, 0.35], [0.2, 0.8]])
y_scores = y_predprob[:, 1]
print(roc_auc_score(y_true, y_scores))
print(average_precision_score(y_true, y_scores))

# train
n_epochs = 50
batch_size = 64
for i in range(3) :
    features = new_features[i]
    train_x = train[i]
    train_y = torch.Tensor(np.array(train_x.iloc[::, -1])).to(torch.float32)
    train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.33, random_state=42)
    model = link_predict(features.to(device)).to(device)\
    model, ls_loss = save(
        features = features, 
        train = train_x, 
        n_epochs = n_epochs, 
        batch_size = batch_size, 
        sigma = 100
        val = True
    )
    pred = predict(
        model = model, 
        test = val_x
    )
    pred = pred[::, -1]
    print()


# check (mAUC+mAP)/2
for i in range(3) :
    features = new_features[i]
    test_x = test[i]
    loaded_model = link_predict(features.to(device)).to(device)
    loaded_model.load_state_dict(torch.load('./model/link_prediction_{}.pt'.format(i + 1)))
    loaded_model.to(device)
    pred = predict(
        model = loaded_model, 
        test = test_x
    )
    # pred.to_csv('./upload/pred_{}.csv'.format(i + 1))





(4,)
0.75
0.8333333333333333
0.75


## Upload File

In [110]:
loss = {}
for i in range(3) :
    features = new_features[i]
    train_x = train[i]
    test_x = test[i]
    model = link_predict(features.to(device)).to(device)
    n_epochs = 50
    batch_size = 64
    ls_loss = save(
        features = features, 
        train = train_x, 
        n_epochs = n_epochs, 
        batch_size = batch_size, 
        sigma = 100
    )
    fig = plt.figure()
    plt.title('Loss_{}'.format(i))
    temp = np.array(ls_loss)
    plt.plot(range(1, n_epochs + 1), ls_loss)
    plt.savefig('./figure/loss_{}.png'.format(i + 1))
    plt.close(fig)
    loss[i] = ls_loss
    filename = './model/link_prediction_{}.pt'.format(i + 1)
    torch.save(model.state_dict(), filename)

with open("loss.txt", "w") as fp:
    json.dump(loss, fp)

Epoch[1 / 50]: 100%|██████████| 136/136 [00:03<00:00, 34.58it/s, loss=tensor(0.5123, device='cuda:0', requires_grad=True)]
Epoch[2 / 50]: 100%|██████████| 136/136 [00:04<00:00, 33.71it/s, loss=tensor(0.1410, device='cuda:0', requires_grad=True)]
Epoch[3 / 50]: 100%|██████████| 136/136 [00:04<00:00, 33.86it/s, loss=tensor(0.0038, device='cuda:0', requires_grad=True)]  
Epoch[4 / 50]: 100%|██████████| 136/136 [00:04<00:00, 32.55it/s, loss=tensor(0.0834, device='cuda:0', requires_grad=True)]  
Epoch[5 / 50]: 100%|██████████| 136/136 [00:03<00:00, 34.45it/s, loss=tensor(0.2239, device='cuda:0', requires_grad=True)]
Epoch[6 / 50]: 100%|██████████| 136/136 [00:04<00:00, 33.50it/s, loss=tensor(0.0351, device='cuda:0', requires_grad=True)] 
Epoch[7 / 50]: 100%|██████████| 136/136 [00:04<00:00, 33.89it/s, loss=tensor(0.1833, device='cuda:0', requires_grad=True)]
Epoch[8 / 50]: 100%|██████████| 136/136 [00:03<00:00, 35.06it/s, loss=tensor(0.0107, device='cuda:0', requires_grad=True)]   
Epoch[9 

In [112]:
def predict(model, test) :
    test_x = torch.Tensor(np.array(test.iloc[::, 1:])).to(device)
    test_y = loaded_model(test_x)
    test_y = pd.DataFrame(test_y)
    pred = pd.concat([test, test_y], axis = 1)
    pred = pred.drop(['to', 'from'], axis = 1)
    pred.columns = ['id', 'prob']
    return pred
for i in range(3) :
    features = new_features[i]
    test_x = test[i]
    loaded_model = link_predict(features.to(device)).to(device)
    loaded_model.load_state_dict(torch.load('./model/link_prediction_{}.pt'.format(i + 1)))
    loaded_model.to(device)
    pred = predict(
        model = loaded_model, 
        test = test_x
    )
    pred.to_csv('./upload/pred_{}.csv'.format(i + 1))