In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import d2l
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
from IPython import display
import utils

In [2]:
# Input CSV files, given as relative paths (resolved against the working directory).
train_data_file_name = "train_data.csv"  # loaded into the `train` DataFrame
test_data_file_name = "test_data.csv"  # loaded into the `test` DataFrame

In [3]:
# Read the training and test CSV files into the `train` and `test` DataFrames,
# printing a progress line for each file.
print("Reading files ...")
try:
    print(train_data_file_name + " ... ", end="")
    train = pd.read_csv(train_data_file_name)
    print("finished (1/2)")
    print(test_data_file_name + " ... ", end="")
    test = pd.read_csv(test_data_file_name)
    print("finished (2/2)")
    print("File loading completed!")
except FileNotFoundError as fnf_error:
    # Show which file is missing, then re-raise: every later cell needs both
    # frames, so continuing would only fail later with a confusing NameError.
    print(fnf_error)
    raise

Reading files ...
train_data.csv ... finished (1/2)
test_data.csv ... finished (2/2)
File loading completed!


In [4]:
# Show the (rows, columns) shape of each loaded frame, one per line.
print(train.shape, test.shape, sep="\n")

(15220, 214)
(10106, 214)


In [5]:
# Split the loaded frames into float32 feature / label tensors.
#
# Column layout used below (by position): columns 2 and 3 hold the two label
# values, columns 4 onward hold the input features; columns 0 and 1 are unused.
# The first `training_size` rows of `train` are used for fitting and the
# remaining rows are held out for validation (no shuffling is applied).
training_size = 14000
train_total_size = len(train)  # derived from the data instead of hard-coded
validate_size = train_total_size - training_size
testing_size = len(test)

# Guard against a split point that would leave either subset empty.
assert 0 < training_size < train_total_size, (
    "training_size must be positive and smaller than the number of training rows"
)

feature_columns = slice(4, None)
label_columns = [2, 3]


def _as_float_tensor(frame):
    """Convert a DataFrame slice to a float32 tensor."""
    return torch.tensor(frame.to_numpy(), dtype=torch.float32)


training_data = _as_float_tensor(train.iloc[:training_size, feature_columns])
validate_data = _as_float_tensor(train.iloc[training_size:, feature_columns])
testing_data = _as_float_tensor(test.iloc[:, feature_columns])
training_label = _as_float_tensor(train.iloc[:training_size, label_columns])
validate_label = _as_float_tensor(train.iloc[training_size:, label_columns])

print("training_data size:", list(training_data.shape))
print("validate_data size:", list(validate_data.shape))
print("testing_data size:", list(testing_data.shape))
print("training_label size:", list(training_label.shape))
print("validate_label size:", list(validate_label.shape))

training_data size: [14000, 210]
validate_data size: [1220, 210]
testing_data size: [10106, 210]
training_label size: [14000, 2]
validate_label size: [1220, 2]


In [6]:
# Layer widths of the network: input features -> hidden layer 1 -> hidden layer 2 -> outputs.
input_size, hidden1_size, hidden2_size, output_size = 210, 514, 128, 2
# Mini-batch size handed to load_array by the training loop.
batch_size = 14000
# Number of training epochs; the training loop also derives its logging
# interval from this value via (Epchs - 1) / 10.
Epchs = 651
# Learning rate passed to the Adam optimizer.
lr = 0.001

In [7]:
class myNet(nn.Module):
    """Fully connected network with two ReLU-activated hidden layers.

    Data flows fc1 -> ReLU -> fc2 -> ReLU -> fc3; the final layer is linear
    (no activation is applied to the output).

    Args:
        input_size: Number of input features per example.
        hidden1_size: Width of the first hidden layer.
        hidden2_size: Width of the second hidden layer.
        output_size: Number of values produced per example.
    """

    def __init__(self, input_size, hidden1_size, hidden2_size, output_size):
        super().__init__()
        # Attribute names and registration order are kept stable so that the
        # module repr and state_dict keys do not change.
        self.fc1 = nn.Linear(input_size, hidden1_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden1_size, hidden2_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden2_size, output_size)

    def forward(self, x):
        """Run a forward pass.

        Args:
            x: Tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``output_size``.
        """
        # The former bare `out.float()` call was removed: Tensor.float()
        # returns a new tensor and its result was discarded, so it did nothing.
        out = self.relu1(self.fc1(x))
        out = self.relu2(self.fc2(out))
        return self.fc3(out)

In [8]:
# Seed the RNGs this notebook draws from (torch for weight initialisation,
# `random` for the shuffling done in load_array) so that re-running from this
# cell yields the same initial network and the same batch order.
SEED = 0
random.seed(SEED)
torch.manual_seed(SEED)

# Build the network with the layer widths chosen in the hyper-parameter cell.
net = myNet(input_size, hidden1_size, hidden2_size, output_size)

In [9]:
# Print the module repr to confirm the layer structure and sizes.
print(net)

myNet(
  (fc1): Linear(in_features=210, out_features=514, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=514, out_features=128, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=128, out_features=2, bias=True)
)


In [10]:
# Adam optimizer over all parameters of `net`, using the learning rate `lr`.
optimizer = optim.Adam(net.parameters(), lr=lr)
# Mean-squared-error criterion (the name is spelled "loss_funtion" throughout the notebook).
loss_funtion = nn.MSELoss()

In [11]:
def load_array(data_arrays, batch_size, is_train=True):
    """Split a (features, labels) pair into mini-batches.

    Args:
        data_arrays: Tuple ``(features, labels)`` of tensors with the same
            length along dimension 0.
        batch_size: Maximum number of examples per batch; the final batch may
            be smaller when the dataset size is not a multiple of it.
        is_train: When True (default) the examples are shuffled with the
            ``random`` module before batching; when False the original order
            is kept.

    Returns:
        An iterator over ``(minibatch_features, minibatch_labels)`` tuples
        that together cover every example exactly once.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length, or (from
            ``range``) if ``batch_size`` is zero.
    """
    features, labels = data_arrays
    num_examples = len(labels)
    if len(features) != num_examples:
        raise ValueError(
            "features and labels must have the same number of examples "
            f"(got {len(features)} and {num_examples})"
        )

    indices = list(range(num_examples))
    if is_train:
        random.shuffle(indices)

    batches = []
    for start in range(0, num_examples, batch_size):
        # Plain slicing already clamps at the end of the list. The previous
        # `min(start + batch_size, num_examples - 1)` bound always dropped the
        # last example and could produce an empty final batch.
        batch_indices = torch.tensor(indices[start : start + batch_size])
        batches.append(
            (
                features.index_select(0, batch_indices),
                labels.index_select(0, batch_indices),
            )
        )
    return iter(batches)

In [12]:
# Train the network for `Epchs` epochs, printing the loss about ten times.
#
# The reporting interval is an integer so the modulo test below is exact. The
# previous float interval `(Epchs - 1) / 10` only matched when `Epchs - 1` was
# a multiple of 10 and raised ZeroDivisionError for `Epchs == 1`.
log_every = max(1, (Epchs - 1) // 10)

for epoch in range(Epchs):
    # Fresh batches every epoch (load_array reshuffles on each call).
    data_iter = load_array((training_data, training_label), batch_size)
    for batch_X, batch_y in data_iter:
        optimizer.zero_grad()
        output = net(batch_X)
        loss = loss_funtion(output, batch_y)
        loss.backward()
        optimizer.step()
    if epoch % log_every == 0:
        # `loss` is the loss of the last batch processed in this epoch;
        # detach() drops the autograd graph without changing the printed form.
        print("loss for epoch ",epoch, " is ", loss.detach())

loss for epoch  0  is  tensor(1.6748)
loss for epoch  65  is  tensor(0.0986)
loss for epoch  130  is  tensor(0.0779)
loss for epoch  195  is  tensor(0.0649)
loss for epoch  260  is  tensor(0.0579)
loss for epoch  325  is  tensor(0.0517)
loss for epoch  390  is  tensor(0.0477)
loss for epoch  455  is  tensor(0.0456)
loss for epoch  520  is  tensor(0.0413)
loss for epoch  585  is  tensor(0.0400)
loss for epoch  650  is  tensor(0.0398)


In [13]:
# Evaluate the trained network on the training split: MSE loss over both
# output columns, and ROC AUC for the first output column only.
with torch.no_grad():
    # Inference only, so skip building an autograd graph for the whole split.
    train_result = net(training_data)
    train_loss = loss_funtion(train_result, training_label)
print(train_result.shape)
print("loss for train_data is: ", train_loss)
# Column 0 of the labels is scored against column 0 of the predictions,
# treating the value 1 as the positive class.
# NOTE(review): assumes label column 0 is binary — confirm against the data.
fpr, tpr, thresholds = metrics.roc_curve(
    training_label.numpy()[:, 0], train_result.numpy()[:, 0], pos_label=1
)
train_auc = metrics.auc(fpr, tpr)
print("AUC is: ", train_auc)

torch.Size([14000, 2])
loss for train_data is:  tensor(0.0396)
AUC is:  0.9682596709342418


In [14]:
# Evaluate the trained network on the held-out validation split: MSE loss over
# both output columns, and ROC AUC for the first output column only.
with torch.no_grad():
    # Inference only, so skip building an autograd graph.
    result = net(validate_data)
    validate_loss = loss_funtion(result, validate_label)
print(result.shape)
# The label previously read "train_data" (copy-paste slip); this is the
# validation loss.
print("loss for validate_data is: ", validate_loss)
# Column 0 of the labels is scored against column 0 of the predictions,
# treating the value 1 as the positive class.
# NOTE(review): assumes label column 0 is binary — confirm against the data.
fpr, tpr, thresholds = metrics.roc_curve(
    validate_label.numpy()[:, 0], result.numpy()[:, 0], pos_label=1
)
validate_auc = metrics.auc(fpr, tpr)
print("AUC is: ", validate_auc)

torch.Size([1220, 2])
loss for train_data is:  tensor(0.2323)
AUC is:  0.5758459627329193
