In [81]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [82]:
df_data = pd.read_csv('data/phishing_data/dataset_phishing.csv')
df_data.shape

(11430, 89)

In [83]:
df_data['status']

0        legitimate
1          phishing
2          phishing
3        legitimate
4        legitimate
            ...    
11425    legitimate
11426      phishing
11427    legitimate
11428    legitimate
11429      phishing
Name: status, Length: 11430, dtype: object

In [84]:
# Encoding 'status', as label 1 & 0
df_data['target'] = pd.get_dummies(df_data['status'])['legitimate'].astype('int')
df_data.drop('status', axis=1, inplace = True)
df_data[['url','target']].head()

Unnamed: 0,url,target
0,http://www.crestonwood.com/router.php,1
1,http://shadetreetechnology.com/V4/validation/a...,0
2,https://support-appleld.com.secureupdate.duila...,0
3,http://rgipt.ac.in,1
4,http://www.iracing.com/tracks/gateway-motorspo...,1


In [85]:
from sklearn.model_selection import train_test_split

x = df_data.iloc[:,1:-1]
y = df_data['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, stratify=y)

print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

(8572, 87) (2858, 87)
(8572,) (2858,)


In [86]:
scaler = StandardScaler()
scaler.fit(x_train)
x_train_tensor = torch.from_numpy(scaler.transform(x_train)).float().to(device)
x_test_tensor = torch.from_numpy(scaler.transform(x_test)).float().to(device)
y_train_tensor = torch.from_numpy(y_train.values).float().to(device) # y_train 은 시리즈 형태인데 value형태로 가져오면 numpy로 가져오게  
y_train_tensor = y_train_tensor.unsqueeze(1) # 열벡터를 행렬로 변환, 256 87 을 256 1 로 결과를 출력하도록 함
y_test_tensor = torch.from_numpy(y_test.values).float().to(device)
y_test_tensor = y_test_tensor.unsqueeze(1)

In [87]:
nb_epochs = 1000
minibatch_size = 256

In [88]:
class FunModel(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()

        self.linear_layers = nn.Sequential (
            nn.Linear(input_dim, 200),
            nn.LeakyReLU(0,1),
            nn.Linear(200,100),
            nn.LeakyReLU(0,1),
            nn.Linear(100, 20),
            nn.LeakyReLU(0,1),
            nn.Linear(20, 5),
            nn.LeakyReLU(0,1),
            nn.Linear(5, output_dim),
            nn.Sigmoid()
        ).to(device)

    def forward(self, x):
        y = self.linear_layers(x)
        return y

In [89]:
input_dim = x_train_tensor.size(-1)
output_dim = y_train_tensor.size(-1)
print(input_dim, output_dim)
model = FunModel(input_dim, output_dim)
loss_func = nn.BCELoss() # 반드시 BCELoss() 사용을 위해 마지막은 sigmoid() 함수를 사용해야 한다.
optimizer = torch.optim.Adam(model.parameters())

87 1


In [90]:
start_time = time.time()

for index in range(nb_epochs):
    indices = torch.randperm(x_train_tensor.size(0), device=device)

    x_batch_list = torch.index_select(x_train_tensor, 0 , index = indices)
    y_batch_list = torch.index_select(y_train_tensor, 0 , index = indices)
    x_batch_list = x_batch_list.split(minibatch_size, 0)
    y_batch_list = y_batch_list.split(minibatch_size, 0)

    epoch_loss = list()
    for x_minibatch, y_minibatch in zip(x_batch_list, y_batch_list):
        x_minibatch = x_minibatch.to(device)
        y_minibatch = y_minibatch.to(device)

        y_minibatch_pred = model(x_minibatch)

        loss = loss_func(y_minibatch_pred, y_minibatch)
        epoch_loss.append(loss)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if (index % 100) == 0:
        print(index, sum(epoch_loss) / len(epoch_loss))

end_time = time.time()

print(f"\n전체 실행 시간: {end_time - start_time:.2f}초")


0 tensor(0.5852, device='cuda:0', grad_fn=<DivBackward0>)
100 tensor(0.0812, device='cuda:0', grad_fn=<DivBackward0>)
200 tensor(0.0804, device='cuda:0', grad_fn=<DivBackward0>)
300 tensor(0.0804, device='cuda:0', grad_fn=<DivBackward0>)
400 tensor(0.0804, device='cuda:0', grad_fn=<DivBackward0>)
500 tensor(0.0804, device='cuda:0', grad_fn=<DivBackward0>)
600 tensor(0.0804, device='cuda:0', grad_fn=<DivBackward0>)
700 tensor(0.0804, device='cuda:0', grad_fn=<DivBackward0>)
800 tensor(0.0804, device='cuda:0', grad_fn=<DivBackward0>)
900 tensor(0.0230, device='cuda:0', grad_fn=<DivBackward0>)

전체 실행 시간: 53.86초


In [91]:
y_pred_list = []
model.eval()
with torch.no_grad():
    y_test_pred_sigmoid = model(x_test_tensor)
    y_test_pred = torch.round(y_test_pred_sigmoid) # torch.round(x)

In [122]:
# mini batch isze 기반 예측
y_pred_list = list()
x_test_batch_list = x_test_tensor.split(minibatch_size, 0)
model.eval()
with torch.no_grad():
    for x_minibatch in x_test_batch_list:
        y_test_pred_sigmoid = model(x_minibatch)
        y_test_pred = torch.round(y_test_pred_sigmoid)
        y_pred_list.extend(y_test_pred.squeeze().detach().tolist())

y_pred_list = torch.tensor(y_pred_list).unsqueeze(1)

In [123]:
print(y_pred_list.shape, y_test_tensor.shape)

torch.Size([2858, 1]) torch.Size([2858, 1])


In [126]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
# gpu위에 올라와 있으면 자동으로 numpy로 변환이 안됨. cpu로 올리고 numpy로 수동으로 바꿔줌
y_test_tensor = y_test_tensor.cpu().numpy()
y_pred_list = y_pred_list.cpu().numpy()


print("Confusion Matrix\n", str(confusion_matrix(y_test_tensor, y_pred_list)))
print("Precision:\t"+str(precision_score(y_test_tensor, y_pred_list)))
print("Recall:\t"+str(recall_score(y_test_tensor, y_pred_list)))
print("F1 Score:\t"+str(f1_score(y_test_tensor, y_pred_list)))

Confusion Matrix
 [[1366   63]
 [  62 1367]]
Precision:	0.955944055944056
Recall:	0.9566130160951715
F1 Score:	0.956278419027632
