In [18]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
class FTTransformer(nn.Module):
    def __init__(self, num_features, d_model, num_classes, num_layers, num_heads):
        super(FTTransformer, self).__init__()
        self.d_model = d_model
        
        self.feature_embedding = nn.Linear(num_features, d_model)
        
        self.positional_embedding = nn.Parameter(
            torch.zeros(1, 1, d_model)
        )
        
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads),
            num_layers=num_layers
        )
        
        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, x):
        # x shape: [batch_size, num_features]
    
        # [batch_size, num_features, d_model]
        x = self.feature_embedding(x).unsqueeze(1)
        
        # Positional embedding
        if self.positional_embedding.size(1) != x.size(1) or self.positional_embedding.size(2) != x.size(2):
            self.positional_embedding = nn.Parameter(
                torch.zeros(1, x.size(1), x.size(2)).to(x.device)
            )
        
        x = x + self.positional_embedding 
        x = x.permute(1, 0, 2)
        x = self.transformer_encoder(x) 
        x = x.mean(dim=0) 
        logits = self.classifier(x) 
        return logits


In [90]:
class TabularDataset(Dataset):
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return self.features[idx], self.targets[idx]

In [None]:
df = pd.read_csv('preprocessed_final_url.csv')
# 'url' 컬럼 제외, 'result' 컬럼을 타겟으로 사용
y = df['result']
X = df.drop(columns=['url', 'result'])

# 정규화
scaler = StandardScaler()
num_cols = ['created_year','created_month','created_day','created_hour','url_length','hostname_length',
            'special_char_sum','common_term_sum','digit_ratio','num_subdomains']
X[num_cols] = scaler.fit_transform(X[num_cols])

X = X.values
y = y.values

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.long)

train_dataset = TabularDataset(X_train, y_train)
val_dataset = TabularDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [None]:
num_features = X_train.shape[1]
num_classes = len(set(y))
d_model = 128 
num_layers = 4    
num_heads = 8     

model = FTTransformer(num_features=num_features, d_model=d_model, num_classes=num_classes, 
                      num_layers=num_layers, num_heads=num_heads)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=5e-4)



In [None]:
epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for features, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(train_loader):.4f}")

Epoch [1/10], Loss: 0.2379
Epoch [2/10], Loss: 0.1481
Epoch [3/10], Loss: 0.1246
Epoch [4/10], Loss: 0.1031
Epoch [5/10], Loss: 0.0957
Epoch [6/10], Loss: 0.0822
Epoch [7/10], Loss: 0.0802
Epoch [8/10], Loss: 0.0716
Epoch [9/10], Loss: 0.0719
Epoch [10/10], Loss: 0.0636


In [99]:
torch.save(model.state_dict(), 'ft_transformer_model.pth')

In [None]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for features, targets in val_loader:
        outputs = model(features)
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == targets).sum().item()
        total += targets.size(0)

accuracy = correct / total
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.9774
