In [2]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import numpy as np
from torchvision import transforms
import torch.nn.functional as F
import pywt
from scipy import stats
from multiprocessing import Pool, set_start_method
import os
# set_start_method('fork')

# 1. Data preprocessing


def convert_pe_to_image(file_path, output_path, width=384):
    """Render the raw bytes of a file as a grayscale image and save it.

    Every byte becomes one uint8 pixel. The byte stream is laid out row by
    row with ``width`` pixels per row; if the last row is incomplete it is
    padded (``np.pad`` default fill) so the data forms a full rectangle.

    Args:
        file_path: Path of the file whose bytes are read.
        output_path: Destination path handed to ``Image.save``.
        width: Number of pixels per image row (default 384).

    Returns:
        True if the image was written. False if any exception was raised
        along the way; the error is printed rather than re-raised.
    """
    try:
        with open(file_path, 'rb') as handle:
            raw = handle.read()

        # One uint8 pixel per input byte.
        pixels = np.frombuffer(raw, dtype=np.uint8)
        n_bytes = len(pixels)

        # Ceiling division: rows needed to hold every byte.
        rows = -(-n_bytes // width)

        # Fill the tail of the final row when the byte count is not a
        # multiple of the row width.
        shortfall = rows * width - n_bytes
        if shortfall > 0:
            pixels = np.pad(pixels, (0, shortfall))

        # Reshape to 2-D, wrap in a PIL image and write it out.
        Image.fromarray(pixels.reshape((rows, width))).save(output_path)
        return True
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return False


def process_single_file(args):
    """Pool worker: convert one file described by a packed argument pair.

    ``Pool.map`` passes a single argument per task, so the input and output
    paths travel together in one tuple.

    Args:
        args: ``(input_path, output_path)`` pair.

    Returns:
        The value returned by ``convert_pe_to_image`` for that pair.
    """
    src_path, dst_path = args
    return convert_pe_to_image(src_path, dst_path)


def process_directory(input_dir, output_dir, num_processes=None):
    """Convert every ``.exe`` / ``.dll`` file in a directory to a PNG in parallel.

    The output directory is created if needed. Each matching file
    ``<name>`` in ``input_dir`` is converted by ``process_single_file`` and
    written to ``<output_dir>/<name>.png``. A summary line with the number of
    successful conversions is printed at the end.

    Args:
        input_dir: Directory whose entries are listed (not recursive).
        output_dir: Directory receiving the PNG files.
        num_processes: Worker count passed to ``multiprocessing.Pool``;
            ``None`` lets the pool choose its default.

    Returns:
        None.
    """
    os.makedirs(output_dir, exist_ok=True)

    pe_suffixes = ('.exe', '.dll')
    tasks = [
        (os.path.join(input_dir, filename),
         os.path.join(output_dir, f"{filename}.png"))
        for filename in os.listdir(input_dir)
        # Compare in lower case so names such as 'SETUP.EXE' or
        # 'Kernel32.DLL' are not silently skipped.
        if filename.lower().endswith(pe_suffixes)
    ]
    total_count = len(tasks)

    if tasks:
        with Pool(processes=num_processes) as pool:
            results = pool.map(process_single_file, tasks)
    else:
        # Nothing to convert: avoid spawning worker processes for no work.
        results = []

    success_count = sum(1 for result in results if result)

    print(f"Successfully processed {success_count}/{total_count} files")


class MalwareDetectionCNN(nn.Module):
    """Four-stage CNN that classifies single-channel images into two classes.

    Each stage is ``Conv2d(3x3, padding=1) -> ReLU -> MaxPool2d(2, 2)`` with
    32, 64, 128 and 256 output channels. The last feature map is reduced to a
    fixed 6x8 grid by adaptive average pooling, so the fully connected head
    (1024 -> 512 -> 2) does not depend on the input resolution.

    Layer attribute names are part of the saved ``state_dict`` keys and must
    not be renamed.
    """

    def __init__(self):
        super().__init__()
        # Convolutional feature extractor.
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        self.conv4 = nn.Conv2d(128, 256, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)

        # Adaptive average pooling turns any feature-map size into 6x8.
        self.adaptive_pool = nn.AdaptiveAvgPool2d((6, 8))

        # Classifier head; fc1's input size follows from 256 channels * 6 * 8.
        self.fc1 = nn.Linear(256 * 6 * 8, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 2)
        self.dropout = nn.Dropout(0.5)

    def _conv_stages(self):
        """Return the convolution layers in the order they are applied."""
        return (self.conv1, self.conv2, self.conv3, self.conv4)

    def forward(self, x):
        """Compute the two class logits for a batch of images.

        Args:
            x: Input tensor with one channel, as expected by ``conv1``.

        Returns:
            Tensor of raw (un-normalised) scores with 2 columns.
        """
        for conv in self._conv_stages():
            x = self.pool(F.relu(conv(x)))

        # Fixed-size map, then flatten for the fully connected head.
        x = self.adaptive_pool(x)
        x = x.view(-1, 256 * 6 * 8)

        x = F.relu(self.fc1(x))
        x = self.dropout(x)  # dropout is applied after fc1 only
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def get_features(self, x):
        """Return the intermediate feature maps produced for ``x``.

        The convolutional path is the same one ``forward`` uses. Previously
        this method stopped after ``conv3`` and pooled that 128-channel map,
        so its last entry did not match what ``fc1`` consumes; ``conv4`` is
        now included.

        Args:
            x: Input tensor with one channel, as expected by ``conv1``.

        Returns:
            List of five tensors: the pooled output of each of the four
            convolution stages, followed by the adaptive-pooled 256-channel
            6x8 map (the tensor ``forward`` flattens into ``fc1``).
        """
        features = []
        for conv in self._conv_stages():
            x = self.pool(F.relu(conv(x)))
            features.append(x)

        features.append(self.adaptive_pool(x))
        return features
# 3. Dataset class


class MalwareDataset(Dataset):
    """Image dataset of benign (label 0) and malware (label 1) PNG files.

    Only the ``(path, label)`` pairs are collected up front; images are
    opened lazily in ``__getitem__``.
    """

    def __init__(self, benign_dir, malware_dir, transform=None):
        """Index the ``.png`` files of both directories.

        Args:
            benign_dir: Directory whose PNG files get label 0.
            malware_dir: Directory whose PNG files get label 1.
            transform: Optional callable applied to each loaded image.
        """
        self.transform = transform
        self.data = []

        # Benign entries come first, then malware, each in os.listdir order.
        for folder, label in ((benign_dir, 0), (malware_dir, 1)):
            self.data.extend(
                (os.path.join(folder, name), label)
                for name in os.listdir(folder)
                if name.endswith('.png')
            )

    def __len__(self):
        """Number of indexed images."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            ``(image, label)`` where ``image`` is the file converted to
            grayscale mode 'L', passed through ``transform`` if one was
            given, and ``label`` is 0 or 1.
        """
        path, label = self.data[idx]
        image = Image.open(path).convert('L')

        if self.transform:
            image = self.transform(image)

        return image, label

# 4. Training function


def train_model(benign_dir, malware_dir, model_save_path, epochs=10, batch_size=32):
    """Train ``MalwareDetectionCNN`` on PNG images and save its weights.

    The images of both directories are resized to 384x512, split 80/20 at
    random into training and validation subsets, and the network is trained
    with Adam and cross-entropy loss. After every epoch the mean training
    loss and the validation accuracy are printed. The final ``state_dict``
    is written to ``model_save_path``.

    Args:
        benign_dir: Directory of benign PNG images (label 0).
        malware_dir: Directory of malware PNG images (label 1).
        model_save_path: File path for the saved ``state_dict``.
        epochs: Number of passes over the training subset.
        batch_size: Mini-batch size for both loaders.

    Raises:
        ValueError: If there are too few images to form a non-empty
            training and validation subset.
    """
    # Use Apple's MPS backend when available, otherwise the CPU.
    device = torch.device(
        "mps" if torch.backends.mps.is_available() else "cpu")
    print(f"Using device: {device}")

    transform = transforms.Compose([
        transforms.Resize((384, 512)),
        transforms.ToTensor(),
    ])

    dataset = MalwareDataset(benign_dir, malware_dir, transform=transform)
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    if train_size == 0 or test_size == 0:
        # Fail early with a clear message instead of an opaque error from
        # the data loader or a division by zero in the epoch summary.
        raise ValueError(
            f"Need at least 2 PNG images to build train/validation splits, "
            f"found {len(dataset)} in {benign_dir!r} and {malware_dir!r}")

    train_dataset, test_dataset = torch.utils.data.random_split(
        dataset, [train_size, test_size])

    train_loader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    model = MalwareDetectionCNN().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # ---- validation pass ----
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                # Predicted class = index of the larger logit.
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        print(f'Epoch {epoch + 1}, Loss: {running_loss/len(train_loader):.3f}, '
              f'Accuracy: {100 * correct / total:.2f}%')

    # Only the weights of the last epoch are stored.
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")


def _subband_stats(band):
    """Return [mean, std, skewness, kurtosis] of one coefficient array."""
    flat = band.ravel()
    return [
        np.mean(band),
        np.std(band),
        stats.skew(flat),
        stats.kurtosis(flat),
    ]


def extract_wavelet_features(image_path, wavelet='db1', level=3):
    """Summarise an image by statistics of its 2-D wavelet decomposition.

    The image is loaded as grayscale and decomposed with
    ``pywt.wavedec2``. Four statistics (mean, standard deviation, skewness,
    kurtosis) are computed for the approximation coefficients and for every
    detail coefficient array of every level, in the order ``wavedec2``
    returns them.

    Skewness and kurtosis are undefined (NaN) for a constant array, which
    happens for flat sub-bands of very small or uniform images. Such values
    are replaced by 0.0 so a single degenerate sub-band cannot turn the
    downstream network output into NaN.

    Args:
        image_path: Path of the image file to analyse.
        wavelet: Wavelet name passed to ``pywt.wavedec2``.
        level: Decomposition level passed to ``pywt.wavedec2``.

    Returns:
        1-D ``numpy`` array of finite floats. With three detail arrays per
        level (as iterated here) this is ``4 + 12 * level`` values.
    """
    # Close the file handle deterministically once the pixels are copied.
    with Image.open(image_path) as img:
        img_array = np.array(img.convert('L'))

    # coeffs[0] is the approximation; coeffs[1:] are per-level detail groups.
    coeffs = pywt.wavedec2(img_array, wavelet, level=level)

    features = _subband_stats(coeffs[0])
    for detail_coeffs in coeffs[1:]:
        for detail in detail_coeffs:
            features.extend(_subband_stats(detail))

    return np.nan_to_num(
        np.array(features, dtype=float), nan=0.0, posinf=0.0, neginf=0.0)

# Dataset built on wavelet features


class WaveletMalwareDataset(Dataset):
    """In-memory dataset of wavelet feature vectors with binary labels.

    All feature vectors are computed eagerly in the constructor by calling
    ``extract_wavelet_features`` on every ``.png`` file.
    """

    def __init__(self, benign_dir, malware_dir):
        """Extract features for both directories.

        Args:
            benign_dir: Directory whose PNG files get label 0.
            malware_dir: Directory whose PNG files get label 1.
        """
        self.data = []

        # Benign samples first, then malware, each in os.listdir order.
        for folder, label in ((benign_dir, 0), (malware_dir, 1)):
            for name in os.listdir(folder):
                if not name.endswith('.png'):
                    continue
                vector = extract_wavelet_features(os.path.join(folder, name))
                self.data.append((vector, label))

    def __len__(self):
        """Number of stored samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` with features as a float32 tensor."""
        vector, label = self.data[idx]
        return torch.FloatTensor(vector), label

# Neural-network model operating on wavelet features


class WaveletMalwareDetector(nn.Module):
    """Fully connected classifier over a fixed-length feature vector.

    Layout: ``input_size -> 256 -> 128 -> 64 -> 2`` with ReLU activations.
    Dropout (p=0.3) follows the first two hidden layers only.
    """

    def __init__(self, input_size):
        """Build the layers.

        Args:
            input_size: Length of each input feature vector.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 2)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Return the two raw class scores for each row of ``x``."""
        hidden = self.dropout(F.relu(self.fc1(x)))
        hidden = self.dropout(F.relu(self.fc2(hidden)))
        hidden = F.relu(self.fc3(hidden))
        return self.fc4(hidden)

# Train the wavelet-feature model


def train_wavelet_model(benign_dir, malware_dir, model_save_path, epochs=10, batch_size=32):
    """Train ``WaveletMalwareDetector`` on wavelet features and save its weights.

    If the cache file ``wavelet_dataset.pth`` exists in the working
    directory it is loaded and used as the dataset (in which case
    ``benign_dir`` / ``malware_dir`` are not read). Otherwise the dataset is
    built from the two directories; previously the directory arguments were
    ignored and a missing cache file made the function fail.

    The dataset is split 80/20 at random, the model is trained with Adam and
    cross-entropy loss, and per-epoch training loss and validation accuracy
    are printed. The final ``state_dict`` is written to ``model_save_path``.

    Args:
        benign_dir: Directory of benign PNG images (label 0).
        malware_dir: Directory of malware PNG images (label 1).
        model_save_path: File path for the saved ``state_dict``.
        epochs: Number of passes over the training subset.
        batch_size: Mini-batch size for both loaders.

    Returns:
        The feature-vector length the model was built with.

    Raises:
        ValueError: If there are too few samples to form a non-empty
            training and validation subset.
    """
    device = torch.device(
        "mps" if torch.backends.mps.is_available() else "cpu")

    cache_path = "wavelet_dataset.pth"
    if os.path.exists(cache_path):
        # NOTE(review): torch.load unpickles arbitrary Python objects, so only
        # load a cache file you created yourself. Recent PyTorch releases may
        # also require weights_only=False to load a pickled Dataset object -
        # TODO confirm against the installed version.
        dataset = torch.load(cache_path)
    else:
        # No cache available: compute the features from the image folders.
        dataset = WaveletMalwareDataset(benign_dir, malware_dir)

    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    if train_size == 0 or test_size == 0:
        # Fail early with a clear message instead of an IndexError on
        # dataset[0] or an opaque data-loader error.
        raise ValueError(
            f"Need at least 2 samples to build train/validation splits, "
            f"found {len(dataset)}")

    # Feature length is taken from the first sample.
    input_size = dataset[0][0].shape[0]

    train_dataset, test_dataset = torch.utils.data.random_split(
        dataset, [train_size, test_size])

    train_loader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    model = WaveletMalwareDetector(input_size).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        running_loss = 0.0
        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # ---- validation pass ----
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for features, labels in test_loader:
                features, labels = features.to(device), labels.to(device)
                outputs = model(features)
                # Predicted class = index of the larger logit.
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        print(f'Epoch {epoch + 1}, Loss: {running_loss/len(train_loader):.3f}, '
              f'Accuracy: {100 * correct / total:.2f}%')

    # Only the weights of the last epoch are stored.
    torch.save(model.state_dict(), model_save_path)
    print(f"Wavelet model saved to {model_save_path}")
    return input_size


def scan_file(cnn_model_path, wavelet_model_path, wavelet_input_size, file_path, device=None):
    """Score one file with the CNN and wavelet detectors.

    The file is rendered to a temporary PNG via ``convert_pe_to_image``,
    both models are loaded from disk and evaluated, and the temporary image
    is removed again - also when an error occurs part-way through.

    The wavelet model is evaluated, but the final score currently uses the
    CNN probability only (the blended variants are left commented out).

    Args:
        cnn_model_path: Path of the saved ``MalwareDetectionCNN`` state dict.
        wavelet_model_path: Path of the saved ``WaveletMalwareDetector``
            state dict.
        wavelet_input_size: Feature length the wavelet model was built with.
        file_path: File to scan.
        device: Torch device to run on; defaults to MPS when available,
            otherwise CPU.

    Returns:
        ``(None, None)`` if the file could not be converted to an image,
        otherwise ``(final_prediction, average_score)`` where
        ``final_prediction`` is 1 when the score exceeds 0.85 and 0 otherwise.
    """
    import tempfile

    if device is None:
        device = torch.device(
            "mps" if torch.backends.mps.is_available() else "cpu")

    # A unique temporary file avoids collisions between concurrent scans and
    # does not require the working directory to be writable.
    fd, temp_img_path = tempfile.mkstemp(suffix=".png")
    os.close(fd)  # only the path is needed; PIL reopens the file itself

    try:
        if not convert_pe_to_image(file_path, temp_img_path):
            return None, None

        # map_location lets checkpoints saved on another device (e.g. MPS)
        # load on whichever device is used for scanning.
        cnn_model = MalwareDetectionCNN().to(device)
        cnn_model.load_state_dict(
            torch.load(cnn_model_path, map_location=device))
        cnn_model.eval()

        wavelet_model = WaveletMalwareDetector(wavelet_input_size).to(device)
        wavelet_model.load_state_dict(
            torch.load(wavelet_model_path, map_location=device))
        wavelet_model.eval()

        # Same preprocessing as used for training the CNN.
        transform = transforms.Compose([
            transforms.Resize((384, 512)),
            transforms.ToTensor(),
        ])

        with Image.open(temp_img_path) as img:
            image = img.convert('L')
        cnn_input = transform(image).unsqueeze(0).to(device)

        wavelet_features = extract_wavelet_features(temp_img_path)
        wavelet_input = torch.FloatTensor(
            wavelet_features).unsqueeze(0).to(device)

        with torch.no_grad():
            cnn_outputs = cnn_model(cnn_input)
            wavelet_outputs = wavelet_model(wavelet_input)

            cnn_probs = F.softmax(cnn_outputs, dim=1)
            wavelet_probs = F.softmax(wavelet_outputs, dim=1)

            # Column 1 is the malware class.
            cnn_score = cnn_probs[0][1].item()
            wavelet_score = wavelet_probs[0][1].item()
            # average_score = (2/3)*cnn_score + (1/3)*wavelet_score
            # average_score = wavelet_score
            average_score = cnn_score

            # Flag as malware only above the 0.85 decision threshold.
            final_prediction = 1 if average_score > 0.85 else 0

        return final_prediction, average_score
    finally:
        # Always remove the temporary image, even after a failure.
        if os.path.exists(temp_img_path):
            os.remove(temp_img_path)


def scan_directory(cnn_model_path, wavelet_model_path, wavelet_input_size, scan_dir):
    """Run ``scan_file`` on every regular file directly inside a directory.

    No filtering by name or extension is applied: every entry for which
    ``os.path.isfile`` is true is scanned. Sub-directories are skipped.

    Args:
        cnn_model_path: Path of the saved CNN state dict.
        wavelet_model_path: Path of the saved wavelet-model state dict.
        wavelet_input_size: Feature length the wavelet model was built with.
        scan_dir: Directory to scan (not recursive).

    Returns:
        ``(results, detection_rate)``. ``results`` holds one dict per
        successfully scanned file with keys ``'file'``, ``'is_malware'`` and
        ``'malware_score'``. ``detection_rate`` is the percentage of files
        flagged as malware out of all regular files seen, including files
        whose scan failed; it is 0 when there were none.
    """
    device = torch.device(
        "mps" if torch.backends.mps.is_available() else "cpu")

    results = []
    total_files = 0
    detected_malware = 0

    for entry in os.listdir(scan_dir):
        candidate = os.path.join(scan_dir, entry)
        if not os.path.isfile(candidate):
            continue

        total_files += 1
        prediction, malware_score = scan_file(
            cnn_model_path,
            wavelet_model_path,
            wavelet_input_size,
            candidate,
            device
        )

        # A failed scan is counted in total_files but produces no result row.
        if prediction is None:
            continue

        results.append({
            'file': entry,
            'is_malware': bool(prediction),
            'malware_score': malware_score
        })
        if prediction == 1:
            detected_malware += 1

    if total_files > 0:
        detection_rate = detected_malware / total_files * 100
    else:
        detection_rate = 0
    return results, detection_rate

In [4]:
# Scan every file in ./scan_files with the saved detectors and print a
# per-file report. 40 is the wavelet feature-vector length the saved wavelet
# model is constructed with.
results, detection_rate = scan_directory(
    cnn_model_path="cnn_detector.pth",
    wavelet_model_path="wavelet_detector.pth",
    wavelet_input_size=40,
    scan_dir="scan_files",
)

print("\nScan Results:")
print(f"Detection Rate: {detection_rate:.2f}%")
for result in results:
    verdict = 'Malware' if result['is_malware'] else 'Benign'
    print(
        f"File: {result['file']}",
        f"Malware Score: {result['malware_score']:.2f}",
        f"Classification: {verdict}",
        "-" * 50,
        sep="\n",
    )


Scan Results:
Detection Rate: 60.00%
File: Gx7.exe
Malware Score: 0.95
Classification: Malware
--------------------------------------------------
File: .DS_Store
Malware Score: 0.90
Classification: Malware
--------------------------------------------------
File: lum.ps1
Malware Score: 0.01
Classification: Benign
--------------------------------------------------
File: mma.ps1
Malware Score: 0.04
Classification: Benign
--------------------------------------------------
File: d65165279105ca6773180500688df4bdc69a2c7b771752f0a46ef120b7fd8ec3
Malware Score: 0.92
Classification: Malware
--------------------------------------------------


