In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

# 1. Data encoding
# Load the raw trace: whitespace-separated rows of `<timestamp> <syscall>`, no header row.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
ATTACK_SYSCALL_PATH = '/home/joon/SEMI/Research/Dataset/SocialNetwork-attack-data/scenario1/plain_workload/CVE-2016-5195/collection-1/node1/attack_syscall.txt'
data = pd.read_csv(
    ATTACK_SYSCALL_PATH,
    sep=r'\s+',  # documented replacement for the deprecated delim_whitespace=True
    header=None,
    names=['timestamp', 'syscall'],
)

# Build the syscall dictionary: each distinct syscall name gets an integer code,
# numbered in the order returned by Series.unique().
unique_syscalls = data['syscall'].unique()
syscall_dict = {syscall: idx for idx, syscall in enumerate(unique_syscalls)}

# Convert every syscall name to its integer code.
data['syscall_encoded'] = data['syscall'].map(syscall_dict)

  data = pd.read_csv('/home/joon/SEMI/Research/Dataset/SocialNetwork-attack-data/scenario1/plain_workload/CVE-2016-5195/collection-1/node1/attack_syscall.txt', delim_whitespace=True, header=None, names=['timestamp', 'syscall'])


In [3]:
# Show rows 30-59 (by position) of the encoded trace.
data.iloc[30:60]

Unnamed: 0,timestamp,syscall,syscall_encoded
30,1700249477000134395,getpid,1
31,1700249477000134434,getpid,1
32,1700249477000134732,getrusage,2
33,1700249477000134747,getrusage,2
34,1700249477000135039,futex,0
35,1700249477000138458,getrusage,2
36,1700249477000138525,getpid,1
37,1700249477000138639,getpid,1
38,1700249477000138939,getrusage,2
39,1700249477000144032,getrusage,2


In [6]:
# 2. N-gram generation
# Frame creation
def create_frames(encoded_data, frame_size):
    """Cut a sequence into consecutive, non-overlapping frames of equal length.

    Args:
        encoded_data: Sliceable sequence that supports ``len()``.
        frame_size: Number of items placed in each frame.

    Returns:
        A list of slices of ``encoded_data``, each ``frame_size`` items long.
        Trailing items that cannot fill a complete frame are left out.

    Raises:
        ZeroDivisionError: If ``frame_size`` is 0.
    """
    full_frame_count = len(encoded_data) // frame_size
    # Start offsets of every complete frame: 0, frame_size, 2 * frame_size, ...
    frame_starts = range(0, full_frame_count * frame_size, frame_size)
    return [encoded_data[start:start + frame_size] for start in frame_starts]

# Number of consecutive syscalls grouped into one frame.
frame_size = 10
# Pass the encoded column as a plain Python list and cut it into frames.
encoded_syscalls = data['syscall_encoded'].tolist()
frames = create_frames(encoded_syscalls, frame_size)

# # 프레임을 문자열로 변환
# frame_strings = [' '.join(map(str, frame)) for frame in frames]

In [11]:
# Preview only the leading frames; echoing the whole list floods the cell output.
frames[:30]

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 1, 1, 1, 1, 0, 0, 0],
 [0, 0, 0, 1, 1, 0, 0, 1, 1, 0],
 [1, 1, 2, 2, 0, 2, 1, 1, 2, 2],
 [2, 0, 0, 0, 0, 1, 1, 1, 1, 3],
 [3, 1, 1, 3, 3, 4, 3, 4, 5, 4],
 [3, 4, 5, 4, 4, 5, 5, 5, 6, 6],
 [6, 6, 5, 3, 3, 3, 3, 4, 6, 6],
 [4, 4, 5, 4, 5, 3, 5, 5, 3, 4],
 [4, 5, 6, 6, 6, 6, 5, 6, 6, 1],
 [1, 7, 7, 3, 1, 1, 7, 7, 3, 3],
 [5, 3, 5, 1, 1, 7, 7, 3, 3, 5],
 [5, 5, 5, 6, 6, 3, 3, 5, 5, 5],
 [5, 6, 6, 3, 3, 5, 5, 5, 5, 6],
 [6, 3, 3, 5, 5, 5, 5, 6, 6, 3],
 [3, 5, 5, 5, 5, 6, 6, 3, 3, 5],
 [5, 5, 5, 6, 6, 5, 5, 5, 6, 6],
 [3, 5, 5, 5, 6, 6, 3, 5, 5, 5],
 [5, 6, 6, 3, 3, 5, 5, 3, 5, 5],
 [6, 6, 3, 5, 3, 0, 3, 5, 5, 5],
 [5, 6, 6, 5, 5, 5, 6, 6, 3, 3],
 [5, 3, 3, 5, 5, 5, 5, 6, 6, 5],
 [5, 5, 3, 6, 6, 3, 5, 3, 3, 5],
 [5, 5, 5, 6, 6, 5, 5, 5, 3, 6],
 [6, 3, 5, 3, 3, 5, 5, 5, 5, 6],
 [6, 5, 5, 5, 6, 6, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 8, 8, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0,

In [13]:
# 3. Frequency-vector generation
# Take the integer codes from the syscall dictionary, in ascending order;
# this list fixes the order of the features.
syscall_list = list(syscall_dict.values())
syscall_list.sort()

# Count how often each syscall occurs within a frame
def compute_frequency_vector(frame, syscall_list):
    """Return the occurrence count of each listed syscall within one frame.

    Args:
        frame: Iterable of hashable syscall codes.
        syscall_list: Codes to report, in the desired output order.

    Returns:
        A list with one count per entry of ``syscall_list`` (0 when the code
        does not occur in ``frame``). Codes in ``frame`` that are not in
        ``syscall_list`` do not appear in the result.
    """
    counts = Counter(frame)
    vector = []
    for code in syscall_list:
        # Indexing a Counter with an absent key yields 0 and does not insert it.
        vector.append(counts[code])
    return vector

# Build one frequency vector per frame, then stack them into a single array
frequency_rows = [compute_frequency_vector(frame, syscall_list) for frame in frames]
feature_vectors = np.array(frequency_rows)


In [14]:
# Inspect the stacked per-frame frequency vectors.
feature_vectors

array([[10,  0,  0, ...,  0,  0,  0],
       [ 6,  4,  0, ...,  0,  0,  0],
       [ 6,  4,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 4,  0,  0, ...,  0,  0,  0]])

In [15]:
# Convert to a DataFrame with one `syscall_<code>` column per entry of syscall_list
feature_columns = [f'syscall_{code}' for code in syscall_list]
df_features = pd.DataFrame(feature_vectors, columns=feature_columns)

In [16]:
# Display the feature table built from feature_vectors.
df_features

Unnamed: 0,syscall_0,syscall_1,syscall_2,syscall_3,syscall_4,syscall_5,syscall_6,syscall_7,syscall_8,syscall_9,...,syscall_62,syscall_63,syscall_64,syscall_65,syscall_66,syscall_67,syscall_68,syscall_69,syscall_70,syscall_71
0,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,6,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,4,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,4,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52554,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52555,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52556,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52557,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
