In [None]:
import warnings

import numpy as np
import pandas as pd
import zat
from sklearn import metrics
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from zat.dataframe_to_matrix import DataFrameToMatrix
from zat.log_to_dataframe import LogToDataFrame

warnings.filterwarnings("ignore")

In [None]:
# Read log in python
def data_select(path1,path2,path3):
    """Load one capture's conn/ssl/flowmeter logs and return its established TLS flows.

    Each log is read with zat's ``LogToDataFrame``, reduced to a fixed list of
    feature columns, and the three reduced frames are outer-joined on ``uid``.
    The joined frame is then filtered to rows that have an ssl ``version`` and
    whose ``established`` value equals ``'T'``.

    Parameters
    ----------
    path1 : str
        Path of the conn log.
    path2 : str
        Path of the ssl log.
    path3 : str
        Path of the flowmeter log.

    Returns
    -------
    pandas.DataFrame
        One row per joined ``uid`` that passed both filters, holding the
        flowmeter, conn and ssl feature columns (in that order).

    Raises
    ------
    KeyError
        If a log does not contain one of the selected feature columns.
    """
    reader = LogToDataFrame()
    df_conn = reader.create_dataframe(path1)
    df_ssl = reader.create_dataframe(path2)
    df_flow = reader.create_dataframe(path3)
    print('Read in conn {:d} Rows...'.format(len(df_conn)))
    print('Read in ssl {:d} Rows...'.format(len(df_ssl)))
    print('Read in flowmeter {:d} Rows...'.format(len(df_flow)))

    # Feature selection.
    # (The former per-frame 'uid_length' column was never part of any feature
    # list below, so it is no longer computed.)
    features_conn = ['uid','orig_bytes','service', 'resp_bytes','conn_state',
                      'missed_bytes','orig_pkts','orig_ip_bytes','resp_pkts','resp_ip_bytes']
    feature_df_conn = df_conn[features_conn]

    features_ssl = ['uid','curve','resumed','established','client_cert_chain_fuids','version',
                     'cipher','subject','issuer']
    feature_df_ssl = df_ssl[features_ssl]

    features_flow = ['uid','flow_duration','fwd_pkts_tot','bwd_pkts_tot','fwd_data_pkts_tot','bwd_data_pkts_tot','fwd_pkts_per_sec','bwd_pkts_per_sec','flow_pkts_per_sec',
             'down_up_ratio','fwd_header_size_tot','fwd_header_size_min','fwd_header_size_max','bwd_header_size_tot','bwd_header_size_min','bwd_header_size_max',
             'flow_FIN_flag_count','flow_SYN_flag_count','flow_RST_flag_count','fwd_PSH_flag_count','bwd_PSH_flag_count','flow_ACK_flag_count',
             'fwd_URG_flag_count','bwd_URG_flag_count','flow_CWR_flag_count','flow_ECE_flag_count',
             'fwd_pkts_payload.max','fwd_pkts_payload.min','fwd_pkts_payload.tot','fwd_pkts_payload.avg','fwd_pkts_payload.std',
             'bwd_pkts_payload.max','bwd_pkts_payload.min','bwd_pkts_payload.tot','bwd_pkts_payload.avg','bwd_pkts_payload.std',
             'flow_pkts_payload.min','flow_pkts_payload.max','flow_pkts_payload.tot','flow_pkts_payload.avg','flow_pkts_payload.std',
             'fwd_iat.min','fwd_iat.max', 'fwd_iat.tot','fwd_iat.avg','fwd_iat.std','bwd_iat.max','bwd_iat.min','bwd_iat.tot','bwd_iat.avg','bwd_iat.std',
             'flow_iat.min','flow_iat.max','flow_iat.tot','flow_iat.avg','flow_iat.std','payload_bytes_per_second','fwd_subflow_pkts','bwd_subflow_pkts','fwd_subflow_bytes','bwd_subflow_bytes',
             'fwd_bulk_bytes','bwd_bulk_bytes','fwd_bulk_packets','bwd_bulk_packets','fwd_bulk_rate','bwd_bulk_rate','active.min','active.max','active.tot','active.avg','active.std',
             'idle.min','idle.max','idle.tot','idle.avg','idle.std','fwd_init_window_size','bwd_init_window_size','fwd_last_window_size','bwd_last_window_size']
    feature_df_flow = df_flow[features_flow]

    # Join the three feature frames on the shared connection id.
    df_f1 = pd.merge(feature_df_flow,feature_df_conn,how='outer',on='uid')
    df_fsm = pd.merge(df_f1,feature_df_ssl,how='outer',on='uid')
    # Only TLS flows: the outer join leaves 'version' missing for uids that
    # have no ssl row, so dropping those keeps ssl-backed rows only.
    df_onlytls = df_fsm.dropna(subset=['version'])
    # Keep only connections whose 'established' value is 'T'.
    df_onlytls1 = df_onlytls.query("established == 'T'")
    # Shapes before and after the 'established' filter.
    print(df_onlytls.shape,df_onlytls1.shape)
    return df_onlytls1

In [None]:
# Locations of the three logs handed to data_select(): conn, ssl, flowmeter.
# NOTE(review): every family cell below passes these same three variables, so
# they presumably have to be re-pointed at each family's capture before that
# family's cell is run - confirm, otherwise all families load identical data.
path1, path2, path3 = (
    r".../conn.log",
    r".../ssl.log",
    r".../flowmeter.log",
)

# Malware families

# Dridex

In [None]:
Dridex = data_select(path1,path2,path3)

In [None]:
Dridex = Dridex.iloc[:4969,:]

# Tickbot

In [None]:
Tickbot = data_select(path1,path2,path3)

In [None]:
Tickbot = Tickbot.iloc[:5045,:]

# T-Rasftuby

In [None]:
TRasftuby = data_select(path1,path2,path3)

# Dyname

In [None]:
Dyname = data_select(path1,path2,path3)

In [None]:
Dyname = Dyname.iloc[:5154,:]

# Bunitu

In [None]:
Bunitu = data_select(path1,path2,path3)

# Cobalt

In [None]:
Cobalt = data_select(path1,path2,path3)

# Yakes

In [None]:
Yakes = data_select(path1,path2,path3)

# Normal

In [None]:
Normal = data_select(path1,path2,path3)

In [None]:
Normal = Normal.iloc[:20000,:]

# Create feature set

In [None]:
# Stack the per-family frames row-wise: all malware families first, then the
# benign frame, so that df's row order is "malware block, normal block".
malware_frames = [Dridex, Tickbot, TRasftuby, Dyname, Bunitu, Cobalt, Yakes]
df_malware = pd.concat(malware_frames)
df_normal = Normal
df = pd.concat([df_malware, df_normal])
print('Malware size: {:d}'.format(len(df_malware)))
print('Normal size: {:d}'.format(len(df_normal)))

# Labels (malware = -1, normal = 1)

In [None]:
# 1-D label vector in the same row order as df: -1 for every malware row,
# followed by +1 for every normal row.
y = np.hstack([
    np.full(len(df_malware), -1),
    np.full(len(df_normal), 1),
])

# Feature standardization

In [None]:
# The id and the three text columns are excluded here; 'cipher', 'subject'
# and 'issuer' are vectorised separately in the cells below.
to_matrix = DataFrameToMatrix()
tabular_features = df.drop(columns=['uid','cipher','subject','issuer'])
x1 = to_matrix.fit_transform(tabular_features)

# Encode the text columns (cipher, subject, issuer)

In [None]:
# Token-count matrix for the 'cipher' column, densified for later stacking.
ccv = CountVectorizer()
df_cipher = df.loc[:, 'cipher']
ccv_fit = ccv.fit_transform(df_cipher)
x2 = ccv_fit.toarray()
print(x2.shape)

In [None]:
# TF-IDF over the 'subject' column.
# astype('U') converts every value to str so the vectorizer gets text only.
df_subject = df['subject'].values.astype('U')
# Stop words are matched against tokens AFTER TfidfVectorizer's default
# lowercasing and tokenisation (token_pattern r"(?u)\b\w\w+\b"), so entries
# such as 'CN=' or ',' can never match a token and filtered nothing.
# List the bare, lowercase attribute labels instead. The one-letter labels are
# already dropped by the default token pattern; they stay listed for intent.
stopWord_list = ['cn','ou','o','l','st','c']
tfidf = TfidfVectorizer(max_df=0.9, min_df=0.003, stop_words=stopWord_list)
arr_subject = tfidf.fit_transform(df_subject).toarray()
print(arr_subject.shape,'\n',arr_subject)
x3 = arr_subject

In [None]:
# TF-IDF over the 'issuer' column.
# astype('U') converts every value to str so the vectorizer gets text only.
df_issuer = df['issuer'].values.astype('U')
# Stop words are matched against tokens AFTER TfidfVectorizer's default
# lowercasing and tokenisation (token_pattern r"(?u)\b\w\w+\b"), so entries
# such as 'CN=' or ',' can never match a token and filtered nothing.
# List the bare, lowercase attribute labels instead. The one-letter labels are
# already dropped by the default token pattern; they stay listed for intent.
stopWord_list = ['cn','ou','o','l','st','c']
tfidf = TfidfVectorizer(max_df=0.9, min_df=0.003, stop_words=stopWord_list)
arr_issuer = tfidf.fit_transform(df_issuer).toarray()
print(arr_issuer.shape,'\n',arr_issuer)
x4 = arr_issuer

# Merge

In [None]:
# Column-wise stack of the four feature blocks; each block must have the same
# number of rows for the concatenation along axis 1 to succeed.
feature_blocks = [x1, x2, x3, x4]
X = np.concatenate(feature_blocks, axis=1)

# save as .csv

In [None]:
# Write the feature matrix and the labels with np.savetxt's defaults.
# NOTE(review): the default delimiter is a single space, so these ".csv" files
# are not comma-separated - confirm that whatever reads them expects that.
for out_name, array in (('M7_2w.csv', X), ('y4w.csv', y)):
    np.savetxt(out_name, array)