In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Load the raw website dataset (path is relative to the working directory) and preview the first rows
data = pd.read_csv('malicious_website_dataset.csv')
data.head()

Unnamed: 0,URL,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CHARSET,SERVER,CONTENT_LENGTH,WHOIS_COUNTRY,WHOIS_STATEPRO,WHOIS_REGDATE,WHOIS_UPDATED_DATE,...,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,SOURCE_APP_PACKETS,REMOTE_APP_PACKETS,SOURCE_APP_BYTES,REMOTE_APP_BYTES,APP_PACKETS,DNS_QUERY_TIMES,Type
0,M0_109,16,7,iso-8859-1,nginx,263.0,,,10/10/2015 18:21,,...,0,2,700,9,10,1153,832,9,2.0,1
1,B0_2314,16,6,UTF-8,Apache/2.4.10,15087.0,,,,,...,7,4,1230,17,19,1265,1230,17,0.0,0
2,B0_911,16,6,us-ascii,Microsoft-HTTPAPI/2.0,324.0,,,,,...,0,0,0,0,0,0,0,0,0.0,0
3,B0_113,17,6,ISO-8859-1,nginx,162.0,US,AK,7/10/1997 4:00,12/09/2013 0:45,...,22,3,3812,39,37,18784,4380,39,8.0,0
4,B0_403,17,6,UTF-8,,124140.0,US,TX,12/05/1996 0:00,11/04/2017 0:00,...,2,5,4278,61,62,129889,4586,61,4.0,0


In [3]:
# Show each column's dtype and non-null count to spot columns with missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1781 entries, 0 to 1780
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   URL                        1781 non-null   object 
 1   URL_LENGTH                 1781 non-null   int64  
 2   NUMBER_SPECIAL_CHARACTERS  1781 non-null   int64  
 3   CHARSET                    1781 non-null   object 
 4   SERVER                     1780 non-null   object 
 5   CONTENT_LENGTH             969 non-null    float64
 6   WHOIS_COUNTRY              1781 non-null   object 
 7   WHOIS_STATEPRO             1781 non-null   object 
 8   WHOIS_REGDATE              1781 non-null   object 
 9   WHOIS_UPDATED_DATE         1781 non-null   object 
 10  TCP_CONVERSATION_EXCHANGE  1781 non-null   int64  
 11  DIST_REMOTE_TCP_PORT       1781 non-null   int64  
 12  REMOTE_IPS                 1781 non-null   int64  
 13  APP_BYTES                  1781 non-null   int64

In [49]:
# Quick statistical summary of every column; include='all' adds the non-numeric columns
# (count / unique / top / freq) alongside the numeric statistics
data.describe(include='all')

Unnamed: 0,URL,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CHARSET,SERVER,CONTENT_LENGTH,WHOIS_COUNTRY,WHOIS_STATEPRO,WHOIS_REGDATE,WHOIS_UPDATED_DATE,...,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,SOURCE_APP_PACKETS,REMOTE_APP_PACKETS,SOURCE_APP_BYTES,REMOTE_APP_BYTES,APP_PACKETS,DNS_QUERY_TIMES,Type
count,1781,1781.0,1781.0,1781,1780,969.0,1781,1781,1781.0,1781.0,...,1781.0,1781.0,1781.0,1781.0,1781.0,1781.0,1781.0,1781.0,1780.0,1781.0
unique,1781,,,9,239,,49,182,891.0,594.0,...,,,,,,,,,,
top,B0_1229,,,UTF-8,Apache,,US,CA,,,...,,,,,,,,,,
freq,1,,,676,386,,1103,372,127.0,139.0,...,,,,,,,,,,
mean,,56.961258,11.111735,,,11726.927761,,,,,...,5.472768,3.06064,2982.339,18.540146,18.74621,15892.55,3155.599,18.540146,2.263483,0.12128
std,,27.555586,4.549896,,,36391.809051,,,,,...,21.807327,3.386975,56050.57,41.627173,46.397969,69861.93,56053.78,41.627173,2.930853,0.326544
min,,16.0,5.0,,,0.0,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,39.0,8.0,,,324.0,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,,49.0,10.0,,,1853.0,,,,,...,0.0,2.0,672.0,8.0,9.0,579.0,735.0,8.0,0.0,0.0
75%,,68.0,13.0,,,11323.0,,,,,...,5.0,5.0,2328.0,26.0,25.0,9806.0,2701.0,26.0,4.0,0.0


In [98]:
def onehot_encode(df, column_dict):
    """Replace each listed column with its one-hot indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified.
    column_dict : dict
        Maps the name of a column to encode to the prefix used for the
        indicator columns generated from it.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every column named in ``column_dict`` has been
        removed and its indicator columns appended on the right, in the
        iteration order of ``column_dict``.
    """
    encoded = df.copy()
    for source_column in column_dict:
        indicators = pd.get_dummies(
            encoded[source_column],
            prefix=column_dict[source_column],
        )
        # Append the indicators first, then drop the column they were derived from
        encoded = pd.concat([encoded, indicators], axis=1).drop(source_column, axis=1)
    return encoded

In [99]:
def data_cleaning(data):
    """Build the feature table used for modelling from the raw website dataset.

    The returned frame contains, in this order: the standardized numeric
    columns listed in ``columns_to_scale``, the one-hot encoded categorical
    columns (``CHARSET``, ``SERVER``, ``WHOIS_COUNTRY``, ``WHOIS_STATEPRO``)
    and the ``Type`` target. Every other input column (``URL``, the WHOIS
    date columns, ...) is left out of the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw dataset. Must contain the numeric columns listed in
        ``columns_to_scale``, the four categorical columns and ``Type``.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per input row, carrying the same index as ``data``.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``data``.
    """
    categorical_prefixes = {
        'CHARSET': 'CH',
        'SERVER': 'SV',
        'WHOIS_COUNTRY': 'WC',
        'WHOIS_STATEPRO': 'WS'
    }
    columns_to_scale = ['URL_LENGTH', 'NUMBER_SPECIAL_CHARACTERS', 'TCP_CONVERSATION_EXCHANGE',
       'DIST_REMOTE_TCP_PORT', 'REMOTE_IPS', 'APP_BYTES', 'SOURCE_APP_PACKETS',
       'REMOTE_APP_PACKETS', 'SOURCE_APP_BYTES', 'REMOTE_APP_BYTES',
       'APP_PACKETS']

    # Work on a copy so the caller's frame is never mutated (the lower-casing
    # below used to be written back into the argument itself).
    data_copy = data.copy()

    # Normalize case so e.g. 'UTF-8' and 'utf-8' end up in the same indicator
    # column; non-string entries (missing values) are passed through unchanged.
    for column in categorical_prefixes:
        data_copy[column] = data_copy[column].map(
            lambda value: value.lower() if isinstance(value, str) else value
        )

    encoded_df = onehot_encode(
        data_copy[list(categorical_prefixes)],
        column_dict=categorical_prefixes
    )

    # Impute with means computed on the numeric feature columns only, so the
    # object columns never take part in the mean computation.
    numeric_df = data_copy[columns_to_scale]
    numeric_df = numeric_df.fillna(numeric_df.mean())

    # Everything below reads from local frames only; the function no longer
    # depends on a `clean_df` variable existing in the notebook's global state.
    sc = StandardScaler()
    scaled_values = sc.fit_transform(numeric_df)
    scaled_df = pd.DataFrame(scaled_values, index=numeric_df.index, columns=columns_to_scale)

    final_df = pd.concat([scaled_df, encoded_df, data_copy['Type']], axis=1)

    return final_df

In [89]:
def evaluate_model(model, X_test, y_test):
    """Report how a fitted binary classifier performs on held-out data.

    Prints the test accuracy, shows the confusion matrix as a heatmap and then
    prints the classification report. Nothing is returned.

    Parameters
    ----------
    model
        Fitted estimator; its ``score`` and ``predict`` methods are called.
    X_test
        Held-out feature matrix passed to ``model``.
    y_test
        True labels for ``X_test``.
    """
    # NOTE(review): "MALIGNANT" presumably means "malicious" here; the label text is kept as-is.
    class_names = ["BENIGN", "MALIGNANT"]
    # Offsets of 0.5 place each tick label at the centre of its heatmap cell
    tick_positions = np.arange(2) + 0.5

    accuracy = model.score(X_test, y_test)
    print("Test Accuracy: {:.2f}%".format(accuracy * 100))

    actual = np.array(y_test)
    predicted = model.predict(X_test)

    matrix = confusion_matrix(actual, predicted)
    report = classification_report(actual, predicted, target_names=class_names)

    plt.figure(figsize=(8, 8))
    sns.heatmap(matrix, annot=True, vmin=0, fmt='g', cmap='Blues', cbar=False)
    plt.xticks(tick_positions, class_names)
    plt.yticks(tick_positions, class_names)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()

    print("Classification Report:\n----------------------\n", report)

In [111]:
# Run the cleaning pipeline on the raw frame and preview the first rows of the result
clean_data = data_cleaning(data)
clean_data.head()

Unnamed: 0,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,TCP_CONVERSATION_EXCHANGE,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,SOURCE_APP_PACKETS,REMOTE_APP_PACKETS,SOURCE_APP_BYTES,REMOTE_APP_BYTES,...,WS_west midlands,WS_wi,WS_widestep@mail.ru,WS_wisconsin,WS_worcs,WS_wv,WS_zh,WS_zhejiang,WS_zug,Type
0,-1.486913,-0.903952,-0.228728,-0.251031,-0.313241,-0.040731,-0.229245,-0.188557,-0.21104,-0.041465,...,0,0,0,0,0,0,0,0,0,1
1,-1.486913,-1.123799,0.018249,0.070053,0.277423,-0.031272,-0.037009,0.005471,-0.209437,-0.034362,...,0,0,0,0,0,0,0,0,0,0
2,-1.486913,-1.123799,-0.401611,-0.251031,-0.903904,-0.053223,-0.445511,-0.404144,-0.227549,-0.056312,...,0,0,0,0,0,0,0,0,0,0
3,-1.450613,-1.123799,0.364017,0.758088,-0.017909,0.014806,0.49164,0.393528,0.0414,0.021849,...,0,0,0,0,0,0,0,0,0,0
4,-1.450613,-1.123799,1.006157,-0.159292,0.572754,0.023122,1.02029,0.932496,1.632198,0.025526,...,0,0,0,0,0,0,0,0,0,0


In [115]:
# Separate the target vector from the feature matrix
Y = clean_data['Type'].to_numpy()
x = clean_data.drop(columns='Type').to_numpy()

# Hold out 20% of the rows for testing; the split is stratified on the label
# and uses a fixed seed so it is repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=100,
)

From the `data.describe(include='all')` summary table, we can see that some columns consist largely of unique values; `URL` in particular is unique for every row.

For example:

URL (100% unique)
WHOIS_REGDATE (50% unique)
WHOIS_UPDATED_DATE (33% unique)

In [116]:
# Inspect the first training row: the scaled numeric features come first, then the one-hot indicators
x_train[0]

array([-0.28899744, -0.24441157, -0.40161148, -0.25103055, -0.90390376,
       -0.05322295, -0.44551075, -0.40414431, -0.22754896, -0.05631172,
       -0.44551075,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

In [117]:
# Inspect the training labels
y_train

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [118]:
# Logistic regression with manual class weights: label 0 keeps weight 1, label 1 gets 0.2.
# NOTE(review): the describe() output above puts the mean of `Type` at ~0.12, so label 1
# is the minority class and this weighting reduces its influence further — confirm that
# this is intended rather than the reverse.
model = LogisticRegression(class_weight={0: 1, 1: 0.2})

# Fit on the training split; the fitted estimator is the cell's displayed value
model.fit(x_train, y_train)

LogisticRegression(class_weight={0: 1, 1: 0.2})

## Data Preprocessing

In [8]:
# Display the raw frame (pandas truncates the rendering of long frames)
data

Unnamed: 0,URL,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CHARSET,SERVER,CONTENT_LENGTH,WHOIS_COUNTRY,WHOIS_STATEPRO,WHOIS_REGDATE,WHOIS_UPDATED_DATE,...,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,SOURCE_APP_PACKETS,REMOTE_APP_PACKETS,SOURCE_APP_BYTES,REMOTE_APP_BYTES,APP_PACKETS,DNS_QUERY_TIMES,Type
0,M0_109,16,7,iso-8859-1,nginx,263.0,,,10/10/2015 18:21,,...,0,2,700,9,10,1153,832,9,2.0,1
1,B0_2314,16,6,UTF-8,Apache/2.4.10,15087.0,,,,,...,7,4,1230,17,19,1265,1230,17,0.0,0
2,B0_911,16,6,us-ascii,Microsoft-HTTPAPI/2.0,324.0,,,,,...,0,0,0,0,0,0,0,0,0.0,0
3,B0_113,17,6,ISO-8859-1,nginx,162.0,US,AK,7/10/1997 4:00,12/09/2013 0:45,...,22,3,3812,39,37,18784,4380,39,8.0,0
4,B0_403,17,6,UTF-8,,124140.0,US,TX,12/05/1996 0:00,11/04/2017 0:00,...,2,5,4278,61,62,129889,4586,61,4.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1776,M4_48,194,16,UTF-8,Apache,,ES,Barcelona,17/09/2008 0:00,2/09/2016 0:00,...,0,0,0,0,3,186,0,0,0.0,1
1777,M4_41,198,17,UTF-8,Apache,,ES,Barcelona,17/09/2008 0:00,2/09/2016 0:00,...,0,0,0,0,2,124,0,0,0.0,1
1778,B0_162,201,34,utf-8,Apache/2.2.16 (Debian),8904.0,US,FL,15/02/1999 0:00,15/07/2015 0:00,...,2,6,6631,87,89,132181,6945,87,4.0,0
1779,B0_1152,234,34,ISO-8859-1,cloudflare-nginx,,US,CA,1/04/1998 0:00,9/12/2016 0:00,...,0,0,0,0,0,0,0,0,0.0,0


In [11]:
# NOTE(review): `preprocess_inputs` is not defined in the visible part of this notebook, and
# the recorded run of this cell ended in a TypeError — confirm whether this cell is stale.
# It also rebinds `y_train` / `y_test`, which the train/test split cell assigns as well.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

TypeError: can only concatenate str (not "int") to str