In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report

In [13]:
data = pd.read_csv('dataset.csv')
data.head()

Unnamed: 0,URL,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CHARSET,SERVER,CONTENT_LENGTH,WHOIS_COUNTRY,WHOIS_STATEPRO,WHOIS_REGDATE,WHOIS_UPDATED_DATE,...,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,SOURCE_APP_PACKETS,REMOTE_APP_PACKETS,SOURCE_APP_BYTES,REMOTE_APP_BYTES,APP_PACKETS,DNS_QUERY_TIMES,Type
0,M0_109,16,7,iso-8859-1,nginx,263.0,,,10/10/2015 18:21,,...,0,2,700,9,10,1153,832,9,2.0,1
1,B0_2314,16,6,UTF-8,Apache/2.4.10,15087.0,,,,,...,7,4,1230,17,19,1265,1230,17,0.0,0
2,B0_911,16,6,us-ascii,Microsoft-HTTPAPI/2.0,324.0,,,,,...,0,0,0,0,0,0,0,0,0.0,0
3,B0_113,17,6,ISO-8859-1,nginx,162.0,US,AK,7/10/1997 4:00,12/09/2013 0:45,...,22,3,3812,39,37,18784,4380,39,8.0,0
4,B0_403,17,6,UTF-8,,124140.0,US,TX,12/05/1996 0:00,11/04/2017 0:00,...,2,5,4278,61,62,129889,4586,61,4.0,0


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1781 entries, 0 to 1780
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   URL                        1781 non-null   object 
 1   URL_LENGTH                 1781 non-null   int64  
 2   NUMBER_SPECIAL_CHARACTERS  1781 non-null   int64  
 3   CHARSET                    1781 non-null   object 
 4   SERVER                     1780 non-null   object 
 5   CONTENT_LENGTH             969 non-null    float64
 6   WHOIS_COUNTRY              1781 non-null   object 
 7   WHOIS_STATEPRO             1781 non-null   object 
 8   WHOIS_REGDATE              1781 non-null   object 
 9   WHOIS_UPDATED_DATE         1781 non-null   object 
 10  TCP_CONVERSATION_EXCHANGE  1781 non-null   int64  
 11  DIST_REMOTE_TCP_PORT       1781 non-null   int64  
 12  REMOTE_IPS                 1781 non-null   int64  
 13  APP_BYTES                  1781 non-null   int64

In [15]:
# Quick statistical summary of data
data.describe(include='all')

Unnamed: 0,URL,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CHARSET,SERVER,CONTENT_LENGTH,WHOIS_COUNTRY,WHOIS_STATEPRO,WHOIS_REGDATE,WHOIS_UPDATED_DATE,...,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,SOURCE_APP_PACKETS,REMOTE_APP_PACKETS,SOURCE_APP_BYTES,REMOTE_APP_BYTES,APP_PACKETS,DNS_QUERY_TIMES,Type
count,1781,1781.0,1781.0,1781,1780,969.0,1781,1781,1781.0,1781.0,...,1781.0,1781.0,1781.0,1781.0,1781.0,1781.0,1781.0,1781.0,1780.0,1781.0
unique,1781,,,9,239,,49,182,891.0,594.0,...,,,,,,,,,,
top,B0_2077,,,UTF-8,Apache,,US,CA,,,...,,,,,,,,,,
freq,1,,,676,386,,1103,372,127.0,139.0,...,,,,,,,,,,
mean,,56.961258,11.111735,,,11726.927761,,,,,...,5.472768,3.06064,2982.339,18.540146,18.74621,15892.55,3155.599,18.540146,2.263483,0.12128
std,,27.555586,4.549896,,,36391.809051,,,,,...,21.807327,3.386975,56050.57,41.627173,46.397969,69861.93,56053.78,41.627173,2.930853,0.326544
min,,16.0,5.0,,,0.0,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,39.0,8.0,,,324.0,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,,49.0,10.0,,,1853.0,,,,,...,0.0,2.0,672.0,8.0,9.0,579.0,735.0,8.0,0.0,0.0
75%,,68.0,13.0,,,11323.0,,,,,...,5.0,5.0,2328.0,26.0,25.0,9806.0,2701.0,26.0,4.0,0.0


From the table, we can see that there are some columns which contain unique values, especially URL that is totally unique.

For example:

URL (100% unique)
WHOIS_REGDATE (50% unique)
WHOIS_UPDATED_DATE (33% unique)

In [18]:
def onehot_encode(df, column_dict):
    df_copy = df.copy()
    for column, prefix in column_dict.items():
        dummies = pd.get_dummies(df[column], prefix=prefix)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(column, axis=1)
    return df_copy

In [17]:
def preprocess_inputs(df):
    df = df.copy()
    
    # Drop URL column
    df = df.drop('URL', axis=1)
    
    # Extract datetime features
    for column in ['WHOIS_REGDATE', 'WHOIS_UPDATED_DATE']:
        df[column] = pd.to_datetime(df[column], errors='coerce')
    
    df['REG_YEAR'] = df['WHOIS_REGDATE'].apply(lambda x: x.year)
    df['REG_MONTH'] = df['WHOIS_REGDATE'].apply(lambda x: x.month)
    df['REG_DAY'] = df['WHOIS_REGDATE'].apply(lambda x: x.day)
    df['REG_HOUR'] = df['WHOIS_REGDATE'].apply(lambda x: x.hour)
    df['REG_MINUTE'] = df['WHOIS_REGDATE'].apply(lambda x: x.minute)
    
    df['UPD_YEAR'] = df['WHOIS_UPDATED_DATE'].apply(lambda x: x.year)
    df['UPD_MONTH'] = df['WHOIS_UPDATED_DATE'].apply(lambda x: x.month)
    df['UPD_DAY'] = df['WHOIS_UPDATED_DATE'].apply(lambda x: x.day)
    df['UPD_HOUR'] = df['WHOIS_UPDATED_DATE'].apply(lambda x: x.hour)
    df['UPD_MINUTE'] = df['WHOIS_UPDATED_DATE'].apply(lambda x: x.minute)
    
    df = df.drop(['WHOIS_REGDATE', 'WHOIS_UPDATED_DATE'], axis=1)
    
    
    # One-hot encode categorical features
    for column in ['CHARSET', 'SERVER', 'WHOIS_COUNTRY', 'WHOIS_STATEPRO']:
        df[column] = df[column].apply(lambda x: x.lower() if str(x) != 'nan' else x)
    
    df = onehot_encode(
        df,
        column_dict={
            'CHARSET': 'CH',
            'SERVER': 'SV',
            'WHOIS_COUNTRY': 'WC',
            'WHOIS_STATEPRO': 'WS'
        }
    )
    
    # Fill missing values
    missing_value_columns = df.columns[df.isna().sum() > 0]
    
    for column in missing_value_columns:
        df[column] = df[column].fillna(df[column].mean())
    
    # Split df into X and y
    y = df['Type'].copy()
    X = df.drop('Type', axis=1).copy()
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)
    
    # Scale X with a standard scaler
    scaler = StandardScaler()
    scaler.fit(X_train)
    
    X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)
    
    # Remove feature columns that were reduced to single-value columns during the train-test split
    single_value_columns = X_train.columns[[len(X_train[column].unique()) == 1 for column in X_train.columns]]
    
    X_train = X_train.drop(single_value_columns, axis=1)
    X_test = X_test.drop(single_value_columns, axis=1)
    
    return X_train, X_test, y_train, y_test

In [19]:
def evaluate_model(model, X_test, y_test):
    
    model_acc = model.score(X_test, y_test)
    print("Test Accuracy: {:.2f}%".format(model_acc * 100))
    
    y_true = np.array(y_test)
    y_pred = model.predict(X_test)
    
    cm = confusion_matrix(y_true, y_pred)
    clr = classification_report(y_true, y_pred, target_names=["BENIGN", "MALIGNANT"])
    
    plt.figure(figsize=(8, 8))
    sns.heatmap(cm, annot=True, vmin=0, fmt='g', cmap='Blues', cbar=False)
    plt.xticks(np.arange(2) + 0.5, ["BENIGN", "MALIGNANT"])
    plt.yticks(np.arange(2) + 0.5, ["BENIGN", "MALIGNANT"])
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()
    
    print("Classification Report:\n----------------------\n", clr)

## Data Preprocessing

In [20]:
data

Unnamed: 0,URL,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CHARSET,SERVER,CONTENT_LENGTH,WHOIS_COUNTRY,WHOIS_STATEPRO,WHOIS_REGDATE,WHOIS_UPDATED_DATE,...,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,SOURCE_APP_PACKETS,REMOTE_APP_PACKETS,SOURCE_APP_BYTES,REMOTE_APP_BYTES,APP_PACKETS,DNS_QUERY_TIMES,Type
0,M0_109,16,7,iso-8859-1,nginx,263.0,,,10/10/2015 18:21,,...,0,2,700,9,10,1153,832,9,2.0,1
1,B0_2314,16,6,UTF-8,Apache/2.4.10,15087.0,,,,,...,7,4,1230,17,19,1265,1230,17,0.0,0
2,B0_911,16,6,us-ascii,Microsoft-HTTPAPI/2.0,324.0,,,,,...,0,0,0,0,0,0,0,0,0.0,0
3,B0_113,17,6,ISO-8859-1,nginx,162.0,US,AK,7/10/1997 4:00,12/09/2013 0:45,...,22,3,3812,39,37,18784,4380,39,8.0,0
4,B0_403,17,6,UTF-8,,124140.0,US,TX,12/05/1996 0:00,11/04/2017 0:00,...,2,5,4278,61,62,129889,4586,61,4.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1776,M4_48,194,16,UTF-8,Apache,,ES,Barcelona,17/09/2008 0:00,2/09/2016 0:00,...,0,0,0,0,3,186,0,0,0.0,1
1777,M4_41,198,17,UTF-8,Apache,,ES,Barcelona,17/09/2008 0:00,2/09/2016 0:00,...,0,0,0,0,2,124,0,0,0.0,1
1778,B0_162,201,34,utf-8,Apache/2.2.16 (Debian),8904.0,US,FL,15/02/1999 0:00,15/07/2015 0:00,...,2,6,6631,87,89,132181,6945,87,4.0,0
1779,B0_1152,234,34,ISO-8859-1,cloudflare-nginx,,US,CA,1/04/1998 0:00,9/12/2016 0:00,...,0,0,0,0,0,0,0,0,0.0,0


In [21]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

TypeError: can only concatenate str (not "int") to str