# Cisco - Ariel University API Security Detection Challenge 2023
## Label 1 code


### Imports and global settings

In [None]:
# Imports, settings and first dataset view
import pandas as pd
import seaborn as sns
import numpy as np
import json

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from collections import Counter


# Set pandas to show all columns when you print a dataframe
pd.set_option('display.max_columns', None)

# Which dataset files to read, and which column the model is trained to predict
dataset_number = 1
test_type = 'label'

# Read the training json and load it into a pandas dataframe object.
# The encoding is given explicitly so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open(f'./datasets/dataset_{dataset_number}_train.json', encoding='utf-8') as file:
    raw_ds = json.load(file)
# Nested json objects are flattened (up to two levels) into dotted column names
df = pd.json_normalize(raw_ds, max_level=2)

# Show the first five lines of the dataframe to see if everything was read accordingly
df.head()

### Basic dataset label arrangements

In [None]:
from urllib.parse import urlparse

# Extracts the path from the url
def url_path(row):
    """Return the path component of the URL stored under ``row["request.url"]``."""
    return urlparse(row["request.url"]).path

# Extracts the queries from the url
def url_query(row):
    """Return the query string (without the leading ``?``) of ``row["request.url"]``."""
    return urlparse(row["request.url"]).query

# Extracts the length of the url
def url_len(row):
    """Return the number of characters in ``row["request.url"]``."""
    request_url = row["request.url"]
    return len(request_url)

# Checks if any of the suspected headers contains a dollar sign
def has_dollarsign(row):
    """Report whether any of the suspected request headers contains a ``$``.

    A header that is absent from ``row``, or whose value is not a string
    (for example the NaN pandas stores when a request did not carry that
    header), is treated as not containing a dollar sign instead of raising.

    Args:
        row: Mapping-like record (a pandas row or a plain dict) keyed by the
            flattened column names.

    Returns:
        True if at least one inspected header value contains ``$``, else False.
    """
    suspected_headers = (
        "request.headers.Accept-Encoding",
        "request.headers.Sec-Fetch-Site",
        "request.headers.Sec-Fetch-Dest",
        "request.headers.Set-Cookie",
    )
    for header in suspected_headers:
        value = row.get(header)
        # Only strings can be searched; this function runs before missing
        # values are filled in, so NaN/None must be skipped here.
        if isinstance(value, str) and "$" in value:
            return True
    return False

In [None]:
# Fill the blank attack tag lines with the "Benign" string
attack_tags = df['request.Attack_Tag'].fillna('Benign')
df['request.Attack_Tag'] = attack_tags
# Keep the detailed attack tag in its own column
df['attack_type'] = attack_tags

# This function will be used in the lambda below to iterate over the label columns 
def categorize(row):
    """Collapse the attack tag of ``row`` into the binary label 'Benign' / 'Malware'."""
    is_benign = row['request.Attack_Tag'] == 'Benign'
    return 'Benign' if is_benign else 'Malware'

# Binary label column derived row by row from the attack tag
df['label'] = df.apply(categorize, axis=1)

# New feature columns: url length, url path, url query, and whether the
# suspected headers contain a dollar sign
df['url_length'] = df.apply(url_len, axis=1)
df["url_path"] = df.apply(url_path, axis=1)
df["url_query"] = df.apply(url_query, axis=1)
df["has_dollarsign"] = df.apply(has_dollarsign, axis=1)

# After finishing the arrangements we delete the source columns that are no longer needed
df.drop(columns=['request.Attack_Tag', 'request.url'], inplace=True)

df

In [None]:
# Replace every missing value with the string 'None'
# Only the columns that contain at least one NaN are visited
has_missing = df.isna().any()
for column in df.columns[has_missing]:
    df[column] = df[column].fillna('None')

df.head()

In [None]:
# Columns (request/response fields) that we want to leave out of our model
COLUMNS_TO_REMOVE = [
    'request.body',
    'response.headers.Content-Length',
    'request.headers.Date',
    "request.headers.Sec-Fetch-Dest",
    "request.headers.Connection",
    "request.headers.Accept",
    "request.headers.Sec-Fetch-User",
    "request.headers.Host",
    "response.headers.Set-Cookie",
    "response.status",
]

# This is our main preprocessing function that will iterate over all of the chosen 
# columns and run some feature extraction models
def vectorize_df(df):
    """Drop the unwanted columns of ``df`` and label-encode everything else.

    The columns listed in ``COLUMNS_TO_REMOVE`` are removed and every remaining
    column is replaced by its ``LabelEncoder`` integer codes. ``df`` is
    modified in place and is also returned.

    NOTE: the encoder is fitted from scratch on whatever frame is passed in, so
    the integer codes of two separately encoded frames are not guaranteed to
    agree with each other.

    Args:
        df: DataFrame to encode.

    Returns:
        The same DataFrame object, with only integer-coded columns left.
    """
    # Remove the columns that are not needed for the model first, so no time is
    # spent encoding data that is thrown away; a column that is already absent
    # is skipped instead of raising.
    df.drop(columns=COLUMNS_TO_REMOVE, errors='ignore', inplace=True)

    le = LabelEncoder()

    # Run LabelEncoder on the remaining features
    for column in df.columns.to_list():
        values = df[column]
        # Object columns may mix strings with numbers (e.g. a numeric column
        # whose NaNs were replaced by the string 'None'); LabelEncoder cannot
        # order such a mix, so these columns are encoded by their text form.
        if values.dtype == object:
            values = values.astype(str)
        df[column] = le.fit_transform(values)
    return df

# Encode the training dataframe (vectorize_df also removes the COLUMNS_TO_REMOVE columns)
df = vectorize_df(df)
# Preview the first rows of the encoded dataframe
df.head()

In [None]:
# Plot the pairwise correlation of the dataframe's columns as a heatmap,
# with the colour scale fixed to the [-1, 1] range and no per-cell annotations
heatmap = sns.heatmap(df.corr(), vmin=-1, vmax=1, annot=False)

In [None]:
# Memory check (for large datasets the dataframe may exceed the computer's resources).
# memory_usage="deep" asks pandas for the dataframe's real memory consumption.
df.info(memory_usage="deep")

In [None]:
# Choose the right features
# In our example code we choose all the columns as our feature

# Start from all the columns and take out the two target columns
features_list = df.columns.to_list()
for target_column in ('label', 'attack_type'):
    features_list.remove(target_column)
print(features_list)

# Recheck all datatypes before training to see we don't have any objects in our features
# In this example our model must get features containing only numbers, so we look
# once more for anything that was missed during preprocessing
df.dtypes

## Train test split

In [None]:
# Data train and test split preparations. Here we will insert our feature list and label list.
# Afterwards the data will be trained and fitted on the amazing RandomForest model
# X_Train and y_Train will be used for training
# X_test and y_test will be used for over fitting checking and overall score testing

# We convert the feature list to a numpy array, this is required for the model fitting
X = df[features_list].to_numpy()

# This column is the desired prediction we will train our model on.
# The column is already one-dimensional, so it is converted directly instead of
# being stacked element by element.
y = df[test_type].to_numpy()

# We split the dataset to train and test according to the required ratio
# Do not change the test_size -> you can change anything else
# stratify=y keeps the class proportions of y in both parts of the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1765, random_state=42, stratify=y)

# We print the shapes of the resulting datasets and count the samples per class
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
counter = Counter(y)
counter

## Model choosing and fitting

In [None]:
# We train the model on the train dataset.
# The seed is fixed (same value as the train/test split above) so that repeated
# runs build the same forest and report the same scores.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Check data balance and variety: number of training samples per class
print(sorted(Counter(y_train).items()))


## Result printout

In [None]:
# We print our results
sns.set(rc={'figure.figsize': (15, 8)})

# Ground truth and model output for the held-out test split
true_labels = y_test
predictions = clf.predict(X_test)

clf_matrix = confusion_matrix(true_labels, predictions)
clf_report = classification_report(true_labels, predictions, digits=5)

# Confusion matrix drawn as a heatmap; both axes are ticked with the distinct true labels
class_ticks = np.unique(true_labels)
heatmap = sns.heatmap(
    clf_matrix,
    annot=True,
    cmap='Blues',
    fmt='g',
    xticklabels=class_ticks,
    yticklabels=class_ticks,
)

# The heatmap is cool but this is the most important result
print(clf_report)

# Test

In [None]:
# Read the validation json, preprocess it and run the model.
# The encoding is given explicitly so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open(f'./datasets/dataset_{dataset_number}_val.json', encoding='utf-8') as file:
    raw_ds = json.load(file)
test_df = pd.json_normalize(raw_ds, max_level=2)

# Preprocess the validation dataset: build the same derived url/header feature
# columns that were built for the training dataframe
test_df['url_length'] = test_df.apply(url_len, axis=1)
test_df["url_path"] = test_df.apply(url_path, axis=1)
test_df["url_query"] = test_df.apply(url_query, axis=1)
test_df["has_dollarsign"] = test_df.apply(has_dollarsign, axis=1)
# Replace the missing values with the string 'None', as done for the training data
for column in test_df.columns[test_df.isna().any()].tolist():
    test_df[column] = test_df[column].fillna('None')
# NOTE(review): vectorize_df fits its encoder on this frame alone, so the integer
# codes produced here are not guaranteed to match the ones the model was trained
# on -- confirm this is acceptable for the chosen features.
test_df = vectorize_df(test_df)

# Predict with the model, using the same feature columns as in training
X = test_df[features_list].to_numpy()
predictions = clf.predict(X)

# Save your predictions

In [None]:
# Save the predictions, one integer class code per line.
# The model was trained on label-encoded targets, so its predictions already are
# the integer class codes and are written as they are. Re-encoding them here
# would renumber the classes whenever the predictions do not contain every class
# (for example, predictions that are all 1 would be saved as all 0).
np.savetxt(f'./datasets/dataset_{dataset_number}_{test_type}_result.txt', predictions, fmt='%2d')