In [8]:
import sklearn as sk;
import pandas as pd;
import numpy as np;
import seaborn as sns;
import matplotlib.pyplot as plt;
import tqdm;
import os;
import sys;

In [9]:
if os.path.isdir(r'C:\Users\User\Desktop\Data-mining-group-project'):
    os.chdir(r'C:\Users\User\Desktop\Data-mining-group-project') #change to training data directory
try:
    train = pd.read_csv(open('UNSW_NB15_training-set.csv'))
    test = pd.read_csv(open('UNSW_NB15_testing-set.csv'))
except FileNotFoundError as e:
    print(f"error: {e}\nTry changing the training data directory in 'os.chdir'")

In [10]:
print(train.shape, test.shape)
# 45 attributes

(175341, 45) (82332, 45)


In [11]:
train.isnull().any(axis=1).sum()

np.int64(0)

In [12]:
test.isnull().any(axis=1).sum()

np.int64(0)

In [13]:
def show_outliers_iqr(series):
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return (lower_bound, upper_bound)

# data cleaning (replacement)
def replace_outliers_iqr(df, cols):
    df = df.copy()
    for col in cols:
        if col in df.columns:
            lower_bound, upper_bound = show_outliers_iqr(df[col])
            df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)
    return df

# data filtering
def remove_outliers_iqr(df, cols):
    df = df.copy()
    overall_mask = pd.Series(True, index=df.index)

    for col in cols:
        if col in df.columns:
            lower_bound, upper_bound = show_outliers_iqr(df[col])
            col_mask = (df[col] >= lower_bound) & (df[col] <= upper_bound)
            overall_mask = overall_mask & col_mask

    df = df[overall_mask]
    return df

def risky_show_shape(mode=2):
    if mode == 1:
        print("train/test")
        print(train.shape, test.shape)
    else:
        for a in ["x", "y"]:
            print(f"{a}_train/test")
            exec(f"print({a}_train.shape, {a}_test.shape)")

def run_model(model, x_train, y_train, x_test, y_test):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    return y_pred

def bin_decision(series, threshold=0.2):
    keep = []

    val_count = series.value_counts()
    for col in val_count.index:
        if val_count[col] / series.shape[0] > threshold:
            keep.append(col)
    return keep

def binning(df, col, values):
    df = df.copy()

    for val in values:
        df[col].replace(val, 'Others')
    return df

def col_encoder(df, col):
    df = df.copy()

    encoded_col = pd.get_dummies(df[col], prefix=col)
    df_encoded = pd.concat([df.drop(col, axis=1), encoded_col], axis=1)
    return df_encoded

message = """
show_outliers_iqr(series) -
return an array where index=0 is the lower bound and index=1 is the upper bound.

replace_outliers_iqr(df, cols) -
creates a copy of "df". replace the samples with attributes > upper_bound or < lower_bound with upper_bound or lower_bound.

remove_outliers_iqr(df, cols) -
creates a copy of "df". remove the samples with attributes > upper_bound or < lower_bound.

risky_show_shape(mode=3) -
print the shape of x_train, x_test, y_train, y_test
mode1 - unsplit
mode2 - split

run_model(model, x_train, y_train, x_test, y_test) -
runs the model and returns the prediction.

bin_decision(series, threshold=0.2) -
get a list of value that represent less than x% of the total.

binning(df, col, values) -
replace a list of values with 'others' for one hot encoding later on.
"""
print(message)


show_outliers_iqr(series) - 
return an array where index=0 is the lower bound and index=1 is the upper bound.

replace_outliers_iqr(df, cols) - 
creates a copy of "df". replace the samples with attributes > upper_bound or < lower_bound with upper_bound or lower_bound.

remove_outliers_iqr(df, cols) - 
creates a copy of "df". remove the samples with attributes > upper_bound or < lower_bound.

risky_show_shape(mode=3) - 
print the shape of x_train, x_test, y_train, y_test
mode1 - unsplit 
mode2 - split

run_model(model, x_train, y_train, x_test, y_test) -
runs the model and returns the prediction.

bin_decision(series, threshold=0.2) - 
get a list of value that represent less than x% of the total.

binning(df, col, values) -
replace a list of values with 'others' for one hot encoding later on.



In [14]:
# checking if there are any outliers
for col in train.columns:
    if train[col].dtype == 'float64':
        lower_bound, upper_bound = show_outliers_iqr(train[col])
        lower_outliers_count = (train[col] < lower_bound).sum()
        upper_outliers_count = (train[col] > upper_bound).sum()
        print(f"Column '{col}':")
        print(f"  Lower bound outlier percentage : {lower_outliers_count/len(train[col]):.2f}")
        print(f"  Upper bound outlier percentage : {upper_outliers_count/len(train[col]):.2f}")

Column 'dur':
  Lower bound outlier percentage : 0.00
  Upper bound outlier percentage : 0.09
Column 'rate':
  Lower bound outlier percentage : 0.00
  Upper bound outlier percentage : 0.10
Column 'sload':
  Lower bound outlier percentage : 0.00
  Upper bound outlier percentage : 0.08
Column 'dload':
  Lower bound outlier percentage : 0.00
  Upper bound outlier percentage : 0.22
Column 'sinpkt':
  Lower bound outlier percentage : 0.00
  Upper bound outlier percentage : 0.08
Column 'dinpkt':
  Lower bound outlier percentage : 0.00
  Upper bound outlier percentage : 0.08
Column 'sjit':
  Lower bound outlier percentage : 0.00
  Upper bound outlier percentage : 0.10
Column 'djit':
  Lower bound outlier percentage : 0.00
  Upper bound outlier percentage : 0.12
Column 'tcprtt':
  Lower bound outlier percentage : 0.00
  Upper bound outlier percentage : 0.09
Column 'synack':
  Lower bound outlier percentage : 0.00
  Upper bound outlier percentage : 0.18
Column 'ackdat':
  Lower bound outlier pe

In [15]:
# synack and dload has a large portion of outliers, will perform clipping rather than filtering
train = replace_outliers_iqr(train, ['synack', 'dload'])
test = replace_outliers_iqr(test, ['synack', 'dload'])

print("Outliers in 'synack' and 'dload' columns capped in both train and test dataframes.")
risky_show_shape(1)

Outliers in 'synack' and 'dload' columns capped in both train and test dataframes.
train/test
(175341, 45) (82332, 45)


In [16]:
numerical_cols_for_filtering = [col for col in train.select_dtypes(include=np.number).columns if col not in ['id', 'label', 'synack', 'dload']]
numerical_cols_for_filtering_test = [col for col in test.select_dtypes(include=np.number).columns if col not in ['id', 'label', 'synack', 'dload']]

train = remove_outliers_iqr(train, numerical_cols_for_filtering)
test = remove_outliers_iqr(test, numerical_cols_for_filtering_test)

print("Outliers filtered in both train and test dataframes.")
risky_show_shape(1)

Outliers filtered in both train and test dataframes.
train/test
(50055, 45) (30280, 45)


In [17]:
# import files
x_train, y_train = train.iloc[:, :-2].sample(frac=1, random_state=42), train.iloc[:, -1:].sample(frac=1, random_state=42)
x_test, y_test = test.iloc[:, :-2].sample(frac=1, random_state=42), test.iloc[:, -1:].sample(frac=1, random_state=42)
risky_show_shape(1)

train/test
(50055, 45) (30280, 45)


In [18]:
non_float_cols = x_train.select_dtypes(exclude=['float64', 'int64']).copy()
print(non_float_cols.dtypes)

proto      object
service    object
state      object
dtype: object


In [19]:
non_float_col_values = non_float_cols.columns
for col in non_float_col_values:
    print(f"Value counts for column '{col}':\n{non_float_cols[col].value_counts()}\n")

Value counts for column 'proto':
proto
udp      18161
tcp      12648
unas      9354
sctp       534
any        300
         ...  
scps        20
emcon       20
arp         17
igmp        14
icmp        11
Name: count, Length: 122, dtype: int64

Value counts for column 'service':
service
-           39836
dns          9171
http          432
ftp           257
ftp-data      231
snmp           54
smtp           37
ssl            15
irc             9
dhcp            9
radius          4
Name: count, dtype: int64

Value counts for column 'state':
state
INT    26101
FIN    12309
CON    11574
RST       60
ECO        9
no         1
URN        1
Name: count, dtype: int64



In [20]:
# Binning values

binned_x_train = x_train.copy()
binned_x_test = x_test.copy()

try:
    for col in non_float_col_values:
        print(col)
        # Ensure we only try to bin columns that are actually features and present in non_float_cols
        if col in non_float_cols.columns: # Added check to prevent KeyError if non_float_cols doesn't have it
            values = bin_decision(non_float_cols[col])
            binned_x_train = binning(binned_x_train, col, values)
            binned_x_test = binning(binned_x_test, col, values)
        else:
            print(f"Warning: Column '{col}' not found in non_float_cols for bin_decision. Skipping.")

    x_train = binned_x_train # Assign binned data back to global x_train
    x_test = binned_x_test   # Assign binned data back to global x_test

    risky_show_shape()
except Exception as e:
    print(e)
    print("Error in binning")

proto
service
state
x_train/test
(50055, 43) (30280, 43)
y_train/test
(50055, 1) (30280, 1)


In [21]:
# one hot encoding
encoded_x_train = binned_x_train.copy()
encoded_x_test = binned_x_test.copy()

for col in non_float_col_values:
    encoded_x_train = col_encoder(encoded_x_train, col)
    encoded_x_test = col_encoder(encoded_x_test, col)

risky_show_shape()

x_train/test
(50055, 43) (30280, 43)
y_train/test
(50055, 1) (30280, 1)


In [22]:
# init
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# from sklearn.preprocessing import OneHotEncoder

# ohe = OneHotEncoder(sparse_output=False)
# ohe_x_train = ohe.fit_transform(encoded_x_train)
# ohe_y_train = ohe.fit_transform(y_train)
# ohe_x_test = ohe.fit_transform(encoded_x_test)
# ohe_y_test = ohe.fit_transform(y_test)
hyperparameters = {}

In [1]:
# Random Forest classifier
param_grid = {
    'max_depth': [10,30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv=5)
grid_search.fit(encoded_x_train, y_train)
hyperparameters['Random_Forest'] = grid_search.best_estimator_

NameError: name 'GridSearchCV' is not defined

In [None]:
print(grid_search.best_params_)

Spark lib

In [23]:
# init
import pyspark as s;

# Task
Instantiate a Spark `RandomForestClassifier` model using `maxDepth=30`, `minInstancesPerNode=1`, and `subsamplingRate=1.0`, then print the instantiated model to verify its configuration.

## Train Spark RandomForestClassifier with specified parameters

### Subtask:
Instantiate `pyspark.ml.classification.RandomForestClassifier` with `maxDepth=30`, `minInstancesPerNode=1`, and `subsamplingRate=1.0` using the optimal parameters provided by the user. Note that `min_samples_split` from scikit-learn does not have a direct PySpark equivalent but is implicitly handled by other parameters.


**Reasoning**:
To instantiate the Spark RandomForestClassifier, I need to first import the required class from `pyspark.ml.classification`.



In [4]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql import SparkSession

# Initialize SparkSession if not already initialized
spark = SparkSession.builder.appName("RandomForestClassifier").getOrCreate()

spark_rf_classifier = RandomForestClassifier(maxDepth=30, minInstancesPerNode=1, subsamplingRate=1.0)
print("Spark RandomForestClassifier instantiated successfully.")

Spark RandomForestClassifier instantiated successfully.


In [24]:
x_train_spark = spark.createDataFrame(encoded_x_train)
y_train_spark = spark.createDataFrame(y_train).withColumnRenamed(y_train.columns[0], 'label')
x_test_spark = spark.createDataFrame(encoded_x_test)
y_test_spark = spark.createDataFrame(y_test).withColumnRenamed(y_test.columns[0], 'label')

print("Pandas DataFrames converted to Spark DataFrames and y columns renamed to 'label'.")
print(f"x_train_spark schema: {x_train_spark.printSchema()}")
print(f"y_train_spark schema: {y_train_spark.printSchema()}")
print(f"x_test_spark schema: {x_test_spark.printSchema()}")
print(f"y_test_spark schema: {y_test_spark.printSchema()}")

Pandas DataFrames converted to Spark DataFrames and y columns renamed to 'label'.
root
 |-- id: long (nullable = true)
 |-- dur: double (nullable = true)
 |-- spkts: long (nullable = true)
 |-- dpkts: long (nullable = true)
 |-- sbytes: long (nullable = true)
 |-- dbytes: long (nullable = true)
 |-- rate: double (nullable = true)
 |-- sttl: long (nullable = true)
 |-- dttl: long (nullable = true)
 |-- sload: double (nullable = true)
 |-- dload: double (nullable = true)
 |-- sloss: long (nullable = true)
 |-- dloss: long (nullable = true)
 |-- sinpkt: double (nullable = true)
 |-- dinpkt: double (nullable = true)
 |-- sjit: double (nullable = true)
 |-- djit: double (nullable = true)
 |-- swin: long (nullable = true)
 |-- stcpb: long (nullable = true)
 |-- dtcpb: long (nullable = true)
 |-- dwin: long (nullable = true)
 |-- tcprtt: double (nullable = true)
 |-- synack: double (nullable = true)
 |-- ackdat: double (nullable = true)
 |-- smean: long (nullable = true)
 |-- dmean: long (nul