<a href="https://colab.research.google.com/github/Saltizm/Data-mining-group-project/blob/main/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sklearn as sk;
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd;
import numpy as np;
import seaborn as sns;
import matplotlib.pyplot as plt;
import tqdm;
import os;
import sys;

In [None]:
# # # mount
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Work from whichever known project directory exists (local checkout first,
# then the Google Drive folder); otherwise stay in the current directory.
if os.path.isdir(r'C:\Users\User\Desktop\Data-mining-group-project'):
    os.chdir(r'C:\Users\User\Desktop\Data-mining-group-project') #local
elif os.path.isdir(r'/content/drive/MyDrive/data mining'):
    os.chdir(r'/content/drive/MyDrive/data mining') #google drive
try:
    # Hand pandas the path (not an already-open handle) so it opens and
    # closes the file itself; a handle from a bare open() was never closed.
    train = pd.read_csv('UNSW_NB15_training-set.csv', encoding='utf-8')
    test = pd.read_csv('UNSW_NB15_testing-set.csv', encoding='utf-8')
except FileNotFoundError as e:
    # Best-effort hint only: `train`/`test` stay undefined when a file is missing.
    print(f"error: {e}\nTry changing the training data directory in 'os.chdir'")

In [None]:
# Set parameters
# Share of upper-bound IQR outliers used to split columns in the outlier summary.
OUTLIER_THRESHOLD = 0.1
# Absolute-correlation cut-off passed to Pearson_feature_selection in the pipeline.
FEATURE_THRESHOLD = 0.5
# Relative-frequency cut-off passed to Threshold_binning in the pipeline.
BIN_THRESHOLD = 0.1

In [None]:
# Seed NumPy's global RNG for repeatability.
# NOTE(review): the later DataFrame.sample calls pass no random_state, so they
# presumably draw from this global state - confirm.
np.random.seed(42)

In [None]:
# if data too big (use for testing)
# Keep a random 10% of the rows of each frame.
# NOTE(review): this runs unconditionally, so every later cell works on the reduced frames.
train = train.sample(frac=0.1)
test = test.sample(frac=0.1)

In [None]:
# Report (rows, columns) of the training and testing frames.
print(train.shape, test.shape)
# 45 attributes

In [None]:
# Number of training rows that contain at least one missing value.
train.isna().any(axis="columns").sum()

In [None]:
# Number of testing rows that contain at least one missing value.
test.isna().any(axis="columns").sum()

In [None]:
# Shuffle the rows of both frames (frac=1 keeps every row); train is sampled
# first, then test.
train, test = (frame.sample(frac=1) for frame in (train, test))

# Features are all columns except the last two; targets are the last two columns.
# NOTE(review): y_train / y_test therefore hold two columns each - confirm that
# the estimators and metrics further down are meant to receive a two-column target.
x_train, x_test = train.iloc[:, :-2], test.iloc[:, :-2]
y_train, y_test = train.iloc[:, -2:], test.iloc[:, -2:]

In [None]:
# Show which columns ended up in the target frame.
y_train.columns

In [None]:
def show_outliers_iqr(series):
    """Return the IQR outlier fences of ``series`` as ``(lower, upper)``.

    The fences sit 1.5 interquartile ranges below the first quartile and
    above the third quartile.
    """
    first_quartile, third_quartile = series.quantile(0.25), series.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)
    return (first_quartile - whisker, third_quartile + whisker)

# data cleaning (replacement)
def replace_outliers_iqr(df, cols):
    """Clip the given columns to their IQR fences and return a new frame.

    Args:
        df: DataFrame to clean. It is not modified.
        cols: Column names to clip; names that are not in ``df`` are skipped.

    Returns:
        A copy of ``df`` in which values below/above the fences computed by
        ``show_outliers_iqr`` are replaced by the lower/upper fence.
    """
    # Work on a copy: assigning into the caller's frame would silently change
    # the data every other cell (and pipeline step) is looking at.
    df = df.copy()
    for col in cols:
        if col in df.columns:
            lower_bound, upper_bound = show_outliers_iqr(df[col])
            df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)
    return df

# data filtering
def remove_outliers_iqr(df, cols):
    """Return the rows of ``df`` that lie inside the IQR fences of every listed column.

    Column names that are not present in ``df`` are ignored. A row is kept
    only if, for each remaining column, its value is between that column's
    lower and upper fence (both inclusive).
    """
    keep_rows = pd.Series(True, index=df.index)

    for name in cols:
        if name not in df.columns:
            continue
        low, high = show_outliers_iqr(df[name])
        keep_rows = keep_rows & df[name].between(low, high)

    return df[keep_rows]

def risky_show_shape(*arg):
    """Print the ``.shape`` of the given objects.

    Two arguments are reported on one line as a train/test pair; any other
    number of arguments is reported one shape per line.
    """
    if len(arg) != 2:
        print("x_train/x_test/y_train/y_test")
        for frame in arg:
            print(frame.shape)
        return

    first, second = arg
    print("train/test")
    print(first.shape, second.shape)

def run_model(model, x_train, y_train, x_test, y_test):
    """Fit ``model`` on the training data and return its predictions for ``x_test``.

    ``y_test`` is accepted for call-site symmetry but is not used.
    """
    model.fit(x_train, y_train)
    return model.predict(x_test)

def plot_histograms(df, numerical_cols, frac=0.1):
    """Plot a grid of histograms (with KDE overlay) for a random sample of ``df``.

    Args:
        df: DataFrame holding the data to plot.
        numerical_cols: Column names to plot; names missing from ``df`` are skipped.
        frac: Fraction of the rows to sample before plotting.

    Returns:
        None. A single figure with four plots per row is shown. When none of
        the requested columns exists in ``df`` nothing is drawn.
    """
    # Only the requested columns that actually exist in the frame can be plotted.
    columns_to_plot = [col for col in numerical_cols if col in df.columns]
    if not columns_to_plot:
        # An empty grid would mean a zero-height figure and an undefined loop
        # index in the clean-up loop, so stop here instead of failing.
        print("plot_histograms: none of the requested columns are present; nothing to plot")
        return

    # sample() already returns a new frame, so no explicit copy is needed.
    df = df.sample(frac=frac)

    # Grid dimensions: four plots per row, as many rows as needed.
    num_plots = len(columns_to_plot)
    num_cols = 4
    num_rows = (num_plots + num_cols - 1) // num_cols # Equivalent to ceil(num_plots / num_cols)

    plt.figure(figsize=(20, num_rows * 5)) # Adjust figure size for better readability

    for position, col in enumerate(columns_to_plot, start=1):
        ax = plt.subplot(num_rows, num_cols, position)
        sns.histplot(df[col], kde=True, ax=ax)
        ax.set_title(f'Histogram of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    # Hide the unused cells when the last row is not full.
    for position in range(num_plots + 1, num_rows * num_cols + 1):
        plt.subplot(num_rows, num_cols, position).set_visible(False)

    plt.tight_layout()
    plt.show()

# def bin_decision(unique_series, threshold=0.2):
#     keep = []

#     val_count = unique_series.value_counts()
#     for col in val_count.index:
#         if val_count[col] / unique_series.shape[0] > threshold:
#             keep.append(col)
#     return keep

# def binning(df, col, values):
#     df = df.copy()

#     for val in values:
#         df[col].replace(val, 'Others', inplace=True)
#     return df

class Plot(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that draws histograms of the data it receives.

    Args:
        frac: Fraction of rows forwarded to ``plot_histograms`` for sampling.
    """

    def __init__(self, frac=0.1):
        self.frac = frac

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Plot every column of ``X`` and hand ``X`` back unchanged."""
        plot_histograms(df=X, numerical_cols=X.columns, frac=self.frac)
        return X

class Add_feature(BaseEstimator, TransformerMixin):
    """Transformer that adds the sum of two columns as ``'new_feature'``.

    Args:
        feature_1: Name of the first column.
        feature_2: Name of the second column.
    """

    def __init__(self, feature_1, feature_2):
        self.feature_1 = feature_1
        self.feature_2 = feature_2

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``'new_feature'`` column set to the sum."""
        total = X[self.feature_1] + X[self.feature_2]
        return X.assign(new_feature=total)

class Ratio_feature(BaseEstimator, TransformerMixin):
    """Transformer that adds the ratio of two columns as ``'new_feature'``.

    Zero values in the denominator column are treated as 1 so the division
    never produces ``inf``.

    Args:
        feature_1: Name of the numerator column.
        feature_2: Name of the denominator column.
    """

    def __init__(self, feature_1, feature_2):
        # Store the parameters untouched: feature_2 is a column *name*, so
        # comparing it with 0 (as an earlier version did) never guarded the
        # division and could silently swap column label 0 for label 1.
        self.feature_1 = feature_1
        self.feature_2 = feature_2

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``'new_feature'`` = feature_1 / feature_2."""
        X = X.copy()
        denominator = X[self.feature_2]
        # Replace zero denominators by 1 (missing values stay missing).
        denominator = denominator.mask(denominator == 0, 1)
        X['new_feature'] = X[self.feature_1] / denominator
        return X

class capping(BaseEstimator, TransformerMixin):
    """Pipeline wrapper around ``replace_outliers_iqr``.

    The IQR fences are computed from whatever frame is passed to
    ``transform``; nothing is learned in ``fit``.

    Args:
        cols: Names of the columns to clip.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Clip the configured columns of ``X`` to their IQR fences."""
        return replace_outliers_iqr(df=X, cols=self.cols)

class filtering(BaseEstimator, TransformerMixin):
    """Pipeline wrapper around ``remove_outliers_iqr``.

    The IQR fences are computed from whatever frame is passed to
    ``transform``; nothing is learned in ``fit``. The result can contain
    fewer rows than the input.

    Args:
        cols: Names of the columns checked for outliers.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Drop the rows of ``X`` that fall outside the fences of any configured column."""
        return remove_outliers_iqr(df=X, cols=self.cols)

class Threshold_binning(BaseEstimator, TransformerMixin):
    """Replace infrequent values of every column by a single placeholder label.

    Args:
        threshold: A value is kept only if its relative frequency in the
            fitted data is strictly greater than this.
        other_label: Replacement written in place of every other value.
    """

    def __init__(self, threshold=0.2, other_label="Others"):
        self.threshold = threshold
        self.other_label = other_label

    def _frequent_values(self, column):
        """Return the values of ``column`` whose share exceeds the threshold."""
        shares = column.value_counts(normalize=True)
        return shares[shares > self.threshold].index

    def fit(self, X, y=None):
        """Record, per column, which values are frequent enough to keep."""
        frame = pd.DataFrame(X).copy()
        self.keep_ = {name: self._frequent_values(frame[name]) for name in frame.columns}
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every non-kept value replaced by ``other_label``."""
        frame = pd.DataFrame(X).copy()

        for name in frame.columns:
            # Columns that were not seen during fit keep nothing.
            allowed = self.keep_.get(name, [])
            is_rare = ~frame[name].isin(allowed)
            frame[name] = frame[name].mask(is_rare, self.other_label)

        return frame

class Pearson_feature_selection(BaseEstimator, TransformerMixin):
    """Keep the features whose absolute Pearson correlation with the target exceeds a threshold.

    Args:
        threshold: Minimum absolute correlation (exclusive) a feature needs to be kept.
    """

    def __init__(self, threshold=0.5):
        self.threshold = threshold

    def fit(self, X, y):
        """Compute the per-feature correlation with ``y`` and store the keep-mask.

        Args:
            X: DataFrame or 2-D array of features.
            y: 1-D target (Series, array or list), or a single-column 2-D target.

        Raises:
            ValueError: If ``y`` has more than one column, or its length
                differs from the number of rows in ``X``.
        """
        X_df = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

        target = np.asarray(y)
        if target.ndim == 2 and target.shape[1] == 1:
            target = target.ravel()
        if target.ndim != 1:
            raise ValueError(
                "Pearson_feature_selection needs a single target column, "
                f"got y with shape {target.shape}"
            )

        # X and y are matched row by row, so put the target on X's index.
        # Relying on y's own index labels gives wrong (or all-NaN) results as
        # soon as X has been converted to an array or either side was re-indexed.
        y_series = pd.Series(target, index=X_df.index)
        correlations = X_df.corrwith(y_series).abs()

        # Keep features with correlation above threshold (NaN compares as False).
        self.keep_ = correlations > self.threshold
        return self

    def transform(self, X):
        """Return only the columns selected during ``fit``."""
        if isinstance(X, pd.DataFrame):
            return X.loc[:, self.keep_]
        else:
            # Handle numpy arrays
            return X[:, self.keep_.values]

# Plain-text cheat sheet for the helpers and transformers of this cell; it is
# printed so the summary appears in the cell output.
# NOTE: the bin_decision / binning entries describe helpers that are commented out above.
message = """
plot_histograms(frac=0.1) -
plot a sampled subset of the data.

show_outliers_iqr(series) -
return an array where index=0 is the lower bound and index=1 is the upper bound.

replace_outliers_iqr(df, cols) -
creates a copy of "df". replace the samples with attributes > upper_bound or < lower_bound with upper_bound or lower_bound.

remove_outliers_iqr(df, cols) -
creates a copy of "df". remove the samples with attributes > upper_bound or < lower_bound.

risky_show_shape(*data) -
in the format of train/test. Or x_train, x_test, y_train, y_test.

run_model(model, x_train, y_train, x_test, y_test) -
runs the model and returns the prediction.

bin_decision(unique_series, threshold=0.2) -
receives a series df['col'].
get a list of value that represent less than x% of the total.

binning(df, col, values) -
replace a list of values with 'others' for one hot encoding later on.

---------------------------------------------------------------------------------------------------------------------------

User defined Transformer;

Plot(BaseEstimator, TransformerMixin) -
wrapper function for plot_histograms

Add_feature(BaseEstimator, TransformerMixin) -
Transformer that adds two numerical features to create a new secondary feature.

Ratio_feature(BaseEstimator, TransformerMixin) -
Transformer that divide two numerical features to create a new secondary feature.
if denominator value=0. reassign it to 1.

Capping/Filtering(BaseEstimator, TransformerMixin) -
wrapper function for replace_outliers_iqr/remove_outliers_iqr

Threshold_binning(BaseEstimator, TransformerMixin) -
bins values below a threshold into 'others'. Replacement for defunct binning and bin decision function rip

Pearson_feature_selection(BaseEstimator, TransformerMixin) -
select features based on corr coe
"""
# Show the cheat sheet.
print(message)

In [None]:
# Column names of x_train split by dtype: numeric vs. everything else.
numerical_cols = x_train.select_dtypes(include="number").columns
non_numerical_cols = x_train.select_dtypes(exclude="number").columns

In [None]:
# Sort the feature columns into three groups by their number of distinct values:
# continuous, low cardinality and high cardinality.

low_cardinality_cols = []
high_cardinality_cols = []
continuous_cols = []

# A column is "low cardinality" when it has at most this many distinct values.
CARDINALITY_THRESHOLD = 50

# Numerical columns: few distinct values -> low cardinality, otherwise continuous.
for col in numerical_cols:
    if col not in ('id', 'label'):
        is_low = x_train[col].nunique() <= CARDINALITY_THRESHOLD
        (low_cardinality_cols if is_low else continuous_cols).append(col)

# Non-numerical columns: many distinct values -> high cardinality, otherwise
# they join the low-cardinality group (after the numerical ones).
for col in non_numerical_cols:
    if col not in ('id', 'label'):
        is_high = x_train[col].nunique() > CARDINALITY_THRESHOLD
        (high_cardinality_cols if is_high else low_cardinality_cols).append(col)

In [None]:
# Report the size and members of each column group built in the previous cell.
print(f"Continuous Columns ({len(continuous_cols)}): {continuous_cols}")
print(f"Low Cardinality Columns ({len(low_cardinality_cols)}): {low_cardinality_cols}")
print(f"High Cardinality (Non-numerical) Columns ({len(high_cardinality_cols)}): {high_cardinality_cols}")

### Histograms for Continuous Columns

In [None]:
# Histograms of the continuous columns (on the default row sample).
plot_histograms(df=x_train, numerical_cols=continuous_cols)

### Histograms for Low Cardinality Columns

In [None]:
# Histograms of the low-cardinality columns (on the default row sample).
plot_histograms(df=x_train, numerical_cols=low_cardinality_cols)

### Histograms for High Cardinality Columns

In [None]:
# Histograms of the high-cardinality columns (on the default row sample).
# NOTE(review): this group only ever receives non-numerical columns - confirm
# that a histogram with KDE overlay is meaningful for them.
plot_histograms(df=x_train, numerical_cols=high_cardinality_cols)

In [None]:
# checking if there are any outliers
# For every continuous column, record the share of rows below the lower IQR
# fence, above the upper IQR fence, and inside the fences.
outlier_data = []

for col in continuous_cols:
    values = x_train[col]
    total = len(values)
    lower_bound, upper_bound = show_outliers_iqr(values)
    lower_outliers_count = (values < lower_bound).sum()
    upper_outliers_count = (values > upper_bound).sum()
    non_outliers_count = total - (lower_outliers_count + upper_outliers_count)

    # Store the shares as numbers (rounded to two decimals for display).
    # Formatting them as strings made the threshold checks below compare
    # text lexicographically, which only works by accident for some thresholds.
    outlier_data.append({
        'Column': col,
        'Lower Bound Outlier %': round(lower_outliers_count / total, 2),
        'Upper Bound Outlier %': round(upper_outliers_count / total, 2),
        'Non-Outlier %': round(non_outliers_count / total, 2)
    })

# significant outliers
# Explicit column names keep the frame usable even when no column was analysed.
outlier_df = pd.DataFrame(
    outlier_data,
    columns=['Column', 'Lower Bound Outlier %', 'Upper Bound Outlier %', 'Non-Outlier %'],
)
upper_share = outlier_df['Upper Bound Outlier %']
# Columns at/above the threshold vs. below it; ">=" reproduces how the former
# string comparison classified a share of exactly 0.10 against the 0.1 threshold.
outlier_df_high = outlier_df[upper_share >= OUTLIER_THRESHOLD]
outlier_df_low = outlier_df[upper_share < OUTLIER_THRESHOLD]
# Columns whose upper-outlier share rounds to zero.
non_outlier_df = outlier_df[upper_share == 0]
# outlier_df

In [None]:
# Display the per-column outlier summary.
outlier_df

In [None]:
# Report the (rows, columns) shape of each subset of the outlier summary.
print('non_outliers/ outlier_high /outlier_low')
print(non_outlier_df.shape, outlier_df_high.shape, outlier_df_low.shape)

In [None]:
# Pairwise correlations between the numerical columns of the training frame.
corr_matrix = train.select_dtypes(include=[np.number]).corr()

# Correlation of every feature with 'label', leaving out 'label' itself and 'id'.
label_correlations = corr_matrix['label'].drop(['label', 'id'])

# The five features with the largest absolute correlation, and the top value.
top5_features = label_correlations.abs().nlargest(5)
highest_corr_value = top5_features.max()
print(f"Feature with highest absolute correlation to 'label': \n{top5_features}")
print(f"Highest correlation coefficient: {highest_corr_value:.4f}")

# Annotated heatmap of the full correlation matrix.
plt.figure(figsize=(28, 18))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix Heatmap')
plt.show()

In [None]:
from itertools import combinations

# Every unordered pair of the five features most correlated with 'label'.
pairs = combinations(top5_features.index, 2)

# Work on a private copy of the numerical columns of the training frame.
temp = train.select_dtypes(include=[np.number]).copy()

for feature1, feature2 in pairs:
    add_trans = Add_feature(feature1, feature2)
    ratio_trans = Ratio_feature(feature1, feature2)

    # Both transformers write their result to 'new_feature', so copy that
    # column into temp under a name that is unique for the pair.
    add_feature_name = f'{feature1}_{feature2}_add'
    temp[add_feature_name] = add_trans.fit_transform(temp.copy())['new_feature']

    ratio_feature_name = f'{feature1}_{feature2}_ratio'
    temp[ratio_feature_name] = ratio_trans.fit_transform(temp.copy())['new_feature']

# Drop original columns so the matrix below focuses on the engineered features.
# NOTE(review): this drops every entry of numerical_cols except the last one
# (names missing from temp are ignored), not just the five source features -
# confirm which columns are meant to stay.
temp = temp.drop(columns=numerical_cols[:-1], errors='ignore')

# Correlation matrix of what is left.
temp_corr_matrix = temp.corr()

# Five remaining columns most correlated (in absolute value) with 'label'.
label_correlations = temp_corr_matrix['label'].drop('label')
top5_features = label_correlations.abs().nlargest(5)
highest_corr_value = top5_features.max()
print(f"Feature with highest absolute correlation to 'label': \n{top5_features}")
print(f"Highest correlation coefficient: {highest_corr_value:.4f}")

# Annotated heatmap of the reduced correlation matrix.
plt.figure(figsize=(28, 18))
sns.heatmap(temp_corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix Heatmap')
plt.show()

In [None]:
# FunctionTransformer and OrdinalEncoder are used below but are not among the
# names imported in the first cell; without this import the cell stops with a
# NameError.
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder

# highest correlation feature is sttl_ct_state_ttl_add (ttl_sum)
# Column-level steps: add sttl + ct_state_ttl as 'new_feature', drop the two
# source columns, then keep the features correlated with the target.
column_modifier = Pipeline([
    ("ttl_sum", Add_feature('sttl', 'ct_state_ttl')),
    ("drop_original", ColumnTransformer(
        transformers=[
            ("drop_columns", "drop", ["sttl", "ct_state_ttl"])
        ], remainder="passthrough"
    )),
    ("pearson_selection", Pearson_feature_selection(threshold=FEATURE_THRESHOLD))
])

# forming pipeline
# NOTE(review): "filtering" can return fewer rows than it receives while the
# targets passed to fit keep their original length - confirm X and y stay aligned.
# NOTE(review): np.log1p is applied to the whole frame - confirm that no
# non-numerical column reaches that step.
preprocessing = Pipeline([
    ("capping", capping(high_cardinality_cols)),
    ("filtering", filtering(high_cardinality_cols)),
    ("logrithmic_transform", FunctionTransformer(np.log1p)),
    ("bin_rare", Threshold_binning(threshold=BIN_THRESHOLD)),
    ("modify_columns", column_modifier),
    ("ordinal", OrdinalEncoder(
        handle_unknown="use_encoded_value",
        unknown_value=-1
    ))
])

# run pipeline
# Fit on the training features only, then apply the fitted steps to the test features.
encoded_x_train = preprocessing.fit_transform(x_train, y_train)
encoded_x_test = preprocessing.transform(x_test)

In [None]:
# !pip install xgboost #uncomment if error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

# Shuffled 5-fold stratified cross-validation with a fixed seed, shared by every grid search.
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Filled by the model cells: model name -> best estimator found by its grid search.
hyperparameters = {}

In [None]:
# Random Forest classifier
# Exhaustive search over tree depth, split/leaf sizes and bootstrapping.
param_grid = dict(
    max_depth=[10, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2],
    bootstrap=[True, False],
)

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=cv_strategy,
)
grid_search.fit(encoded_x_train, y_train)

# Keep the best estimator found by the search for the evaluation cells.
hyperparameters['Random_Forest'] = grid_search.best_estimator_

In [None]:
# Parameter combination selected by the grid search stored in `grid_search`.
print(grid_search.best_params_)

In [None]:
# Logistic Regression classifier
# Exhaustive search over regularisation strength, solver and iteration budget.
param_grid_lr = dict(
    C=[0.01, 0.1, 1, 10, 100],
    solver=['liblinear', 'lbfgs'],
    max_iter=[100, 200],
)

grid_search_lr = GridSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_grid=param_grid_lr,
    cv=cv_strategy,
)
grid_search_lr.fit(encoded_x_train, y_train)

# Keep the best estimator found by the search for the evaluation cells.
hyperparameters['Logistic_Regression'] = grid_search_lr.best_estimator_

print("Logistic Regression best parameters:")
print(grid_search_lr.best_params_)

In [None]:
# XGBoost Classifier
# Exhaustive search over ensemble size, depth, learning rate and row/column subsampling.
param_grid_xgb = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5],
    learning_rate=[0.01, 0.1],
    subsample=[0.7, 1.0],
    colsample_bytree=[0.7, 1.0],
)

grid_search_xgb = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid=param_grid_xgb,
    cv=cv_strategy,
    verbose=1,
    n_jobs=-1,
)
grid_search_xgb.fit(encoded_x_train, y_train)

# Keep the best estimator found by the search for the evaluation cells.
hyperparameters['XGBoost'] = grid_search_xgb.best_estimator_

print("XGBoost best parameters:")
print(grid_search_xgb.best_params_)

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# One confusion-matrix heatmap per tuned model, computed on the encoded test set.
for model_name, model_estimator in hyperparameters.items():
    print(f"Generating Confusion Matrix for {model_name}")

    y_pred = model_estimator.predict(encoded_x_test)
    cm = confusion_matrix(y_test, y_pred)

    # Integer-annotated heatmap of the counts.
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix for {model_name}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

In [None]:
# Score every tuned model on the encoded test set and collect one row per model.
results = []

for model_name, model_estimator in hyperparameters.items():
    print(f"Evaluating {model_name}...")

    y_pred = model_estimator.predict(encoded_x_test)

    # Precision, recall and F1 are support-weighted averages over the classes;
    # undefined ratios count as 0.
    results.append({
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
        'Recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
        'F1-Score': f1_score(y_test, y_pred, average='weighted', zero_division=0),
    })

# Side-by-side comparison table.
results_df = pd.DataFrame(results)
print("\nModel Performance Comparison:")
print(results_df.to_string(index=False))
