<a href="https://colab.research.google.com/github/Raaghashree/Pattern-Based-Insider-Threat-Detection-Using-Machine-Learning/blob/main/XGBoost_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive, passing force_remount=True.
# The google.colab import means this cell can only run inside Colab.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import ipaddress
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from imblearn.over_sampling import SMOTE

# Load dataset
# The path lives in one variable so the error message can never name a
# different file than the one that is actually read.
DATASET_PATH = "unified_logs_dataset.csv"
try:
    df = pd.read_csv(DATASET_PATH)
    print(f"Dataset loaded successfully with {df.shape[0]} rows and {df.shape[1]} columns")
except FileNotFoundError:
    print(f"Error: '{DATASET_PATH}' file not found. Please check the file path.")
    # Re-raise instead of calling exit(1): the exception reliably stops the
    # cell here, whereas continuing would only fail later because `df` is unset.
    raise
# Step 1: Clean Column Names
df.columns = df.columns.str.strip()

# Step 2: Handle Missing Values
missing_percentage = df.isnull().mean() * 100
print(f"Columns with missing values: {missing_percentage[missing_percentage > 0].to_dict()}")

# Drop columns with >50% missing values
df = df.dropna(thresh=len(df) * 0.5, axis=1)

# Rows without a target value are removed rather than imputed: filling 'Label'
# with its mode (which the categorical fill below would otherwise do) invents
# labels for rows whose class is unknown.
if 'Label' in df.columns:
    df = df.dropna(subset=['Label'])

# Work on an explicit copy so the column assignments below operate on an
# independent frame rather than on the result of a filtering call.
df = df.copy()

# Fill numeric columns with mean, non-numeric with mode
numeric_cols = df.select_dtypes(include=['number']).columns
df[numeric_cols] = df[numeric_cols].apply(lambda col: col.fillna(col.mean()) if col.isnull().sum() > 0 else col)

categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].apply(lambda col: col.fillna(col.mode()[0]) if col.isnull().sum() > 0 else col)

# Step 3: Remove Duplicate Rows
duplicates = df.duplicated().sum()
print(f"Found {duplicates} duplicate rows")
# Reset the index after dropping rows so later positional/label alignment
# (e.g. attaching labels to a freshly built frame) cannot hit index gaps.
df = df.drop_duplicates().reset_index(drop=True)

# Step 4: Handle Non-Numeric Columns
# Identify non-numeric columns
non_numeric_cols = df.select_dtypes(include=['object']).columns.tolist()
print("Original non-numeric columns:", non_numeric_cols)

# Specify columns to keep among non-numeric: 'Label', 'Source_IP', 'Destination_IP', 'Protocol_Type', 'System_Patch_Status'
keep_columns = ['Label', 'Source_IP', 'Destination_IP', 'Protocol_Type', 'System_Patch_Status']
# Drop all non-numeric columns that are not in keep_columns
cols_to_drop = [col for col in non_numeric_cols if col not in keep_columns]
print(f"Non-numeric columns that will be dropped: {cols_to_drop}")
df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

# Convert IP addresses into numeric form (one '<column>_int' feature per IP column present)
for ip_col in ('Source_IP', 'Destination_IP'):
    if ip_col in df.columns:
        df[f'{ip_col}_int'] = df[ip_col].apply(lambda ip: int(ipaddress.IPv4Address(ip)))
# Drop original IP columns. errors='ignore' keeps this consistent with the
# guarded conversion above: a dataset without IP columns must not raise KeyError.
df.drop(columns=['Source_IP', 'Destination_IP'], inplace=True, errors='ignore')

# Encode 'Protocol_Type' using LabelEncoder
if 'Protocol_Type' in df.columns:
    proto_encoder = LabelEncoder()
    df['Protocol_Type'] = proto_encoder.fit_transform(df['Protocol_Type'])
    print("Encoded 'Protocol_Type'.")

# ✅ Encode 'System_Patch_Status' using LabelEncoder
if 'System_Patch_Status' in df.columns:
    print(f"Encoding 'System_Patch_Status'. Original values: {df['System_Patch_Status'].unique()}")
    patch_encoder = LabelEncoder()
    df['System_Patch_Status'] = patch_encoder.fit_transform(df['System_Patch_Status'])
    print(f"Encoded values: {df['System_Patch_Status'].unique()}")

# Step 5: Encode Target Variable
if 'Label' not in df.columns:
    print("Error: Target column 'Label' not found in dataset")
    # Raise instead of exit(1) so the cell stops at this point with a clear
    # error rather than relying on interpreter-exit behaviour inside a notebook.
    raise KeyError("Target column 'Label' not found in dataset")

# Only string-typed targets need encoding; numeric targets are used as they are.
if df['Label'].dtype == 'object':
    print(f"Encoding target variable. Original values: {df['Label'].unique()}")
    encoder = LabelEncoder()
    df['Label'] = encoder.fit_transform(df['Label'])
    print(f"Encoded values: {df['Label'].unique()}")

# Step 6: Ensure All Features Are Numeric
X = df.drop(columns=['Label'])
y = df['Label']
# 💡 Function to clean inf values in a specific column
def clean_feature_column(X, col='Normalized_Packet_Flow'):
    """Replace +/-inf in one column with the mean of that column's non-infinite values.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. It is modified in place and also returned.
    col : str, default 'Normalized_Packet_Flow'
        Column to clean. If the column is absent, X is returned untouched.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    If the column holds no non-infinite value the mean is NaN, so the
    infinities are replaced by NaN.
    """
    if col in X.columns:
        # Mean is taken over the entries that are not +/-inf.
        mean_val = X[col][~np.isinf(X[col])].mean()
        X[col] = X[col].replace([np.inf, -np.inf], mean_val)
    return X

# ✅ Apply the fix for 'inf' values
X = clean_feature_column(X)

# Step 7: Handle Class Imbalance using SMOTE
class_counts = y.value_counts()
print(f"Class distribution before SMOTE: {class_counts.to_dict()}")
imbalance_ratio = class_counts.min() / class_counts.max()
print(f"Class imbalance ratio: {imbalance_ratio:.2f}")

# SMOTE is only applied when the minority class is less than half the majority.
if imbalance_ratio < 0.5:
    print("Applying SMOTE to balance classes...")
    smote = SMOTE(sampling_strategy=1.0, random_state=42)  # Fully balance classes
    X_resampled, y_resampled = smote.fit_resample(X, y)
    print(f"Data shape after SMOTE: {X_resampled.shape}")
else:
    print("Class balance is acceptable, skipping SMOTE")
    X_resampled, y_resampled = X, y

# Convert back to DataFrame
df_balanced = pd.DataFrame(X_resampled, columns=X.columns)
df_balanced['Label'] = y_resampled

# Check new class distribution
print(f"Class distribution after SMOTE: {df_balanced['Label'].value_counts().to_dict()}")

# Step 8: Normalize Features using MinMaxScaler
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(df_balanced.drop(columns=['Label']))
df_processed = pd.DataFrame(X_normalized, columns=X.columns)
# Attach labels by position, not by index. df_processed has a fresh 0..n-1
# index, while df_balanced keeps the original row labels when SMOTE is skipped
# (with gaps if duplicate rows were dropped); index alignment would then
# mis-assign labels or insert NaN.
df_processed['Label'] = df_balanced['Label'].to_numpy()

# Save the cleaned and preprocessed dataset
df_processed.to_csv("Processed_CTDAPD_Dataset.csv", index=False)
print("\nData Preprocessing Complete! Cleaned dataset saved as 'Processed_CTDAPD_Dataset.csv'.")
print(f"Final dataset shape: {df_processed.shape}")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(thresh=df.shape[0] * 0.5, axis=1, inplace=True)




AssertionError: Non-numeric data found in columns: ['log_type', 'access_label_warning_access', 'is_weekend_True', 'is_night_True', 'is_off_hours_True', 'action_create', 'action_delete', 'action_deployment_trigger', 'action_exec', 'action_failed_login', 'action_file_access', 'action_file_modify', 'action_login', 'action_logout', 'action_scale', 'action_sudo_command']

New: revised preprocessing pipeline for `unified_logs_dataset.csv`

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
import ipaddress
from sklearn.preprocessing import LabelEncoder

# Helper functions
def safe_ip_to_int(ip):
    """Convert an IPv4 address to its integer value, or 0 if it cannot be parsed.

    Parameters
    ----------
    ip : object
        Value handed to ``ipaddress.IPv4Address`` (typically a dotted-quad string).

    Returns
    -------
    int
        Integer form of the address, or 0 for anything ``IPv4Address`` rejects
        (malformed strings, None, NaN, ...). Note that 0 is also the value of
        the valid address ``0.0.0.0``.
    """
    try:
        return int(ipaddress.IPv4Address(ip))
    except (ValueError, TypeError):
        # ipaddress.AddressValueError is a ValueError subclass. Catching only
        # these lets KeyboardInterrupt/SystemExit propagate, which a bare
        # `except:` would have swallowed.
        return 0

def hash_encode(x, modulo=10000):
    """Map a value to a stable integer bucket in ``[0, modulo)``.

    Python's built-in ``hash()`` is salted per interpreter process for strings,
    so it yields different buckets after every kernel restart. A CRC32 of the
    value's UTF-8 text is used instead, which gives the same bucket in every
    session.

    Parameters
    ----------
    x : object
        Value to encode; it is converted with ``str(x)`` before hashing.
    modulo : int, default 10000
        Number of buckets.

    Returns
    -------
    int
        Bucket index in ``[0, modulo)``.
    """
    import zlib  # local import: keeps the helper self-contained

    return zlib.crc32(str(x).encode("utf-8")) % modulo

# Load dataset
df = pd.read_csv("unified_logs_dataset.csv")

# Step 1: Clean column names
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# Step 2: Drop duplicate rows
df = df.drop_duplicates()

# Step 3: Drop columns with >50% missing values
# Assign the result instead of using inplace=True: mutating the frame returned
# by drop_duplicates() in place is what raised the SettingWithCopyWarning.
df = df.dropna(thresh=df.shape[0] * 0.5, axis=1)

# Step 4: Handle missing values
# Rows without a target are dropped rather than imputed.
if 'suspicious' in df.columns:
    df = df.dropna(subset=['suspicious'])

# Take an explicit copy so the column assignments below (and in later steps)
# act on an independent frame.
df = df.copy()

numeric_cols = df.select_dtypes(include=['number']).columns
df[numeric_cols] = df[numeric_cols].apply(lambda col: col.fillna(col.mean()))

categorical_cols = df.select_dtypes(include=['object']).columns
# Fill with the most frequent value; a column with no mode falls back to "missing".
df[categorical_cols] = df[categorical_cols].apply(
    lambda col: col.fillna(col.mode().iloc[0]) if not col.mode().empty else col.fillna("missing")
)

# Step 5: Encode IP address
if 'ip' in df.columns:
    df['ip_int'] = df['ip'].apply(safe_ip_to_int)
    df.drop(columns=['ip'], inplace=True)

# Step 6: Timestamp Feature Engineering
if 'timestamp' in df.columns:
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    df['hour'] = df['timestamp'].dt.hour
    df['dayofweek'] = df['timestamp'].dt.dayofweek
    # The flags are stored as 0/1 integers. As booleans they fail the later
    # np.issubdtype(..., np.number) check and are reported as non-numeric.
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)
    df['is_night'] = df['hour'].between(0, 6).astype(int)
    df['is_off_hours'] = (~df['hour'].between(8, 18)).astype(int)

    def label_access(row):
        """Return 'warning_access' if any of the row's time flags is set, else 'normal_access'."""
        if row['is_weekend'] or row['is_night'] or row['is_off_hours']:
            return 'warning_access'
        return 'normal_access'

    df['access_label'] = df.apply(label_access, axis=1)
    df.drop(columns=['timestamp'], inplace=True)

# Step 7: Encode categorical features
high_card_cols = ['user', 'resource', 'container']
low_card_cols = ['action', 'protocol', 'pipeline']

# One fitted encoder is kept per column so the integer codes can be mapped
# back to the original categories later. Re-fitting a single shared encoder
# would discard every mapping except the last one.
label_encoders = {}

# Label Encoding for low-cardinality categorical features
for col in low_card_cols:
    if col in df.columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

# Hash Encoding for high-cardinality categorical features
for col in high_card_cols:
    if col in df.columns:
        df[col + '_hash'] = df[col].apply(hash_encode)
        df.drop(columns=[col], inplace=True)

# Step 8: Encode 'log_type' and 'access_label' as necessary
for col in ['log_type', 'access_label']:
    if col in df.columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

# Step 9: Encode 'suspicious' if needed
if 'suspicious' in df.columns and df['suspicious'].dtype != 'int64':
    target = df['suspicious']
    if pd.api.types.is_numeric_dtype(target):
        # bool / float targets: a direct cast keeps False/0 -> 0 and True/1 -> 1.
        df['suspicious'] = target.astype('int64')
    else:
        # sort=True numbers the classes by sorted value, not by first
        # appearance, so the meaning of 0/1 does not depend on row order.
        df['suspicious'] = pd.factorize(target, sort=True)[0]

# Step 10: Drop irrelevant columns
df.drop(columns=[col for col in ['session_id'] if col in df.columns], inplace=True)

# Step 11: Ensure all features are numeric
def _to_numeric_if_possible(col):
    """Return `col` as a numeric Series when it converts cleanly, otherwise unchanged.

    Replaces pd.to_numeric(..., errors='ignore'), which emits a FutureWarning.
    Boolean columns are cast to 0/1 integers so they count as numeric.
    """
    if pd.api.types.is_bool_dtype(col):
        return col.astype(int)
    try:
        return pd.to_numeric(col)
    except (ValueError, TypeError):
        return col

df = df.apply(_to_numeric_if_possible)

# Check for non-numeric columns
non_numeric_cols = [col for col in df.columns if not pd.api.types.is_numeric_dtype(df[col])]
if non_numeric_cols:
    print(f"Non-numeric columns found: {non_numeric_cols}")
else:
    print("All features are numeric.")

# Step 12: Separate features and target
# The target is pulled out first, then every remaining column becomes a feature.
y = df["suspicious"].astype(int)
X = df.drop(columns=["suspicious"])

# Step 13: Handle class imbalance using SMOTE
# NOTE(review): oversampling (and the scaling below) run on the full dataset,
# before any train/test split is made downstream -- confirm this is intended.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Step 14: Normalize features using MinMaxScaler
scaler = MinMaxScaler().fit(X_resampled)
X_scaled = scaler.transform(X_resampled)

# Step 15: Save cleaned dataset
# Rebuild a labelled frame from the scaled array and write it without the index.
cleaned_df = pd.DataFrame(X_scaled, columns=X.columns).assign(suspicious=y_resampled)
cleaned_df.to_csv("preprocessed_dataset.csv", index=False)

print("✅ Full preprocessing complete. Output saved to 'preprocessed_dataset.csv'.")


  df = df.apply(pd.to_numeric, errors='ignore')


Non-numeric columns found: ['is_weekend', 'is_night', 'is_off_hours']
✅ Full preprocessing complete. Output saved to 'preprocessed_dataset.csv'.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Step 1: Load the dataset
# Read the file the preprocessing cell actually writes ('preprocessed_dataset.csv').
# The previously referenced 'unified_logs_dataset_preprocessed.csv' still had
# object columns (session_id, log_type), which XGBoost rejects during training.
df = pd.read_csv("preprocessed_dataset.csv")

# Step 2: Rename or assign to df_processed if needed
df_processed = df.copy()

# Step 3: Separate features and label
X = df_processed.drop(columns=['suspicious'])  # 'suspicious' is your label column
y = df_processed['suspicious']

# Step 4: Split the data
# 80/20 split, stratified on the label so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set shape: {X_train.shape}, Training labels: {y_train.shape}")
print(f"Testing set shape: {X_test.shape}, Testing labels: {y_test.shape}")


Training set shape: (23995, 22), Training labels: (23995,)
Testing set shape: (5999, 22), Testing labels: (5999,)


In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

# Step 1: Feature & Label separation
X = df_processed.drop(columns=['suspicious'])  # <-- correct column name
y = df_processed['suspicious']

# Step 2: Train-Test Split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 3: Initialize and Train XGBoost Classifier
# `use_label_encoder` is not passed: current XGBoost releases ignore it and
# only emit a "Parameters ... are not used" warning.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)
xgb_model.fit(X_train, y_train)

# Step 4: Predict
y_pred = xgb_model.predict(X_test)

# Step 5: Evaluation
print("\n✅ XGBoost Model Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Step 6: Save model
# A single path variable feeds both the dump and the message, so the printed
# file name always matches the file that was written.
model_path = "xgboost_unified_logs_dataset_preprocessed.pkl"
joblib.dump(xgb_model, model_path)
print(f"\n🚀 Trained XGBoost model saved as '{model_path}'")

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:session_id: object, log_type: object

In [None]:
import joblib
# The bare `...` below is an Ellipsis expression; it has no effect when executed.
...
# Step 6: Save model
# NOTE(review): `xgb_model` is not created in this cell -- it must already
# exist in the kernel from an earlier training cell.
joblib.dump(xgb_model, "xgboost_cyber_model.pkl")
print("\n🚀 Trained XGBoost model saved as 'xgboost_cyber_model.pkl'")



🚀 Trained XGBoost model saved as 'xgboost_cyber_model.pkl'


In [None]:
# Hand the saved model file to google.colab's files.download (Colab-only helper).
from google.colab import files
files.download("xgboost_cyber_model.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

New: XGBoost pipeline for the insider-threat detection dataset


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def prepare_for_xgboost(df):
    """Turn the raw log frame into an all-numeric frame plus the fitted encoders.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'timestamp' and the categorical columns 'user_id',
        'k8s_action', 'auth_event', 'cicd_action', 'time_of_day' and
        'ip_category'. 'username' and 'source_ip' are dropped when present.
        The input frame is not modified.

    Returns
    -------
    tuple[pandas.DataFrame, dict]
        The processed frame and a dict mapping each encoded column name to its
        fitted ``LabelEncoder``.

    Raises
    ------
    ValueError or TypeError
        From ``pd.to_numeric`` if a remaining column cannot be converted.
    """
    df = df.copy()

    # Step 1: Drop columns not useful for model training.
    # errors='ignore' lets the function accept frames that lack these columns.
    df = df.drop(columns=['username', 'source_ip'], errors='ignore')

    # Step 2: Extract datetime features from 'timestamp'
    # Unparseable timestamps become NaT, so 'hour'/'day_of_week' are NaN for them.
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
    df['hour'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df = df.drop(columns=['timestamp'])

    # Step 3: Fill NA and encode categorical features
    cat_cols = ['user_id', 'k8s_action', 'auth_event', 'cicd_action', 'time_of_day', 'ip_category']
    le_dict = {}

    for col in cat_cols:
        filled = df[col].fillna('none')
        # LabelEncoder cannot sort a column that mixes numbers with the 'none'
        # filler string; only in that mixed case are the values cast to str.
        if filled.map(type).nunique() > 1:
            filled = filled.astype(str)
        le = LabelEncoder()
        df[col] = le.fit_transform(filled)
        le_dict[col] = le  # Save encoders for future decoding if needed

    # Step 4: Ensure all columns are numeric
    df = df.apply(pd.to_numeric)

    return df, le_dict


In [None]:
# Read the insider-threat dataset from the mounted Drive folder.
df = pd.read_csv('/content/drive/My Drive/Dataset/FOR MODELS/insider_threat_detection_dataset.csv')

# prepare_for_xgboost is unpacked here as (processed frame, encoders).
processed_df, encoders = prepare_for_xgboost(df)

# Split for XGBoost training: 'label' is the target, all other columns are features.
y = processed_df['label']
X = processed_df.drop('label', axis=1)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# ------------------------
# Step 1: Load the dataset
# ------------------------
df = pd.read_csv('/content/drive/My Drive/Dataset/FOR MODELS/insider_threat_detection_dataset.csv')

# ------------------------
# Step 2: Preprocess it
# ------------------------
# (A superseded copy of prepare_for_xgboost that had been parked here inside a
# triple-quoted string was removed; the string was never executed as code.)
import numpy as np
from scipy.stats import zscore

def prepare_for_xgboost(df, label_col='label'):
    """Build an all-numeric feature frame from the raw log frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'timestamp', 'user_id' and the categorical columns
        'k8s_action', 'auth_event', 'cicd_action', 'time_of_day', 'ip_category'.
        'username' and 'source_ip' are dropped when present. The input frame
        is not modified.
    label_col : str, default 'label'
        Name of the target column. It is passed through unchanged and is
        excluded from the z-score features so no target-derived feature is added.

    Returns
    -------
    pandas.DataFrame
        Frame with the engineered features; values that cannot be made numeric,
        and any remaining NaN, end up as 0.

    Notes
    -----
    Rows whose timestamp is missing or unparseable get -1 for 'hour',
    'day_of_week' and 'hour_day_interaction', and 0 for 'is_night' and
    'is_weekend'.
    """
    df = df.copy()

    # Drop irrelevant columns; errors='ignore' tolerates frames that lack them.
    df = df.drop(columns=['username', 'source_ip'], errors='ignore')

    # Handle missing timestamps and convert
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
    has_timestamp = df['timestamp'].notna()
    df['hour'] = df['timestamp'].dt.hour.fillna(-1)
    df['day_of_week'] = df['timestamp'].dt.dayofweek.fillna(-1)
    df = df.drop(columns=['timestamp'])

    # Encode user_id by frequency
    df['user_id_freq'] = df['user_id'].map(df['user_id'].value_counts())
    df = df.drop(columns=['user_id'])

    # One-hot encode low-cardinality categorical features
    low_card_cat_cols = ['k8s_action', 'auth_event', 'cicd_action', 'time_of_day', 'ip_category']
    for col in low_card_cat_cols:
        df[col] = df[col].fillna('none')
    # dummy_na=True is kept so the output columns stay the same as before; after
    # the fillna above the '<col>_nan' indicators are always 0.
    df = pd.get_dummies(df, columns=low_card_cat_cols, dummy_na=True)

    # Temporal features
    hour = df['hour']
    day = df['day_of_week']
    # The -1 sentinel must not count as night: `hour < 6` alone is true for -1.
    df['is_night'] = (has_timestamp & ((hour < 6) | (hour > 22))).astype(int)
    df['is_weekend'] = (day >= 5).astype(int)
    # Without the mask a missing timestamp gives (-1) * (-1) = 1, which collides
    # with a real hour=1 / day=1 record.
    df['hour_day_interaction'] = (hour * day).where(has_timestamp, -1)

    # Group-based statistics (e.g., number of records per user frequency)
    df['records_per_user_freq'] = df['user_id_freq'].map(df['user_id_freq'].value_counts())

    # Normalize user activity
    df['user_activity_ratio'] = df['user_id_freq'] / df['user_id_freq'].sum()

    # Z-score anomaly proxy (for any numerical feature with wide range)
    for col in df.select_dtypes(include=[np.number]).columns:
        if col == label_col:
            continue  # never derive a feature from the target
        if df[col].nunique() > 10:  # Avoid one-hot and binary columns
            df[f'{col}_zscore'] = zscore(df[col].fillna(0))

    # Optional: log-transform skewed columns
    skewed_cols = ['user_id_freq', 'records_per_user_freq']
    for col in skewed_cols:
        df[f'{col}_log'] = np.log1p(df[col])

    # Ensure all numeric
    df = df.apply(pd.to_numeric, errors='coerce').fillna(0)

    return df

df_processed = prepare_for_xgboost(df)

# ------------------------
# Step 3: Prepare features and label
# ------------------------
X = df_processed.drop(columns=['label'])
y = df_processed['label']

# ------------------------
# Step 4: Train-test split
# ------------------------
# 80/20 split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ------------------------
# Step 5: Train XGBoost
# ------------------------
# `use_label_encoder` is not passed: XGBoost reports it as an unused parameter.
xgb_model = XGBClassifier(eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# ------------------------
# Step 6: Evaluate
# ------------------------
y_pred = xgb_model.predict(X_test)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nAccuracy Score:", accuracy_score(y_test, y_pred))


Confusion Matrix:
 [[154 140]
 [149 157]]

Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.52      0.52       294
           1       0.53      0.51      0.52       306

    accuracy                           0.52       600
   macro avg       0.52      0.52      0.52       600
weighted avg       0.52      0.52      0.52       600


Accuracy Score: 0.5183333333333333


Parameters: { "use_label_encoder" } are not used.



Save the trained model with joblib


In [None]:
import joblib
# The bare `...` below is an Ellipsis expression; it has no effect when executed.
...
# Step 6: Save model
# NOTE(review): `xgb_model` is not created in this cell -- it must already
# exist in the kernel from the training cell.
joblib.dump(xgb_model, "xgboost_model.pkl")
print("\n🚀 Trained XGBoost model saved as 'xgboost_model.pkl'")



🚀 Trained XGBoost model saved as 'xgboost_model.pkl'


In [None]:
# Hand the saved model file to google.colab's files.download (Colab-only helper).
from google.colab import files
files.download("xgboost_model.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>