<a href="https://colab.research.google.com/github/Raaghashree/Pattern-Based-Insider-Threat-Detection-Using-Machine-Learning/blob/main/Results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab only); later cells read the
# dataset files from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


ML BASED RESULTS

In [None]:
import pandas as pd
import numpy as np
import ipaddress
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from imblearn.over_sampling import SMOTE

def supervised_cleaning(file_path="/content/drive/MyDrive/Dataset/CTDAPD_Dataset.csv"):
    """Load the dataset and return a cleaned, balanced, min-max scaled frame.

    Processing steps, in order:
      1. Strip whitespace from column names.
      2. Drop columns with fewer than 50% non-null values; fill remaining
         numeric gaps with the column mean and categorical gaps with the mode.
      3. Drop duplicate rows.
      4. Drop every object column except the label, the two IP columns,
         ``Protocol_Type`` and ``System_Patch_Status``.
      5. Convert IPv4 strings to integers and label-encode the categoricals.
      6. Replace +/-inf in ``Normalized_Packet_Flow`` with the finite mean.
      7. Oversample with SMOTE when the minority/majority ratio is below 0.5.
      8. Scale all features to [0, 1] with ``MinMaxScaler``.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Scaled feature columns plus a ``Label`` column, on a fresh 0..n-1
        index. When SMOTE was applied the frame contains synthetic rows.

    Raises
    ------
    ValueError
        If the dataset has no ``Label`` column.
    """
    # dataset
    df = pd.read_csv(file_path)
    df.columns = df.columns.str.strip()
    # Step 1: Handle missing values
    df.dropna(thresh=len(df) * 0.5, axis=1, inplace=True)

    numeric_cols = df.select_dtypes(include=['number']).columns
    df[numeric_cols] = df[numeric_cols].apply(lambda col: col.fillna(col.mean()) if col.isnull().sum() > 0 else col)

    categorical_cols = df.select_dtypes(include=['object']).columns
    df[categorical_cols] = df[categorical_cols].apply(lambda col: col.fillna(col.mode()[0]) if col.isnull().sum() > 0 else col)

    # Step 2: Remove duplicates
    df.drop_duplicates(inplace=True)

    # Step 3: Handle non-numeric columns
    non_numeric_cols = df.select_dtypes(include=['object']).columns.tolist()
    keep_columns = ['Label', 'Source_IP', 'Destination_IP', 'Protocol_Type', 'System_Patch_Status']
    cols_to_drop = [col for col in non_numeric_cols if col not in keep_columns]
    df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

    # Convert IPs to integers
    if 'Source_IP' in df.columns:
        df['Source_IP_int'] = df['Source_IP'].apply(lambda ip: int(ipaddress.IPv4Address(ip)))
    if 'Destination_IP' in df.columns:
        df['Destination_IP_int'] = df['Destination_IP'].apply(lambda ip: int(ipaddress.IPv4Address(ip)))
    df.drop(columns=['Source_IP', 'Destination_IP'], inplace=True, errors='ignore')

    # Label encode 'Protocol_Type' and 'System_Patch_Status'
    if 'Protocol_Type' in df.columns:
        proto_encoder = LabelEncoder()
        df['Protocol_Type'] = proto_encoder.fit_transform(df['Protocol_Type'])

    if 'System_Patch_Status' in df.columns:
        patch_encoder = LabelEncoder()
        df['System_Patch_Status'] = patch_encoder.fit_transform(df['System_Patch_Status'])

    # Encode target label (only when it is stored as text)
    if 'Label' in df.columns:
        if df['Label'].dtype == 'object':
            encoder = LabelEncoder()
            df['Label'] = encoder.fit_transform(df['Label'])
    else:
        raise ValueError("Target column 'Label' not found in dataset")

    # Separate features and target
    X = df.drop(columns=['Label'])
    y = df['Label']

    # Handle inf values (specific column handling)
    col = 'Normalized_Packet_Flow'
    if col in X.columns:
        mean_val = X[col][~np.isinf(X[col])].mean()
        X[col] = X[col].replace([np.inf, -np.inf], mean_val)

    # Handle class imbalance using SMOTE
    class_counts = y.value_counts()
    imbalance_ratio = class_counts.min() / class_counts.max()

    if imbalance_ratio < 0.5:
        smote = SMOTE(sampling_strategy=1.0, random_state=42)
        X_resampled, y_resampled = smote.fit_resample(X, y)
    else:
        X_resampled, y_resampled = X, y

    # Normalize features
    scaler = MinMaxScaler()
    X_normalized = scaler.fit_transform(X_resampled)
    df_processed = pd.DataFrame(X_normalized, columns=X.columns)
    # Assign the labels positionally. df_processed has a fresh RangeIndex,
    # whereas y_resampled may still carry the original (gapped) index when
    # SMOTE was skipped and duplicates were dropped; assigning the Series
    # directly would align on index and mislabel or NaN-fill rows.
    df_processed['Label'] = np.asarray(y_resampled)

    return df_processed


def unsupervised_cleaning(file_path="/content/drive/MyDrive/Dataset/CTDAPD_Dataset.csv"):
    """Prepare the dataset for unsupervised models and report progress.

    Reads the CSV at ``file_path``, imputes missing values, removes duplicate
    rows, converts the IP columns to integers, label-encodes the two retained
    categorical columns, drops ``Label`` and min-max scales every remaining
    column. Progress messages are printed along the way.

    Returns the scaled ``pandas.DataFrame`` (fresh 0..n-1 index), or ``None``
    when the file does not exist.
    """
    def _fill_with_mean(series):
        # Impute only when the column actually has gaps.
        return series.fillna(series.mean()) if series.isnull().sum() > 0 else series

    def _fill_with_mode(series):
        # Most frequent value stands in for missing categorical entries.
        return series.fillna(series.mode()[0]) if series.isnull().sum() > 0 else series

    def _ip_to_int(address):
        return int(ipaddress.IPv4Address(address))

    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print("Error: File not found.")
        return None
    print(f"Dataset loaded successfully: {df.shape}")

    # --- Column names -------------------------------------------------------
    df.columns = df.columns.str.strip()

    # --- Missing values -----------------------------------------------------
    null_share = df.isnull().mean() * 100
    print("Missing Values:", null_share[null_share > 0].to_dict())

    # Columns that are more than half empty are discarded outright.
    df.dropna(thresh=len(df) * 0.5, axis=1, inplace=True)

    number_like = df.select_dtypes(include=['number']).columns
    df[number_like] = df[number_like].apply(_fill_with_mean)

    text_like = df.select_dtypes(include=['object']).columns
    df[text_like] = df[text_like].apply(_fill_with_mode)

    # --- Duplicates ---------------------------------------------------------
    print(f"Duplicate rows removed: {df.duplicated().sum()}")
    df.drop_duplicates(inplace=True)

    # --- Non-numeric columns ------------------------------------------------
    retained = ['Source_IP', 'Destination_IP', 'Protocol_Type', 'System_Patch_Status']
    discard = [
        name
        for name in df.select_dtypes(include=['object']).columns.tolist()
        if name not in retained
    ]
    df.drop(columns=discard, inplace=True, errors='ignore')

    # IPv4 strings become integers in new "<name>_int" columns.
    for ip_column in ('Source_IP', 'Destination_IP'):
        if ip_column in df.columns:
            df[ip_column + '_int'] = df[ip_column].apply(_ip_to_int)
    df.drop(columns=['Source_IP', 'Destination_IP'], inplace=True, errors='ignore')

    # Each categorical field gets its own, independently fitted encoder.
    for category_column in ('Protocol_Type', 'System_Patch_Status'):
        if category_column in df.columns:
            df[category_column] = LabelEncoder().fit_transform(df[category_column])

    # --- Target removal -----------------------------------------------------
    if 'Label' in df.columns:
        df.drop(columns=['Label'], inplace=True)
        print("Dropped 'Label' column for unsupervised learning")

    # --- Infinite values ----------------------------------------------------
    flow = 'Normalized_Packet_Flow'
    if flow in df.columns:
        finite_mean = df.loc[~np.isinf(df[flow]), flow].mean()
        df[flow] = df[flow].replace([np.inf, -np.inf], finite_mean)

    # --- Scaling ------------------------------------------------------------
    df_processed = pd.DataFrame(MinMaxScaler().fit_transform(df), columns=df.columns)

    print("Dataset ready for unsupervised learning!")
    print(f"Final shape: {df_processed.shape}")

    return df_processed


In [None]:
# Build both cleaned frames from the default dataset path.
# unsupervised_cleaning() returns None when the CSV is missing, so UL_df may be None.
# NOTE(review): UL_df is not referenced again in the cells shown here — confirm it is still needed.
UL_df = unsupervised_cleaning()
SL_df = supervised_cleaning()

Dataset loaded successfully: (54768, 30)
Missing Values: {'Attack_Vector': 80.03761320479111, 'Botnet_Family': 89.87547472976921, 'Malware_Type': 94.1261320479112}
Duplicate rows removed: 0
Dataset ready for unsupervised learning!
Final shape: (54768, 24)


In [None]:
# Interactive upload of the saved models and the rule-based module (Colab only).
# NOTE(review): the recorded output shows the files being saved with a " (1)"
# suffix, i.e. same-named files already existed; cells that open the un-suffixed
# names will therefore read the earlier copies, not this upload.
from google.colab import files
uploaded = files.upload()

Saving xgboost_cyber_model.pkl to xgboost_cyber_model (1).pkl
Saving user_behavior_analysis.py to user_behavior_analysis (1).py
Saving autoencoder_cyber_model.pkl to autoencoder_cyber_model (1).pkl


In [None]:
# Load the pre-trained supervised (XGBoost) and unsupervised (autoencoder) models.
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files from a trusted source.
# NOTE(review): these fixed names do not match the " (1)"-suffixed names reported
# by the upload cell's recorded output; confirm the intended files are loaded.
import joblib
SL_model = joblib.load("xgboost_cyber_model.pkl")
UL_model = joblib.load("autoencoder_cyber_model.pkl")

In [None]:
import pandas as pd
import numpy as np

# Reproducible 1000-row sample of the cleaned supervised frame, split into
# features and ground-truth labels.
SL_sample = SL_df.sample(n=1000, random_state=42)
y_SL = SL_sample['Label']
X_SL = SL_sample.drop(columns=['Label'])

# Supervised model: hard class prediction plus the probability of class 1.
SL_preds = SL_model.predict(X_SL)
SL_probs = SL_model.predict_proba(X_SL)[:, 1]

# Autoencoder: per-row mean squared reconstruction error.
reconstructions = UL_model.predict(X_SL)
mse = np.mean(np.power(X_SL - reconstructions, 2), axis=1)

# Rows whose error exceeds the sample's 95th percentile are flagged (1).
threshold = np.percentile(mse, 95)
UL_preds = (mse > threshold).astype(int)

# One table holding the features and the output of both models.
results_df = X_SL.assign(
    True_Label=y_SL.values,
    SL_Prediction=SL_preds,
    SL_Prob_Malicious=SL_probs,
    UL_MSE=mse,
    UL_Prediction=UL_preds,
)

# An alert is any row that at least one of the two models flagged.
alerts_df = results_df.loc[
    results_df['SL_Prediction'].eq(1) | results_df['UL_Prediction'].eq(1)
]

print("✅ Total Predictions:", results_df.shape)
print("⚠️ Alerts Found:", alerts_df.shape)



[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
✅ Total Predictions: (1000, 29)
⚠️ Alerts Found: (587, 29)


In [None]:
# Preview the combined feature / prediction table.
results_df.head()

Unnamed: 0,Source_Port,Destination_Port,Protocol_Type,Flow_Duration,Packet_Size,Flow_Bytes_per_s,Flow_Packets_per_s,Total_Forward_Packets,Total_Backward_Packets,Packet_Length_Mean_Forward,...,System_Patch_Status,Normalized_Packet_Flow,Anomaly_Severity_Index,Source_IP_int,Destination_IP_int,True_Label,SL_Prediction,SL_Prob_Malicious,UL_MSE,UL_Prediction
33685,0.346303,0.052246,0.5,0.047216,0.453762,0.154689,0.245127,0.372093,0.375,0.57159,...,1.0,0.009448,0.057808,0.0,0.428571,1,0,0.444128,0.000142,0
44571,0.922663,0.007198,0.5,0.102185,0.605016,0.102316,0.220945,0.44186,0.53125,0.560883,...,0.0,0.005185,0.066194,1.0,1.0,1,1,0.780519,0.000108,0
61124,0.573539,0.052246,0.5,0.045807,0.423197,0.12287,0.164018,0.186047,0.28125,0.48764,...,1.0,0.007012,0.062102,1.0,0.428571,0,1,0.52611,0.000137,0
67303,0.183692,0.003847,0.5,0.003524,0.460815,0.034782,0.251058,0.348837,0.375,0.594301,...,1.0,0.045939,0.17934,0.887254,0.428571,0,0,0.027935,0.000374,0
76929,0.863742,0.052246,0.5,0.059197,0.657524,0.116043,0.19111,0.325581,0.5,0.59416,...,1.0,0.112231,0.230264,1.0,1.0,0,0,0.011194,0.00045,1


In [None]:
# Preview the rows flagged by at least one of the two models.
alerts_df.head()

Unnamed: 0,Source_Port,Destination_Port,Protocol_Type,Flow_Duration,Packet_Size,Flow_Bytes_per_s,Flow_Packets_per_s,Total_Forward_Packets,Total_Backward_Packets,Packet_Length_Mean_Forward,...,System_Patch_Status,Normalized_Packet_Flow,Anomaly_Severity_Index,Source_IP_int,Destination_IP_int,True_Label,SL_Prediction,SL_Prob_Malicious,UL_MSE,UL_Prediction
44571,0.922663,0.007198,0.5,0.102185,0.605016,0.102316,0.220945,0.44186,0.53125,0.560883,...,0.0,0.005185,0.066194,1.0,1.0,1,1,0.780519,0.000108,0
61124,0.573539,0.052246,0.5,0.045807,0.423197,0.12287,0.164018,0.186047,0.28125,0.48764,...,1.0,0.007012,0.062102,1.0,0.428571,0,1,0.52611,0.000137,0
76929,0.863742,0.052246,0.5,0.059197,0.657524,0.116043,0.19111,0.325581,0.5,0.59416,...,1.0,0.112231,0.230264,1.0,1.0,0,0,0.011194,0.00045,1
10054,0.58377,0.007198,0.5,0.004228,0.612853,0.088738,0.174825,0.488372,0.34375,0.543125,...,1.0,0.12228,0.097208,1.0,1.0,1,1,0.699353,0.000205,0
57030,0.682871,0.052246,0.5,0.047216,0.520376,0.083603,0.196765,0.325581,0.46875,0.404872,...,1.0,0.009926,0.257547,1.0,1.0,0,0,0.13148,0.000444,1


RULE BASED RESULTS

In [None]:
# Input files on the mounted Drive that are passed to the rule-based analysis.
logon_path = '/content/drive/MyDrive/Dataset/logon.csv'
device_path = '/content/drive/MyDrive/Dataset/device.csv'
http_path = '/content/drive/MyDrive/Dataset/http.csv'
ldap_path = '/content/drive/MyDrive/Dataset/LDAP_2009-12.csv'
# NOTE(review): not referenced in the cells shown here, and spelled "behaviour"
# while the module imported later is "user_behavior_analysis" — confirm or remove.
rule_based = '/content/drive/MyDrive/user_behaviour_analysis.py'

In [None]:
# Show the names under which the uploaded files were actually saved.
uploaded.keys()

dict_keys(['xgboost_cyber_model (1).pkl', 'user_behavior_analysis (1).py', 'autoencoder_cyber_model (1).pkl'])

In [None]:
# Make modules in /content importable, then pull in the rule-based entry point.
# NOTE(review): the upload output lists 'user_behavior_analysis (1).py'; this import
# resolves to the un-suffixed file, presumably an earlier upload — confirm it is current.
import sys
sys.path.append('/content')

from user_behavior_analysis import analyze_user_behavior

In [None]:
# Run the rule-based analysis on the four log files; it returns a pair that is
# unpacked into the full log frame and the alerts frame.
RL_df, RL_alerts_df = analyze_user_behavior(logon_path, device_path, http_path, ldap_path)

In [None]:
# Preview the rule-based alerts.
# NOTE(review): the rendered output includes user_id and pc values — review or
# clear it before sharing the notebook.
RL_alerts_df.head()

Unnamed: 0,date,user_id,pc,activity,role,alert,log_type
892,2010-01-06 07:44:24,DTAA/NBB0501,PC-0457,http://thepiratebay.org,,Suspicious URL Accessed,http
1094,2010-01-06 11:25:51,DTAA/KCN0107,PC-2095,http://cracked.com,,Suspicious URL Accessed,http
1258,2010-01-06 12:20:01,DTAA/SLG0008,PC-1682,http://thepiratebay.org,,Suspicious URL Accessed,http
1773,2010-01-07 09:50:44,DTAA/TUA0298,PC-3426,http://crack-grey-hospital.net,,Suspicious URL Accessed,http
2523,2010-01-07 12:20:45,DTAA/ABB0272,PC-0726,http://imageshack.us,,Suspicious URL Accessed,http


In [None]:
# Persist the frames the Streamlit dashboard (app.py) reads back with pd.read_csv.
results_df.to_csv("results_df.csv", index=False)
alerts_df.to_csv("alerts_df.csv", index=False)

# Rule-based frames are truncated to their last 1000 rows before export.
RL_df_small = RL_df.tail(1000)
RL_df_small.to_csv("RL_df.csv", index=False)
RL_alerts_df_small = RL_alerts_df.tail(1000)
# Write the truncated alerts frame: it was previously computed but never used,
# so the full RL_alerts_df was exported instead.
RL_alerts_df_small.to_csv("RL_alerts_df.csv", index=False)

DASHBOARD

In [None]:
!pkill streamlit
!pip install -q streamlit pyngrok

In [None]:
!ngrok config add-authtoken 2vX4pb79CiRJ6lMQ8JbonlEW1LW_7YUqG7fWdVjz5iWdAmKnD

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
import plotly.express as px

st.set_page_config(layout="wide")
st.title("Insider Threat Detection Dashboard")

# Load DataFrames
results_df = pd.read_csv("results_df.csv")
alerts_df = pd.read_csv("alerts_df.csv")
RL_df = pd.read_csv("RL_df.csv")
RL_alerts_df = pd.read_csv("RL_alerts_df.csv")

# Create Tabs
tab1, tab2 = st.tabs(["ML-Based Detection", "Rule-Based Detection"])

# ========================= ML TAB =========================
with tab1:
    st.header("Machine Learning-Based Detection")

    # Categorize alerts by severity
    alerts_df['Severity'] = pd.cut(alerts_df['SL_Prob_Malicious'], bins=[0, 0.4, 0.7, 1],
                                   labels=['Low', 'Medium', 'High'])

    # Metrics/KPI Cards
    total_alerts = len(alerts_df)
    high_alerts = (alerts_df['Severity'] == 'High').sum()
    medium_alerts = (alerts_df['Severity'] == 'Medium').sum()
    low_alerts = (alerts_df['Severity'] == 'Low').sum()

    col1, col2, col3, col4 = st.columns(4)
    col1.metric("Total Alerts", total_alerts)
    col2.metric("High", high_alerts)
    col3.metric("Medium", medium_alerts)
    col4.metric("Low", low_alerts)

    st.subheader("Anomaly Severity Over Time")
    if 'Date' in results_df.columns:
        results_df['Date'] = pd.to_datetime(results_df['Date'])
        timeline = results_df.resample('1H', on='Date').agg({
            'SL_Prob_Malicious': 'mean',
            'Anomaly_Severity_Index': 'max'
        }).reset_index()

        fig_line = px.line(
            timeline, x='Date', y='SL_Prob_Malicious',
            title='Average Malicious Probability Over Time',
            labels={'SL_Prob_Malicious': 'Avg Malicious Probability'}
        )
        st.plotly_chart(fig_line, use_container_width=True)

    st.subheader("All Predictions")
    st.dataframe(results_df, use_container_width=True)

    st.subheader("Alerts Only (High Confidence)")
    st.dataframe(alerts_df, use_container_width=True)

    # Donut Chart for severity
    severity_count = alerts_df['Severity'].value_counts().reset_index()
    severity_count.columns = ['Severity', 'Count']

    fig_ml_donut = px.pie(
        severity_count, names='Severity', values='Count',
        hole=0.5, title="ML Alert Severity Distribution",
        color_discrete_map={'Low': '#2ECC40', 'Medium': '#FF851B', 'High': '#FF4136'}
    )
    st.plotly_chart(fig_ml_donut, use_container_width=True)

    # Protocol-Based Analysis
    st.subheader("Protocol-Based Insights")

    protocol_map = {0: 'TCP', 0.5: 'UDP', 1: 'FTP'}
    results_df['Protocol_Label'] = results_df['Protocol_Type'].map(protocol_map)

    # Count of entries per protocol
    protocol_counts = results_df['Protocol_Label'].value_counts().reset_index()
    protocol_counts.columns = ['Protocol', 'Count']

    fig_protocol_bar = px.bar(
        protocol_counts,
        x='Protocol',
        y='Count',
        title='Number of Events per Protocol',
        color='Protocol',
        color_discrete_sequence=px.colors.qualitative.Vivid
    )
    st.plotly_chart(fig_protocol_bar, use_container_width=True)

    # Average anomaly score by protocol
    protocol_anomaly_avg = results_df.groupby('Protocol_Label')['Anomaly_Score'].mean().reset_index()

    fig_anomaly_bar = px.bar(
        protocol_anomaly_avg,
        x='Protocol_Label',
        y='Anomaly_Score',
        title='Average Anomaly Score by Protocol',
        color='Protocol_Label',
        color_discrete_sequence=px.colors.qualitative.Plotly
    )
    st.plotly_chart(fig_anomaly_bar, use_container_width=True)

# ========================= RULE-BASED TAB =========================
with tab2:
    st.header("Rule-Based Detection")

    # Metric Cards
    st.subheader("Overview Metrics")
    col1, col2 = st.columns(2)
    total_logs = len(RL_df)
    total_alerts = len(RL_alerts_df)
    col1.metric("Total Logs", total_logs)
    col2.metric("Total Alerts", total_alerts)

    st.subheader("Sample of Rule-Based Logs")
    st.dataframe(RL_df.sample(min(len(RL_df), 1000)), use_container_width=True)

    st.subheader("Alerts Only")
    st.dataframe(RL_alerts_df, use_container_width=True)



    # Events Over Time by log_type
    st.subheader("Events Over Time")
    RL_df['date'] = pd.to_datetime(RL_df['date'])
    events_by_type = RL_df.groupby([pd.Grouper(key='date', freq='1D'), 'log_type']).size().reset_index(name='Count')

    fig_events_time = px.line(
        events_by_type, x='date', y='Count', color='log_type',
        title='Log Events Over Time by Type',
        labels={'date': 'Date', 'Count': 'Log Count'}
    )
    st.plotly_chart(fig_events_time, use_container_width=True)

    # After-Hours Activity Trend
    st.subheader("After-Hours Activity Trend (6 PM – 6 AM)")
    RL_df['hour'] = RL_df['date'].dt.hour
    after_hours = RL_df[
        ((RL_df['hour'] >= 18) | (RL_df['hour'] < 6)) &
        (RL_df['log_type'].isin(['logon', 'http']))
    ]
    after_hours_grouped = after_hours.groupby([pd.Grouper(key='date', freq='1D'), 'log_type']).size().reset_index(name='Count')

    fig_after_hours = px.line(
        after_hours_grouped, x='date', y='Count', color='log_type',
        title="After-Hours Activity by Type",
        labels={'date': 'Date', 'Count': 'Log Count'}
    )
    st.plotly_chart(fig_after_hours, use_container_width=True)

    # User Activity Timeline
    st.subheader("User Activity Timeline")
    user_activity = RL_df.groupby(['user_id', pd.Grouper(key='date', freq='1D')]).size().reset_index(name='Count')

    fig_user_activity = px.line(
        user_activity, x='date', y='Count', color='user_id',
        title="Daily User Activity Timeline",
        labels={'date': 'Date', 'Count': 'Log Count'}
    )
    st.plotly_chart(fig_user_activity, use_container_width=True)


Overwriting app.py


In [None]:
# Stop any ngrok and Streamlit processes left over from a previous run
# before the dashboard is started again.
!pkill ngrok
!pkill streamlit

In [None]:
from pyngrok import ngrok

# Auth
ngrok.set_auth_token("2vX4pb79CiRJ6lMQ8JbonlEW1LW_7YUqG7fWdVjz5iWdAmKnD")

public_url = ngrok.connect("http://localhost:8501")
print("Streamlit URL:", public_url)

# Launch Streamlit silently
!streamlit run app.py &> /dev/null &


ModuleNotFoundError: No module named 'pyngrok'