In [5]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder

In [6]:
# Load the trained model and LabelEncoder
xgb_model = joblib.load('xgb_model.pkl')
label_encoder = LabelEncoder()
label_encoder.classes_ = np.load('label_classes.npy', allow_pickle=True)  # Ensure to save this earlier during training


In [11]:
# Function to preprocess the new traffic data
def preprocess_data(file_path):
    # Load the new traffic file
    column_names = [
        'ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p', 'proto', 
        'service', 'duration', 'orig_bytes', 'resp_bytes', 'conn_state', 'local_orig', 
        'local_resp', 'missed_bytes', 'history', 'orig_pkts', 'orig_ip_bytes', 
        'resp_pkts', 'resp_ip_bytes', 'tunnel_parents'
    ]
    df = pd.read_table(file_path, skiprows=10, names=column_names, low_memory=False)
    df.drop(df.tail(1).index, inplace=True)  # Drop the last row if it's a footer
    original_data = df.copy()  # Keep a copy of the original data
    
    # Normalize numerical features (if not normalized already)
    numerical_features = ['duration', 'orig_bytes', 'resp_bytes', 'missed_bytes', 'orig_pkts', 'resp_pkts']
    for feature in numerical_features:
        if feature in df.columns:
            # Convert to numeric, coercing errors (non-numeric values will become NaN)
            df[feature] = pd.to_numeric(df[feature], errors='coerce')
            # Fill NaN with the column mean to handle missing or invalid values
            df[feature].fillna(df[feature].mean(), inplace=True)
            # Standardize the values
            df[feature] = (df[feature] - df[feature].mean()) / (df[feature].std() + 1e-10)
    
    # One-hot encode categorical features
    categorical_features = ['proto', 'conn_state', 'service']
    df = pd.get_dummies(df, columns=categorical_features, drop_first=False)
    
    # Align with the training feature set
    training_features = ['duration', 'orig_bytes', 'resp_bytes', 'missed_bytes', 'orig_pkts', 'resp_pkts',
                         'proto_icmp', 'proto_tcp', 'proto_udp',
                         'conn_state_OTH', 'conn_state_REJ', 'conn_state_RSTO', 'conn_state_RSTOS0',
                         'conn_state_RSTR', 'conn_state_S0', 'conn_state_S1', 'conn_state_S2', 
                         'conn_state_S3', 'conn_state_SF',
                         'service_dns', 'service_http', 'service_irc', 'service_ssh']
    
    # Add missing columns with zeros
    for col in training_features:
        if col not in df.columns:
            df[col] = 0

    # Ensure column order matches
    df = df[training_features]
    return df, original_data


In [12]:
# Predict and save results
def predict_and_save(file_path, output_path):
    # Preprocess the new data
    preprocessed_data, original_data = preprocess_data(file_path)
    
    # Predict labels
    predictions = xgb_model.predict(preprocessed_data)
    
    # Decode predictions
    decoded_predictions = label_encoder.inverse_transform(predictions)
    
    # Add predictions to the original data
    original_data['predicted_label'] = decoded_predictions
    
    # Add a new column for attacker's IP
    original_data['attacker_ip'] = original_data.apply(
        lambda row: row['id.orig_h'] if row['predicted_label'] != 'Benign' else None, axis=1
    )
    
    # Save results
    original_data.to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")


In [13]:
# File paths
input_file = 'C:/Users/kscna/Desktop/tra_ind_conn.log.txt'  # Replace with actual input file path
output_file = 'D:/Major Project/edge computing/predicted.csv'  # Replace with desired output file path

In [14]:
# Run prediction
predict_and_save(input_file, output_file)

Predictions saved to D:/Major Project/edge computing/predicted.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[feature].fillna(df[feature].mean(), inplace=True)
