In [1]:
!pip install scapy
import pandas as pd
from scapy.all import *
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from scipy import stats

def extract_packet_features(packet):
    features = {}
    features['Time'] = packet.time
    features['SourceIP'] = packet[IP].src if IP in packet else 'N/A'
    features['DestIP'] = packet[IP].dst if IP in packet else 'N/A'
    features['SrcPort'] = packet[TCP].sport if TCP in packet else 'N/A'
    features['DestPort'] = packet[TCP].dport if TCP in packet else 'N/A'
    features['Protocol'] = 'TCP' if TCP in packet else 'N/A'
    features['PacketLength'] = len(packet)

    # Additional features
    features['PayloadLength'] = len(packet[TCP].payload) if TCP in packet and Raw in packet[TCP] else 0
    features['PacketCountSrcIP'] = packet[IP].src if IP in packet else 'N/A'
    features['PacketCountDestIP'] = packet[IP].dst if IP in packet else 'N/A'

    return features

def calcStats(df):
    try:
        min_size = df['PacketLength'].min()
        avg_size = df['PacketLength'].mean()
        max_size = df['PacketLength'].max()

        print(f"Minimum Packet Size: {min_size} bytes")
        print(f"Average Packet Size: {avg_size} bytes")
        print(f"Maximum Packet Size: {max_size} bytes")
    except Exception as e:
        print(f"Error calculating packet statistics: {e}")

def label_traffic(row):
    # Manual labeling according to a condition is an example.
    if row['Protocol'] == 'TCP' and row['PacketLength'] > 100:
        return 'Malicious'
    else:
        return 'Normal'

def preprocess_pcap(pcap_file, output_csv):
    packets = rdpcap(pcap_file)
    extracted_features = []

    for packet in packets:
        features = extract_packet_features(packet)
        extracted_features.append(features)

    df = pd.DataFrame(extracted_features, columns=['Time', 'SourceIP', 'DestIP', 'SrcPort', 'DestPort', 'Protocol', 'PacketLength', 'PayloadLength', 'PacketCountSrcIP', 'PacketCountDestIP'])

    # Manage missing values by substituting 'N/A' with the relevant handling.
    df.replace('N/A', pd.NA, inplace=True)
    df.dropna(inplace=True)


   # Handle outliers by applying z-score
    z_scores = stats.zscore(df[['PacketLength', 'PayloadLength']])
    df = df[(z_scores < 3).all(axis=1)]

    calcStats(df)

    # Standardize features that are numerical.
    scaler = MinMaxScaler()
    df[['PacketLength', 'PayloadLength']] = scaler.fit_transform(df[['PacketLength', 'PayloadLength']])

    # Incorporate categorical attributes
    label_encoder = LabelEncoder()
    df['Protocol'] = label_encoder.fit_transform(df['Protocol'])

    df['Label'] = df.apply(label_traffic, axis=1)

    print(df)

    # Saving the df to a CSV file
    df.to_csv(output_csv, index=False)

    return df

if __name__ == '__main__':
    pcap_file = 'ass.pcap'  
    output_csv = 'output.csv'
    df = preprocess_pcap(pcap_file, output_csv)


Collecting scapy
  Downloading scapy-2.5.0.tar.gz (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scapy
  Building wheel for scapy (setup.py) ... [?25l[?25hdone
  Created wheel for scapy: filename=scapy-2.5.0-py2.py3-none-any.whl size=1444330 sha256=efad12b45bcd3cfe6f3e2306e5fb74e4d0a1d8ade222476c89b999fcdc7f2467
  Stored in directory: /root/.cache/pip/wheels/82/b7/03/8344d8cf6695624746311bc0d389e9d05535ca83c35f90241d
Successfully built scapy
Installing collected packages: scapy
Successfully installed scapy-2.5.0
Minimum Packet Size: 54 bytes
Average Packet Size: 839.7907020353362 bytes
Maximum Packet Size: 6287 bytes
                     Time       SourceIP         DestIP SrcPort DestPort  \
0       1618182682.208384  103.255.15.27  103.255.15.23   51898    42000   
1       1618182682.208531  103.255.15.23 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix


# Loading the dataset preprocessed
df = pd.read_csv('output.csv')

# Separating features (X) and labels (y)
X = df.drop('Label', axis=1)
y = df['Label']

# One-hot encoding the columns categorical
X_encoded = pd.get_dummies(X)

# Spliting the dataset into training and testing sets below:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.1, random_state=42)

# Initializing a Random Forest classifier here:
clf = RandomForestClassifier(random_state=42)

# Training the model
clf.fit(X_train, y_train)

# Making predictions on the testing set
y_pred = clf.predict(X_test)

# Evaluating the model for results
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='Malicious', zero_division=1)
recall = recall_score(y_test, y_pred, pos_label='Malicious',zero_division=1)
f1 = f1_score(y_test, y_pred, pos_label='Malicious',zero_division=1)

# Printing the metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
Confusion Matrix:
[[26664]]
