In [1]:
import pandas as pd

# Load CSV data
df = pd.read_csv(r"C:\Users\halge\Downloads\network_log_dataset (1).csv")

# Display the column names
print(df.columns)


Index(['Timestamp', 'Source IP', 'Destination IP', 'Request Type',
       'Status Code', 'Bytes Transferred', 'User Agent', 'Anomaly Detected',
       'Attack Type'],
      dtype='object')


In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv(r"C:\Users\halge\Downloads\network_log_dataset (1).csv")

# Display the first few rows to understand the structure
print(df.head())


             Timestamp     Source IP Destination IP Request Type  Status Code  \
0  2024-08-31 10:15:00  192.168.1.70     172.16.0.5         POST          403   
1  2024-08-31 10:15:08  192.168.1.70     172.16.0.5         POST          500   
2  2024-08-31 10:15:22  192.168.1.50     172.16.0.5          GET          500   
3  2024-08-31 10:15:35  192.168.1.25     172.16.0.5         POST          200   
4  2024-08-31 10:15:50  192.168.1.52     172.16.0.5          GET          401   

   Bytes Transferred     User Agent  \
0               3086    Chrome/58.0   
1               2070    Chrome/58.0   
2               1741  Safari/537.36   
3               4023  Safari/537.36   
4               4078    Chrome/58.0   

                            Anomaly Detected         Attack Type  
0                  Suspected flooding attack         DDoS Attack  
1  Unusual number of authentication failures  Brute Force Attack  
2               Accessing non-existent pages                 NaN  
3  Unusual

In [3]:
# Check for missing values
print(df.isnull().sum())

# Fill missing values in 'Attack Type' with 'None' (indicating no attack)
df['Attack Type'].fillna('None', inplace=True)


Timestamp               0
Source IP               0
Destination IP          0
Request Type            0
Status Code             0
Bytes Transferred       0
User Agent              0
Anomaly Detected     1000
Attack Type          1376
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Attack Type'].fillna('None', inplace=True)


In [4]:
df.head()

Unnamed: 0,Timestamp,Source IP,Destination IP,Request Type,Status Code,Bytes Transferred,User Agent,Anomaly Detected,Attack Type
0,2024-08-31 10:15:00,192.168.1.70,172.16.0.5,POST,403,3086,Chrome/58.0,Suspected flooding attack,DDoS Attack
1,2024-08-31 10:15:08,192.168.1.70,172.16.0.5,POST,500,2070,Chrome/58.0,Unusual number of authentication failures,Brute Force Attack
2,2024-08-31 10:15:22,192.168.1.50,172.16.0.5,GET,500,1741,Safari/537.36,Accessing non-existent pages,
3,2024-08-31 10:15:35,192.168.1.25,172.16.0.5,POST,200,4023,Safari/537.36,Unusual number of authentication failures,Brute Force Attack
4,2024-08-31 10:15:50,192.168.1.52,172.16.0.5,GET,401,4078,Chrome/58.0,Accessing non-existent pages,


In [7]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
import numpy as np
import pandas as pd

# Load data from a CSV file (replace 'your_file.csv' with your actual file name)
data = pd.read_csv(r"C:\Users\halge\Downloads\network_log_dataset (1).csv")

# Display the first few rows to ensure data is loaded correctly
print(data.head())


             Timestamp     Source IP Destination IP Request Type  Status Code  \
0  2024-08-31 10:15:00  192.168.1.70     172.16.0.5         POST          403   
1  2024-08-31 10:15:08  192.168.1.70     172.16.0.5         POST          500   
2  2024-08-31 10:15:22  192.168.1.50     172.16.0.5          GET          500   
3  2024-08-31 10:15:35  192.168.1.25     172.16.0.5         POST          200   
4  2024-08-31 10:15:50  192.168.1.52     172.16.0.5          GET          401   

   Bytes Transferred     User Agent  \
0               3086    Chrome/58.0   
1               2070    Chrome/58.0   
2               1741  Safari/537.36   
3               4023  Safari/537.36   
4               4078    Chrome/58.0   

                            Anomaly Detected         Attack Type  
0                  Suspected flooding attack         DDoS Attack  
1  Unusual number of authentication failures  Brute Force Attack  
2               Accessing non-existent pages                 NaN  
3  Unusual

In [10]:
# Combine relevant columns into a single text feature
data['text'] = data['Source IP'] + ' ' + data['Destination IP'] + ' ' + data['Request Type'] + ' ' + data['Status Code'].astype(str) + ' ' + data['User Agent'] + ' ' + data['Anomaly Detected'] + ' ' + data['Attack Type']

# Define labels based on the presence of anomalies
data['label'] = np.where(data['Anomaly Detected'] != '', 1, 0)

# Display the updated dataframe
print(data[['text', 'label']].head())


                                                text  label
0  192.168.1.70 172.16.0.5 POST 403 Chrome/58.0 S...      1
1  192.168.1.70 172.16.0.5 POST 500 Chrome/58.0 U...      1
2                                                NaN      1
3  192.168.1.25 172.16.0.5 POST 200 Safari/537.36...      1
4                                                NaN      1


In [11]:
# Split the dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)


In [13]:
print(type(train_texts))
print(type(test_texts))


<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [16]:
print([type(text) for text in train_texts_list[:5]])  # Print types of the first 5 elements
print([type(text) for text in test_texts_list[:5]])


[<class 'float'>, <class 'float'>, <class 'str'>, <class 'str'>, <class 'float'>]
[<class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>]


In [17]:
from transformers import AutoTokenizer

# Convert any non-string elements to strings
train_texts_list = [str(text) if isinstance(text, (str, float)) else 'missing' for text in train_texts_list]
test_texts_list = [str(text) if isinstance(text, (str, float)) else 'missing' for text in test_texts_list]

# Load pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the text data
train_encodings = tokenizer(train_texts_list, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts_list, truncation=True, padding=True, max_length=128)


In [18]:
class NetworkTrafficDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = NetworkTrafficDataset(train_encodings, train_labels.tolist())
test_dataset = NetworkTrafficDataset(test_encodings, test_labels.tolist())


In [19]:
# Load pre-trained model
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Define training arguments
training_args = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
testing_args = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.

In [20]:
# Set up the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# Training loop
for epoch in range(3):  # Change the number of epochs based on your needs
    model.train()
    for batch in training_args:
        optimizer.zero_grad()
        inputs = {k: v.to(model.device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(model.device)
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1} completed with loss: {loss.item()}")


Epoch 1 completed with loss: 0.0017829512944445014
Epoch 2 completed with loss: 0.0004368078662082553
Epoch 3 completed with loss: 0.00014225782069843262


In [21]:
# Evaluate the model
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in testing_args:
        inputs = {k: v.to(model.device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(model.device)
        outputs = model(**inputs)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Print the classification report
print(classification_report(true_labels, predictions))

# Confusion Matrix
print(confusion_matrix(true_labels, predictions))


              precision    recall  f1-score   support

           1       1.00      1.00      1.00      1000

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000

[[1000]]


In [24]:
# Generate alerts for anomalous activities, excluding 'nan' values
for i, prediction in enumerate(predictions):
    if prediction == 1:
        text_entry = str(test_texts.iloc[i])
        if text_entry != 'nan':
            ip_address = text_entry.split()[0]
            print(f"Alert: Anomalous activity detected from IP {ip_address}")


Alert: Anomalous activity detected from IP 192.168.1.40
Alert: Anomalous activity detected from IP 192.168.1.30
Alert: Anomalous activity detected from IP 192.168.1.22
Alert: Anomalous activity detected from IP 192.168.1.62
Alert: Anomalous activity detected from IP 192.168.1.62
Alert: Anomalous activity detected from IP 192.168.1.60
Alert: Anomalous activity detected from IP 192.168.1.60
Alert: Anomalous activity detected from IP 192.168.1.75
Alert: Anomalous activity detected from IP 192.168.1.51
Alert: Anomalous activity detected from IP 192.168.1.85
Alert: Anomalous activity detected from IP 192.168.1.35
Alert: Anomalous activity detected from IP 192.168.1.75
Alert: Anomalous activity detected from IP 192.168.1.12
Alert: Anomalous activity detected from IP 192.168.1.55
Alert: Anomalous activity detected from IP 192.168.1.80
Alert: Anomalous activity detected from IP 192.168.1.62
Alert: Anomalous activity detected from IP 192.168.1.35
Alert: Anomalous activity detected from IP 192.1

In [30]:
import pandas as pd

# Assuming 'data' is your original DataFrame
# And 'test_labels' is the series of labels from your test set

# Indexes for test set
test_indices = test_labels.index

# Define the anomalies Series based on the test indices
anomalies = data.loc[test_indices, 'Anomaly Detected']

# Check if 'anomalies' is defined correctly
print(anomalies)


1501                      Presence of OR 1=1 in query
2586    Multiple requests per second from the same IP
2653        Unusual number of authentication failures
1055                      Presence of OR 1=1 in query
705                         Suspected flooding attack
                            ...                      
4711                                              NaN
2313               Malformed SQL query in the request
3214               Malformed SQL query in the request
2732               Malformed SQL query in the request
1926                     High volume of POST requests
Name: Anomaly Detected, Length: 1000, dtype: object


1    Unusual number of authentication failures
Name: Anomaly Detected, dtype: object


In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Sample data
data = pd.DataFrame({
    'Source IP': ['192.168.1.70', '192.168.1.70', '192.168.1.50', '192.168.1.25', '192.168.1.52'],
    'Destination IP': ['172.16.0.5'] * 5,
    'Request Type': ['POST', 'POST', 'GET', 'POST', 'GET'],
    'Status Code': [403, 500, 500, 200, 401],
    'Bytes Transferred': [3086, 2070, 1741, 4023, 4078],
    'User Agent': ['Chrome/58.0', 'Chrome/58.0', 'Safari/537.36', 'Safari/537.36', 'Chrome/58.0'],
    'Anomaly Detected': ['Suspected flooding attack', 'Unusual number of authentication failures', 'Accessing non-existent pages', 'Unusual number of authentication failures', 'Accessing non-existent pages'],
    'Attack Type': ['DDoS Attack', 'Brute Force Attack', 'None', 'Brute Force Attack', 'None']
})

# Combine relevant columns into a single text feature
data['text'] = data['Source IP'] + ' ' + data['Destination IP'] + ' ' + data['Request Type'] + ' ' + data['Status Code'].astype(str) + ' ' + data['Bytes Transferred'].astype(str) + ' ' + data['User Agent'] + ' ' + data['Anomaly Detected'] + ' ' + data['Attack Type']

# Define labels based on the presence of anomalies
data['label'] = np.where(data['Anomaly Detected'] != '', 1, 0)

# Split the dataset
train_texts, test_texts, train_labels, test_labels = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)

# Define the check_anomaly function
def check_anomaly(ip_to_check, test_texts, predictions, data):
    """
    Check if the given IP address has an associated anomaly based on predictions.

    Parameters:
    - ip_to_check (str): The IP address to check.
    - test_texts (pd.Series): The Series containing the test texts.
    - predictions (list): The list of predictions for anomalies.
    - data (pd.DataFrame): The original DataFrame with anomaly descriptions and attack types.

    Returns:
    - str: A message indicating whether the IP address has anomalies.
    """
    ip_to_check = str(ip_to_check)
    
    # Filter the original data based on IP address and predictions
    filtered_data = data[(data['Source IP'] == ip_to_check) & (data['label'] == 1)]
    
    if filtered_data.empty:
        return f"No anomalies detected for IP {ip_to_check}"
    
    # Retrieve the anomaly descriptions and attack types
    anomalies = filtered_data[['Anomaly Detected', 'Attack Type']]
    anomaly_info = anomalies.apply(lambda row: f"Anomaly Description: {row['Anomaly Detected']}, Attack Type: {row['Attack Type']}", axis=1)
    
    return f"Alert: Anomalous activity detected from IP {ip_to_check}. " + " ".join(anomaly_info)

# Example usage
ip_address = "192.168.1.70"  # Replace with the IP address you want to check
print(check_anomaly(ip_address, test_texts, predictions, data))


Alert: Anomalous activity detected from IP 192.168.1.70. Anomaly Description: Suspected flooding attack, Attack Type: DDoS Attack Anomaly Description: Unusual number of authentication failures, Attack Type: Brute Force Attack
