In [None]:
# ===============================
# 📦 Step 1: Load & Preprocess
# ===============================

import pandas as pd

# Load the raw traffic records; the CSV is read from the notebook's working directory.
df = pd.read_csv("traffic_dataset.csv")

# Preview the first few rows.
print(df.head())

# Report column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()

   Start time   Last time Attack category   Attack subcategory Protocol  \
0  1421927414  1421927416  Reconnaissance                 HTTP      tcp   
1  1421927415  1421927415        Exploits     Unix 'r' Service      udp   
2  1421927416  1421927416        Exploits              Browser      tcp   
3  1421927417  1421927417        Exploits  Miscellaneous Batch      tcp   
4  1421927418  1421927418        Exploits           Cisco IOS       tcp   

      Source IP  Source Port  Destination IP  Destination Port  \
0  175.45.176.0        13284  149.171.126.16              80.0   
1  175.45.176.3        21223  149.171.126.18           32780.0   
2  175.45.176.2        23357  149.171.126.16              80.0   
3  175.45.176.2        13792  149.171.126.16            5555.0   
4  175.45.176.2        26939  149.171.126.10              80.0   

                                         Attack Name  \
0  Domino Web Server Database Access: /doladmin.n...   
1  Solaris rwalld Format String Vulnerab

In [None]:
# Feature engineering: convert the timestamps (Start time, Last time) into datetime format.
# Handle missing values (Attack subcategory, Attack Name, Attack Reference).
# Encode categorical variables (Protocol, Attack category, Attack subcategory).

import pandas as pd

# Convert the epoch-second timestamps into datetime64 values.
for time_col in ('Start time', 'Last time'):
    df[time_col] = pd.to_datetime(df[time_col], unit='s')

# Fill missing values with the "Unknown" placeholder, but only in text columns.
# Filling the whole frame would also write strings into numeric and datetime
# columns, turning them into mixed-type object columns.
text_cols = [
    col for col in df.columns
    if not pd.api.types.is_numeric_dtype(df[col])
    and not pd.api.types.is_datetime64_any_dtype(df[col])
]
df[text_cols] = df[text_cols].fillna("Unknown")

# Encode categorical features as integer codes.
# The codes follow the sorted order of each column's distinct values.
for cat_col in ('Protocol', 'Attack category', 'Attack subcategory'):
    df[cat_col] = df[cat_col].astype('category').cat.codes

  df.fillna("Unknown", inplace=True)


In [None]:
# print(df.head())  # Shows the first 5 rows

# print(df.dtypes)

# print(df.isnull().sum())  # Should show 0 for all columns
# print(df[['Protocol', 'Attack category', 'Attack subcategory']].head(10))

In [None]:
# Convert each traffic record into a one-line text summary for the language model.
#
# The 'Attack category' column is deliberately NOT written into the text: it is
# the target the classifier is trained to predict (y = df_sample['Attack category']),
# so placing it in the model input would leak the label into the features.
# NOTE(review): 'Attack subcategory' and 'Attack Name' may also be closely tied
# to the category -- confirm they are really available at prediction time.

def _describe_flow(row):
    """Return the text summary for one traffic record (one DataFrame row)."""
    return (
        f"Subcategory: {row['Attack subcategory']}, "
        f"Protocol: {row['Protocol']}, Source: {row['Source IP']}:{row['Source Port']}, "
        f"Destination: {row['Destination IP']}:{row['Destination Port']}, "
        f"Attack Name: {row['Attack Name']}"
    )


df['traffic_text'] = df.apply(_describe_flow, axis=1)

# Display sample text
print(df['traffic_text'].head())

In [None]:
# Sampling: work on a fixed-size random subset of the records.
print(df.shape)  # Check number of rows and columns

# Cap the request at the number of available rows: DataFrame.sample raises
# ValueError when n is larger than the frame and replace=False (the default).
sample_size = min(5000, len(df))
df_sample = df.sample(n=sample_size, random_state=42)  # Take up to 5000 samples

In [None]:
# ===============================
# ✂️ Step 2: Tokenization
# ===============================

from transformers import AutoTokenizer
import torch
import gc

# Use lightweight BERT model to avoid crashes
model_name = "prajjwal1/bert-tiny"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the whole sample at once (5000 is small enough).
# max_length is given explicitly so truncation uses the same 512-token cap as
# the per-batch tokenization in the embedding step; without it, truncation
# falls back to the checkpoint's declared maximum length, which may not be set.
tokens = tokenizer(
    df_sample['traffic_text'].tolist(),
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

print("✅ Tokenization completed.")

In [None]:
from transformers import AutoModel
import torch
import gc

# Load tiny BERT model on the CPU and switch it to inference mode
model_name = "prajjwal1/bert-tiny"
model = AutoModel.from_pretrained(model_name)
model = model.to('cpu')
model.eval()

# Number of texts embedded per forward pass
mini_batch_size = 256

# One (batch_size, hidden_size) tensor per mini-batch
all_embeddings = []

texts = df_sample['traffic_text'].tolist()

# Iterate over batches
for i in range(0, len(texts), mini_batch_size):
    sub_texts = texts[i:i + mini_batch_size]
    # Report the real end of the batch (the last batch may be shorter).
    print(f"Embedding batch: {i} to {i + len(sub_texts)}")

    # Tokenize this mini-batch; shorter texts are padded to the longest one.
    tokens = tokenizer(
        sub_texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    ).to('cpu')

    with torch.no_grad():
        outputs = model(**tokens)
        hidden = outputs.last_hidden_state
        # Mean-pool over real tokens only. A plain .mean(dim=1) would also
        # average the padding positions, so a text's embedding would depend on
        # how long the other texts in its batch are -- and would differ from
        # the embedding of the same text when it is encoded on its own.
        mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        embeddings = (hidden * mask).sum(dim=1) / token_counts
        all_embeddings.append(embeddings)

    # Release per-batch tensors before the next iteration. Everything runs on
    # the CPU here, so there is no CUDA cache to empty.
    del tokens, outputs, hidden, mask, token_counts, embeddings
    gc.collect()

print("✅ All mini-batch embeddings done.")

# Combine all batches into a single tensor
final_embeddings = torch.cat(all_embeddings, dim=0)
print(f"Final embedding shape: {final_embeddings.shape}")


In [None]:
#step1:Train a Classifier to Predict Attack Category

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Input features (embedding matrix) and target (encoded attack category)
X = final_embeddings.numpy()
y = df_sample['Attack category']

# Hold out 20% of the sample for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train classifier. The forest is seeded so that repeated runs of the notebook
# produce the same model and the same report (the split above is already seeded).
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict and evaluate. zero_division=0 keeps the reported value for classes
# that are never predicted (0) while silencing the per-class warning.
y_pred = clf.predict(X_test)
print("✅ Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

In [None]:
#Visualize Your Embeddings

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Project the embedding matrix onto its first two principal components.
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X)

# Scatter plot of the 2-D projection, coloured by the encoded attack category.
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    X_reduced[:, 0],
    X_reduced[:, 1],
    c=y,
    cmap='viridis',
    alpha=0.6,
)
ax.set_title("Encrypted Traffic Embeddings Visualized via PCA")
ax.set_xlabel("PCA 1")
ax.set_ylabel("PCA 2")
fig.colorbar(points, ax=ax, label='Attack Category')
ax.grid(True)
plt.show()


In [None]:
import joblib
import numpy as np

# Output locations for the artifacts produced by this notebook.
embeddings_path = "bert_embeddings_5000.npy"
classifier_path = "attack_classifier.pkl"
sample_path = "traffic_sample_processed.csv"

# Embedding matrix -> NumPy binary file
np.save(embeddings_path, X)

# Trained classifier -> joblib pickle
joblib.dump(clf, classifier_path)

# Sampled, preprocessed records -> CSV (row index is not written)
df_sample.to_csv(sample_path, index=False)

print("✅ All files saved for future use.")


In [None]:
pip install gradio

In [None]:
import gradio as gr
import joblib
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

import pandas as pd  # used below to rebuild the code -> label mapping

# Load model & tokenizer once
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")
model = AutoModel.from_pretrained("prajjwal1/bert-tiny")
model.eval()

# Load trained classifier.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code -- only load classifier files that you produced yourself.
clf = joblib.load("attack_classifier.pkl")


def _build_label_map(csv_path="traffic_dataset.csv", column="Attack category"):
    """Rebuild the {integer code: category name} mapping used during training.

    The classifier was trained on codes produced by
    ``astype('category').cat.codes`` after missing values were replaced with
    "Unknown", so the same two steps are repeated here on the raw column to
    recover the names in the same order.

    Returns an empty dict when the raw dataset (or the column) is not
    available; predictions are then reported with the "Unknown" label and
    their numeric code.
    """
    try:
        raw = pd.read_csv(csv_path, usecols=[column])
    except (FileNotFoundError, ValueError):
        return {}
    categories = raw[column].fillna("Unknown").astype("category").cat.categories
    return dict(enumerate(categories))


# Attack label mapping, derived from the data instead of a hand-written table
# (a fixed table does not follow the codes assigned by the category encoding).
label_map = _build_label_map()


# Prediction function
def predict_attack(text):
    """Classify one traffic summary string and return a human-readable result.

    The text is embedded with the BERT model (mean over token vectors) and the
    embedding is passed to the trained classifier.
    """
    # max_length matches the cap used when the training embeddings were built.
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        output = model(**tokens)
        embedding = output.last_hidden_state.mean(dim=1).numpy()

    prediction = clf.predict(embedding)[0]
    label = label_map.get(int(prediction), "Unknown")
    return f"Predicted Attack Category: {label} (Label {prediction})"


# Example inputs
examples = [
    "Attack: 1, Subcategory: 2, Protocol: 6, Source: 192.168.1.10:443, Destination: 10.0.0.5:80, Attack Name: DDoS_HTTP_Flood",
    "Attack: 2, Subcategory: 3, Protocol: 17, Source: 192.168.1.15:53, Destination: 10.0.0.6:123, Attack Name: DNS_Amplification",
    "Attack: 5, Subcategory: 1, Protocol: 1, Source: 10.0.0.10:80, Destination: 192.168.1.100:443, Attack Name: Infiltration_TCP_Scan"
]

# Gradio interface
gr.Interface(
    fn=predict_attack,
    inputs=gr.Textbox(label="Enter traffic summary"),
    outputs="text",
    title="Traffic Attack Classifier",
    examples=examples
).launch()


In [None]:
!pip install flask-ngrok

In [None]:
!pip install pyngrok

In [None]:
!ngrok authtoken 2vS1CI8GKxtpu6HaRoqKEqf31y1_638BG6RckyfWzQ7ehgeUb

In [None]:
!pip install flask pyngrok

In [None]:
pip install transformers joblib

In [None]:
# from flask import Flask, request, render_template
# from pyngrok import ngrok
# import torch
# import joblib
# import numpy as np
# import pandas as pd
# from transformers import AutoTokenizer, AutoModel

# # Flask app
# app = Flask(__name__)

# # Load model and data
# clf = joblib.load("attack_classifier.pkl")
# df = pd.read_csv("traffic_sample_processed.csv")
# attack_category_decoder = dict(enumerate(df['Attack category'].astype('category').cat.categories))

# # Load tokenizer and BERT model
# model_name = "prajjwal1/bert-tiny"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModel.from_pretrained(model_name)
# model.eval()

# @app.route('/')
# def index():
#     return render_template("index.html")

# @app.route('/predict', methods=['POST'])
# def predict():
#     text = request.form['text']

#     tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
#     with torch.no_grad():
#         outputs = model(**tokens)
#         embedding = outputs.last_hidden_state.mean(dim=1).numpy()

#     pred_code = clf.predict(embedding)[0]
#     prediction_label = attack_category_decoder[pred_code]

#     return render_template("index.html", prediction_text=f"Predicted Attack Category: {prediction_label}")

# if __name__ == '__main__':
#     # Create ngrok tunnel
#     public_url = ngrok.connect(5000)
#     print(f"🚀 Your app is publicly available at: {public_url}")

#     # Run Flask app
#     app.run(port=5000)


In [None]:
# <!DOCTYPE html>
# <html>
# <head>
#   <title>Traffic Attack Predictor</title>
#   <style>
#     body {
#       font-family: Arial, sans-serif;
#       margin: 60px;
#       background-color: #f4f4f4;
#     }
#     .container {
#       background: white;
#       padding: 20px;
#       border-radius: 8px;
#       max-width: 700px;
#       margin: auto;
#       box-shadow: 0px 0px 10px rgba(0,0,0,0.1);
#     }
#     textarea {
#       width: 100%;
#       height: 150px;
#       padding: 10px;
#       font-size: 16px;
#       margin-bottom: 20px;
#     }
#     button {
#       padding: 10px 20px;
#       font-size: 16px;
#       background: #007BFF;
#       color: white;
#       border: none;
#       border-radius: 6px;
#       cursor: pointer;
#     }
#     .result {
#       margin-top: 20px;
#       font-weight: bold;
#       color: #333;
#     }
#   </style>
# </head>
# <body>
#   <div class="container">
#     <h2>🚦 Encrypted Traffic Attack Classifier</h2>
#     <form action="/predict" method="post">
#       <textarea name="text" placeholder="Enter network traffic details here..."></textarea>
#       <button type="submit">Predict</button>
#     </form>
#     {% if prediction_text %}
#       <div class="result">{{ prediction_text }}</div>
#     {% endif %}
#   </div>
# </body>
# </html>
