<a href="https://colab.research.google.com/github/Sriii27/graph-analysis/blob/main/Cn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install scapy pandas tqdm


Collecting scapy
  Downloading scapy-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading scapy-2.7.0-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scapy
Successfully installed scapy-2.7.0


In [None]:
from google.colab import files

# files.upload() returns a dict of {filename: file bytes}. Binding it to a name
# stops the notebook from echoing the raw uploaded bytes as the cell output.
uploaded = files.upload()


In [5]:
from scapy.all import PcapReader, IP, TCP, UDP
import pandas as pd
from tqdm import tqdm

def pcap_to_df(pcap_path, label):
    """Extract one row of basic header features per IP packet in a pcap file.

    Packets without an IP layer are ignored. Packets that raise while being
    parsed are skipped (best effort), but the number skipped is reported
    instead of being discarded silently.

    Parameters
    ----------
    pcap_path : str
        Path of the capture file to read.
    label : any
        Value written to the ``label`` column of every extracted row.

    Returns
    -------
    pandas.DataFrame
        Columns ``src_ip``, ``dst_ip``, ``protocol`` (the IP header's proto
        field), ``src_port``, ``dst_port`` (None unless the packet has a TCP
        or UDP layer), ``packet_length`` (``len(pkt)``) and ``label``. The
        columns are present even when no packet was extracted.
    """
    columns = [
        "src_ip", "dst_ip", "protocol",
        "src_port", "dst_port", "packet_length", "label",
    ]
    rows = []
    skipped = 0  # packets that raised while their fields were being read

    with PcapReader(pcap_path) as pcap:
        for pkt in tqdm(pcap, desc=f"Processing {pcap_path}"):
            try:
                if IP not in pkt:
                    continue

                ip_layer = pkt[IP]

                # Ports only exist on TCP/UDP packets; everything else keeps None.
                src_port = None
                dst_port = None
                if TCP in pkt:
                    src_port = pkt[TCP].sport
                    dst_port = pkt[TCP].dport
                elif UDP in pkt:
                    src_port = pkt[UDP].sport
                    dst_port = pkt[UDP].dport

                rows.append({
                    "src_ip": ip_layer.src,
                    "dst_ip": ip_layer.dst,
                    "protocol": ip_layer.proto,
                    "src_port": src_port,
                    "dst_port": dst_port,
                    "packet_length": len(pkt),
                    "label": label
                })

            except Exception:
                # Keep going on a malformed packet, but remember that it happened.
                skipped += 1
                continue

    if skipped:
        print(f"Skipped {skipped} unparseable packet(s) in {pcap_path}")

    # Explicit columns keep the schema stable even for an empty capture.
    return pd.DataFrame(rows, columns=columns)

# Capture files as they appear under /content after upload
MALWARE_PCAP = "/content/2013-11-06_capture-win18 (1).pcap"
NORMAL_PCAP = "/content/2017-04-19_win-normal (1).pcap"

# Per-capture extraction: the malware capture is labeled 1, the normal one 0
df_malware = pcap_to_df(MALWARE_PCAP, label=1)
df_normal = pcap_to_df(NORMAL_PCAP, label=0)

# Stack both tables (malware rows first) under a fresh 0..n-1 index
labeled_frames = [df_malware, df_normal]
combined_df = pd.concat(labeled_frames, ignore_index=True)

# Persist without the index column
combined_df.to_csv("/content/combined_pcap_labeled.csv", index=False)

print("✅ PCAP → CSV completed")
final_shape = combined_df.shape
print("Final shape:", final_shape)
combined_df.head()


Processing /content/2013-11-06_capture-win18 (1).pcap: 136782it [00:59, 2303.25it/s]
Processing /content/2017-04-19_win-normal (1).pcap: 265376it [01:42, 2583.06it/s]


✅ PCAP → CSV completed
Final shape: (374507, 7)


Unnamed: 0,src_ip,dst_ip,protocol,src_port,dst_port,packet_length,label
0,10.0.2.118,8.8.8.8,17,55305.0,53.0,76,1
1,10.0.2.118,8.8.4.4,17,55305.0,53.0,76,1
2,8.8.4.4,10.0.2.118,17,53.0,55305.0,92,1
3,10.0.2.118,8.8.4.4,17,62446.0,53.0,76,1
4,8.8.4.4,10.0.2.118,17,53.0,62446.0,104,1


In [None]:
from google.colab import files

# Use the absolute path the CSV is written to, so the download does not depend
# on the kernel's current working directory.
files.download("/content/combined_pcap_labeled.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
import pandas as pd

# Load the labeled packet table from the CSV on disk.
csv_path = "/content/combined_pcap_labeled.csv"
df = pd.read_csv(csv_path)
print(df.shape)
df.head()


(703497, 7)


Unnamed: 0,src_ip,dst_ip,protocol,src_port,dst_port,packet_length,label
0,10.0.2.118,8.8.8.8,17,55305.0,53.0,76,1
1,10.0.2.118,8.8.4.4,17,55305.0,53.0,76,1
2,8.8.4.4,10.0.2.118,17,53.0,55305.0,92,1
3,10.0.2.118,8.8.4.4,17,62446.0,53.0,76,1
4,8.8.4.4,10.0.2.118,17,53.0,62446.0,104,1


In [7]:
# Every missing cell becomes 0 (e.g. the ports of packets that had no TCP/UDP layer).
df = df.fillna(value=0)


In [8]:
# The "label" column is the target; every remaining column is a feature.
y = df["label"]
X = df.drop(columns="label")


In [9]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer class codes. Values are cast to
# str first, so the codes follow the string sort order of the values.
# NOTE(review): the codes for IP addresses carry no numeric meaning, and the
# addresses seen in each capture may by themselves identify its label --
# confirm that using them as features is intended.
categorical_cols = ("src_ip", "dst_ip", "protocol")
for col in categorical_cols:
    as_text = X[col].astype(str)
    le = LabelEncoder().fit(as_text)
    X[col] = le.transform(as_text)


In [10]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column. The scaler is fitted on the whole table,
# i.e. before any train/test split is made.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


feature selection

In [11]:
#2.1 Remove low-variance features
from sklearn.feature_selection import VarianceThreshold

# Measure variance on the unscaled features: after StandardScaler every
# non-constant column has unit variance, so thresholding X_scaled could only
# ever remove exactly-constant columns and the 0.01 cut-off would be a no-op.
vt = VarianceThreshold(threshold=0.01)
vt.fit(X)
keep_mask = vt.get_support()

# Apply the resulting column mask to the scaled matrix used downstream.
X_var = X_scaled[:, keep_mask]

selected_features_vt = X.columns[keep_mask]
print("After variance filter:", X_var.shape)


After variance filter: (703497, 6)


In [12]:
#2.2 Correlation-based feature removal
import numpy as np

X_var_df = pd.DataFrame(X_var, columns=selected_features_vt)
corr = X_var_df.corr().abs()

# Keep only the strict upper triangle so each feature pair is inspected once
# and a column is never compared with itself.
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

# A column is dropped when its absolute correlation with any earlier column
# exceeds 0.9, i.e. the later member of each highly correlated pair goes.
to_drop = [col for col in upper.columns if (upper[col] > 0.9).any()]

X_corr = X_var_df.drop(columns=to_drop)
print("After correlation filter:", X_corr.shape)


After correlation filter: (703497, 5)


In [13]:
#2.3 Feature importance (Random Forest)
from sklearn.ensemble import RandomForestClassifier

# Fit a forest on the filtered features in order to rank them.
rf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)
rf.fit(X_corr, y)

# Importances labelled by column name, ranked from highest to lowest.
importances = pd.Series(rf.feature_importances_, index=X_corr.columns)
importances = importances.sort_values(ascending=False)

importances


Unnamed: 0,0
dst_ip,0.456004
src_ip,0.321563
src_port,0.210625
packet_length,0.011434
protocol,0.000375


In [None]:
#2.4 Select Top-K features
TOP_K = 10

# `importances` is sorted in descending order, so its first TOP_K index labels
# are the highest-ranked features (all of them when fewer than TOP_K exist).
top_features = importances.index[:TOP_K]
X_selected = X_corr.loc[:, top_features]

print("Final feature set:", X_selected.columns.tolist())


Final feature set: ['dst_ip', 'src_ip', 'src_port', 'packet_length', 'protocol']


MODEL SELECTION

In [None]:
#3.1 Train-test split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, **split_options)


In [None]:
#3.2 Model 1 — Random Forest (Baseline ⭐)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 300-tree forest, all cores, fixed seed.
rf_model = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)
rf_model.fit(X_train, y_train)

# Score on the held-out rows.
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)

print("Random Forest Accuracy:", rf_accuracy)
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     52978
           1       1.00      1.00      1.00     21924

    accuracy                           1.00     74902
   macro avg       1.00      1.00      1.00     74902
weighted avg       1.00      1.00      1.00     74902



In [None]:
#3.3 Model 2 — Support Vector Machine
from sklearn.svm import SVC

# RBF-kernel SVM with probability estimates enabled.
# NOTE(review): only predict() is used in this cell; probability=True makes
# fitting noticeably slower -- confirm predict_proba is needed elsewhere.
svm = SVC(kernel="rbf", probability=True)
svm.fit(X_train, y_train)

y_pred_svm = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", svm_accuracy)


SVM Accuracy: 0.9955408400309738


In [None]:
#3.4 Model 3 — Logistic Regression (Baseline)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Build the 80/20 stratified split inside this cell, with a fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, **split_options)

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)

NameError: name 'X_train' is not defined