In [None]:
# This project analyzes and classifies network intrusion data.
# Dataset: KDD Cup 1999 Data
# Data size: over 4.9 million records
# Task: multi-class classification

In [None]:
# Check GPU availability: `nvidia-smi` prints the driver/CUDA versions and the
# attached GPU(s) with their current memory usage and utilization.
!nvidia-smi

Tue Apr 15 13:49:45 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   43C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
import cudf
import cuml
import cupy as cp
import time
import seaborn as sns
import matplotlib.pyplot as plt
from cuml.model_selection import train_test_split
from cuml.preprocessing import StandardScaler, LabelEncoder
from cuml.linear_model import LogisticRegression
from cuml.ensemble import RandomForestClassifier
from cuml.neighbors import KNeighborsClassifier
from cuml.metrics import accuracy_score
from sklearn.utils import resample
import pandas as pd
import os



In [None]:
# Download the gzip-compressed KDD Cup 1999 data file from the UCI KDD archive.
!wget http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz
# Decompress to `kddcup.data`; -f forces overwriting an existing output file.
!gunzip -f kddcup.data.gz

--2025-04-15 20:08:19--  http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz
Resolving kdd.ics.uci.edu (kdd.ics.uci.edu)... 128.195.1.86
Connecting to kdd.ics.uci.edu (kdd.ics.uci.edu)|128.195.1.86|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18115902 (17M) [application/x-gzip]
Saving to: ‘kddcup.data.gz’


2025-04-15 20:08:19 (60.7 MB/s) - ‘kddcup.data.gz’ saved [18115902/18115902]



In [None]:
# Field names for the KDD Cup 1999 records, in file order: 41 features followed
# by the class label (42 fields per record). The file has no header row, so this
# list is passed to the CSV reader and must contain exactly one name per field.
# `duration` is the first field; leaving it out shifts every name one position
# to the left and binds `label` to a numeric rate column instead of the class.
column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
    'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
    'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
    'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label'
]

In [None]:
# Upper bound on the number of rows read from the file.
N_SAMPLES = 3_000_000

# GPU data loading: the file has no header row, so names are supplied explicitly.
# Wall-clock time of the read is reported for reference.
start_time = time.time()
gdf = cudf.read_csv(
    'kddcup.data',
    names=column_names,
    header=None,
    nrows=N_SAMPLES,
)
load_time = time.time() - start_time
print(f"GPU data loading time: {load_time:.2f} seconds")



GPU data loading time: 0.69 seconds


In [None]:
# Quick sanity checks on the loaded frame: sample rows, class balance,
# missing values, and overall shape.

# head() shows the first five *rows* (the heading previously said "columns").
print("\n First five rows")
print(gdf.head())

print("\n Distribution of categories:")
print(gdf['label'].value_counts())

print("\nNumber of missing values:")
print(gdf.isnull().sum())

print("\nDataset shape:", gdf.shape)

print(f"Rows: {gdf.shape[0]}")
print(f"Columns : {gdf.shape[1]}")


 First five columns
   protocol_type service  flag src_bytes  dst_bytes   land  wrong_fragment  \
0              0     tcp  http        SF        215  45076               0   
1              0     tcp  http        SF        162   4528               0   
2              0     tcp  http        SF        236   1228               0   
3              0     tcp  http        SF        233   2032               0   
4              0     tcp  http        SF        239    486               0   

   urgent  hot  num_failed_logins  ...  dst_host_srv_count  \
0       0    0                  0  ...                   0   
1       0    0                  0  ...                   1   
2       0    0                  0  ...                   2   
3       0    0                  0  ...                   3   
4       0    0                  0  ...                   4   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                       0                     0.0   
1                       1      

In [None]:
# Count the occurrences of each distinct value in the `label` column, then report them.
label_distribution = gdf['label'].value_counts()
print("\n Distribution of labels (Multi-class):")
print(label_distribution)


 Distribution of binary labels:
binary_label
1    3000000
Name: count, dtype: int64


In [None]:
# Bar chart of how often each label value occurs. The counts are computed on the
# CPU (pandas) because seaborn/matplotlib plot host data.
label_counts = gdf['label'].to_pandas().value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=label_counts.index.astype(str), y=label_counts.values, ax=ax)
# Rotate the tick labels so long label names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_title("Distribution of Labels in the Dataset")
ax.set_xlabel("Label")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()


In [None]:
# Features removed before modelling, with the author's rationale for each:
#   num_outbound_cmds - counts outbound commands from the victim host; it is
#                       almost always zero, so it does not help separate normal
#                       traffic from attacks.
#   is_host_login     - almost always zero, so the model gains nothing from it.
#   su_attempted      - a by-product of attacks that adds noise rather than signal.
columns_to_drop = ['num_outbound_cmds', 'is_host_login', 'su_attempted']

# `gdf` is overwritten here, so only drop columns that are still present; this
# keeps the cell re-runnable instead of raising KeyError on a second execution.
gdf = gdf.drop(columns=[col for col in columns_to_drop if col in gdf.columns])

# `src_bytes` is deliberately NOT dropped: it is a numeric byte count in the
# KDD'99 schema. Any column that turns out to be non-numeric is discarded by the
# numeric-conversion step that follows this cell.

X = gdf.drop(columns=['label'])
y = gdf['label']

# cuML estimators need numeric targets. When the label column holds text
# (e.g. class names), map each class to an integer code; `target_encoder` is
# kept so predictions can be mapped back with `inverse_transform`.
if not pd.api.types.is_numeric_dtype(y.dtype):
    target_encoder = LabelEncoder()
    y = target_encoder.fit_transform(y).astype('int32')


In [None]:
# Integer-encode the categorical features. Each column gets its own encoder,
# stored in `categorical_encoders`, so every mapping stays available for
# inverse-transforming or for encoding new data (a single shared encoder would
# only remember the classes of the last column it was fitted on).
categorical_encoders = {}
for col in ['protocol_type', 'service', 'flag']:
    label_encoder = LabelEncoder()
    X[col] = label_encoder.fit_transform(X[col].astype(str))
    categorical_encoders[col] = label_encoder

# Coerce every column to a numeric dtype. Columns that cannot be converted are
# reported (the message reads "column <name> has a problem: <error>") and
# collected so they can be removed from the feature matrix below.
non_numeric_cols = []
for col in X.columns:
    try:
        X[col] = cudf.to_numeric(X[col], errors='raise')
    except Exception as e:
        print(f" العمود {col} فيه مشكلة: {e}")
        non_numeric_cols.append(col)

X = X.drop(columns=non_numeric_cols)
# Use a single float32 dtype for the whole feature matrix.
X = X.astype('float32')


In [None]:
# Hold out 20% of the rows for testing. The fixed seed makes the split (and
# therefore every reported score) repeatable across runs, matching the seeded
# resampling used later in the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

start_time = time.time()
scaler = StandardScaler()
# Fit the scaling statistics on the training rows only, then apply the same
# transform to the test rows so no test information leaks into the scaler.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_test_scaled = X_test_scaled.fillna(0)
y_test = y_test.fillna(0)
preprocess_time = time.time() - start_time
print(f"GPU preprocessing time: {preprocess_time:.2f} seconds")

In [None]:
# Oversampling (done on the CPU with pandas / scikit-learn): every row that does
# not belong to the most frequent class is pooled and resampled with replacement
# until the pool is as large as that class.
X_train_cpu = X_train_scaled.to_pandas()
y_train_cpu = y_train.to_pandas()

train_df = X_train_cpu.copy()
# Attach the labels by POSITION, not by index. The scaled features and the split
# labels are row-aligned, but they are not guaranteed to share an index; an
# index-aligned assignment would then pair rows with the wrong label or NaN.
train_df['label'] = y_train_cpu.to_numpy()
# Rows without a label cannot be used for training; discard them rather than
# silently assigning them to a class.
train_df = train_df.dropna(subset=['label'])

# The majority class is whichever label is most frequent, not a hard-coded 0.
majority_label = train_df['label'].value_counts().idxmax()
majority = train_df[train_df['label'] == majority_label]
minority = train_df[train_df['label'] != majority_label]

if len(minority) > 0:
    minority_upsampled = resample(minority, replace=True, n_samples=len(majority), random_state=42)
    balanced_df = cudf.DataFrame(pd.concat([majority, minority_upsampled]))
    print(" Data has been rebalanced.")
else:
    # Only one class is present: nothing to rebalance, keep the data as it is.
    balanced_df = cudf.DataFrame(train_df)
    print(" اFor the minority class, it is empty or contains very few data.")

# Move the rebalanced features/labels back into the names the training cells use.
X_train_scaled = balanced_df.drop('label', axis=1).astype('float32')
X_train_scaled = X_train_scaled.fillna(0)
y_train = balanced_df['label']


In [None]:
# Pairwise correlation of the training features, computed on the CPU (pandas)
# and drawn as a heatmap with a diverging palette centred on zero.
correlation_matrix = X_train_scaled.to_pandas().corr()

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(correlation_matrix, cmap='coolwarm', center=0, ax=ax)
ax.set_title("Heatmap of Feature Correlation", fontsize=16)
fig.tight_layout()
plt.show()

In [None]:
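# Optional data-profiling report (disabled). Note: `ProfileReport` is not
# imported in the visible cells; import it before re-enabling these lines.
# sample_df = train_df.sample(n=5000, random_state=42)
# profile = ProfileReport(sample_df, title=" KDD99 Data Profiling Report", explorative=True)
# profile.to_file("kdd99_profiling_report.html")
# print("\n The file saved kdd99_profiling_report.html")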


In [None]:
# Candidate classifiers (all cuML implementations), keyed by display name.
models = {
    "Logistic Regression": LogisticRegression(solver='qn', max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=50),
    "KNN": KNeighborsClassifier(n_neighbors=5),
}

# Fit each candidate on the training set and report the wall-clock fit time.
for name, model in models.items():
    print(f"\n Training {name}...")
    start = time.time()
    model.fit(X_train_scaled, y_train)
    elapsed = time.time() - start
    print(f"{name} training time: {elapsed:.2f} seconds")


In [None]:
def _to_host(values):
    """Return `values` as a NumPy array when it is a cuDF Series or CuPy array.

    Anything else is returned unchanged.
    """
    if isinstance(values, cp.ndarray):
        # CuPy arrays have no `.to_numpy()`; `cp.asnumpy` copies them to the host.
        return cp.asnumpy(values)
    if isinstance(values, cudf.Series):
        return values.to_numpy()
    return values


def evaluate_model(model, name, X_test, y_test):
    """Predict on a held-out set and print the accuracy and prediction time.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    name : str
        Display name used in the printed report.
    X_test : feature matrix passed straight to ``model.predict``.
    y_test : true labels for ``X_test``.

    Returns
    -------
    The accuracy computed by ``accuracy_score`` (fraction in [0, 1]).
    """
    start = time.time()
    y_pred = model.predict(X_test)
    # Stop the clock right after predict() so the reported "prediction time"
    # excludes the host transfer and the accuracy computation.
    predict_time = time.time() - start

    acc = accuracy_score(_to_host(y_test), _to_host(y_pred))
    print(f"\n {name} Accuracy: {acc * 100:.2f}%")
    print(f" {name} Prediction time: {predict_time:.2f} seconds")
    return acc

# Score every model in `models` on the scaled held-out split; each call prints
# that model's accuracy and prediction time.
for name, model in models.items():
    evaluate_model(model, name, X_test_scaled, y_test)


In [None]:
from cuml.metrics import accuracy_score
import joblib


# Pick the model with the highest test accuracy. Starting from -inf guarantees
# that a model is always selected, even if every accuracy is 0.
best_model = None
best_score = float("-inf")
best_name = ""

for name, model in models.items():
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test.to_numpy(), y_pred.to_numpy())
    if acc > best_score:
        best_score = acc
        best_model = model
        best_name = name

# Persist the best model (per the accuracies above) together with the fitted
# scaler. The scaler file name is spelled 'scalar.joblib' because the generated
# prediction script loads it under that exact name.
joblib.dump(best_model, 'model.joblib')
joblib.dump(scaler, 'scalar.joblib')

# Export the UNSCALED test features: the generated prediction.py applies the
# saved scaler itself, so exporting already-scaled features would scale twice.
X_test.to_pandas().to_csv('X_test.csv', index=False)
y_test.to_pandas().to_csv('y_test.csv', index=False)

print(f"\n Saved best model: {best_name} with accuracy {best_score * 100:.2f}%")



In [None]:
from cuml.metrics import accuracy_score
import joblib
import pandas as pd

# The standalone prediction script is created here. It loads the saved model and
# scaler, scales X_test.csv, prints the predictions and, when y_test.csv is
# present, the accuracy.
prediction_script = """
import joblib
import pandas as pd

# حمل النماذج
model = joblib.load('model.joblib')
scaler = joblib.load('scalar.joblib')

X_test = pd.read_csv('X_test.csv')
X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

print("Predictions:")
print(y_pred)


try:
    y_test = pd.read_csv('y_test.csv')
    from sklearn.metrics import accuracy_score
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc * 100:.2f}%")
except FileNotFoundError:
    print("y_test.csv not found. Skipping accuracy.")
"""

# The script text contains non-ASCII characters and Python reads source files as
# UTF-8 by default, so write it with an explicit encoding instead of relying on
# the platform default (which can fail or produce an unreadable file).
with open("prediction.py", "w", encoding="utf-8") as f:
    f.write(prediction_script)

print(" Created prediction.py")
