In [5]:
# Step 1: Import necessary libraries and load the data from the ZIP file
import pandas as pd
import zipfile

zip_path = r"/content/archive (2).zip"
csv_file_name = "cybersecurity_intrusion_data.csv"

with zipfile.ZipFile(zip_path, 'r') as z:
    with z.open(csv_file_name) as f:
        data = pd.read_csv(f)


In [6]:
# Step 2: Display basic information about the dataset
print(data.info())
print(data.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9537 entries, 0 to 9536
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   session_id           9537 non-null   object 
 1   network_packet_size  9537 non-null   int64  
 2   protocol_type        9537 non-null   object 
 3   login_attempts       9537 non-null   int64  
 4   session_duration     9537 non-null   float64
 5   encryption_used      7571 non-null   object 
 6   ip_reputation_score  9537 non-null   float64
 7   failed_logins        9537 non-null   int64  
 8   browser_type         9537 non-null   object 
 9   unusual_time_access  9537 non-null   int64  
 10  attack_detected      9537 non-null   int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 819.7+ KB
None
  session_id  network_packet_size protocol_type  login_attempts  \
0  SID_00001                  599           TCP               4   
1  SID_00002                  472       

In [7]:
# Step 3: Check for missing values and handle them
print(data.isnull().sum())
data = data.dropna()  # Drop rows with missing values (optional)


session_id                0
network_packet_size       0
protocol_type             0
login_attempts            0
session_duration          0
encryption_used        1966
ip_reputation_score       0
failed_logins             0
browser_type              0
unusual_time_access       0
attack_detected           0
dtype: int64


In [9]:
print(data.columns)


Index(['session_id', 'network_packet_size', 'protocol_type', 'login_attempts',
       'session_duration', 'encryption_used', 'ip_reputation_score',
       'failed_logins', 'browser_type', 'unusual_time_access',
       'attack_detected'],
      dtype='object')


In [10]:
# Encode the target column and any other categorical features
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
encoder = LabelEncoder()

# Encode the target column
data['attack_detected'] = encoder.fit_transform(data['attack_detected'])

# Encode other categorical columns if needed (e.g., 'protocol_type', 'browser_type')
categorical_cols = ['protocol_type', 'browser_type']
for col in categorical_cols:
    data[col] = encoder.fit_transform(data[col])


In [11]:
# Handle missing values by filling them with the mean or mode
data.fillna(data.mean(numeric_only=True), inplace=True)
data.fillna(data.mode().iloc[0], inplace=True)


In [12]:
# Separate features (X) and target (y)
X = data.drop(columns=['attack_detected'])
y = data['attack_detected']


In [13]:
# Separate features (X) and target (y)
X = data.drop(columns=['attack_detected'])
y = data['attack_detected']


In [14]:
# Split into train and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [16]:
# Separate numeric and categorical columns
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# One-Hot Encode categorical columns (if not already encoded)
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Split into train and test sets again after encoding
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale only the numeric columns
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])


In [17]:
# Train a Random Forest model
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


In [18]:
# Evaluate the model on test data
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)

# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Classification report
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8912852112676056
Classification Report:
               precision    recall  f1-score   support

           0       0.84      1.00      0.91      1260
           1       1.00      0.76      0.86      1012

    accuracy                           0.89      2272
   macro avg       0.92      0.88      0.89      2272
weighted avg       0.91      0.89      0.89      2272

