In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer

# Step 1: Load the dataset
df = pd.read_csv(r"C:\Users\The Best\Desktop\Projects\Microsoft Cyber\Raw data\GUIDE_Test.csv", low_memory=False)

# Step 2: Handling Missing Data
# Check for missing values
missing_data = df.isnull().sum()
print("Missing values in each column:\n", missing_data)

# Strategy 1: Remove columns/rows with high missing data if they are unimportant
threshold = 0.4  # If more than 40% of a column is missing, drop it
df = df[df.columns[df.isnull().mean() < threshold]]

# Strategy 2: Impute missing values for numerical columns with mean/median
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
imputer = SimpleImputer(strategy='mean')  # You can also use 'median' or 'most_frequent'
df[numerical_columns] = imputer.fit_transform(df[numerical_columns])

# Strategy 3: Impute missing values for categorical columns with the mode
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Step 3: Feature Engineering
# Example: Extracting date-based features from a timestamp column
df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')  # Convert to datetime
df['Hour'] = df['Timestamp'].dt.hour  # Extract hour
df['DayOfWeek'] = df['Timestamp'].dt.dayofweek  # Extract day of the week
df['Month'] = df['Timestamp'].dt.month  # Extract month

# Step 4: Encoding Categorical Variables
# Option 1: Label Encoding for ordinal or binary categorical variables
label_encoder = LabelEncoder()
df['IncidentGrade'] = label_encoder.fit_transform(df['IncidentGrade'])

# Option 2: One-Hot Encoding for nominal categorical variables
df = pd.get_dummies(df, columns=['Category', 'Usage'], drop_first=True)

# Step 5: Normalization (optional)
# You may want to normalize numerical columns to a common scale
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Step 6: Save the preprocessed dataset
df.to_csv(r"C:\Users\The Best\Desktop\Projects\Microsoft Cyber\Documents\GUIDE_Test_Preprocessed.csv", index=False)

print("Preprocessing completed. Preprocessed data saved to CSV.")



Missing values in each column:
 Id                          0
OrgId                       0
IncidentId                  0
AlertId                     0
Timestamp                   0
DetectorId                  0
AlertTitle                  0
Category                    0
MitreTechniques       2307104
IncidentGrade               0
ActionGrouped         4146079
ActionGranular        4146079
EntityType                  0
EvidenceRole                0
DeviceId                    0
Sha256                      0
IpAddress                   0
Url                         0
AccountSid                  0
AccountUpn                  0
AccountObjectId             0
AccountName                 0
DeviceName                  0
NetworkMessageId            0
EmailClusterId        4106285
RegistryKey                 0
RegistryValueName           0
RegistryValueData           0
ApplicationId               0
ApplicationName             0
OAuthApplicationId          0
ThreatFamily          4116614
FileName

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


Preprocessing completed. Preprocessed data saved to CSV.
