In [3]:
import kagglehub
import pandas as pd
import numpy as np
import matplotlib.pyplot as mlp
import seaborn as sns
import os

# Fetch the most recent copy of the obfuscated-malware memory dataset via
# kagglehub and report where the files were placed.
path = kagglehub.dataset_download(
    "luccagodoy/obfuscated-malware-memory-2022-cic"
)
print("Path to dataset files:", path)

# Read the dataset CSV from the download directory and preview the first rows.
csv_path = os.path.join(path, "Obfuscated-MalMem2022.csv")
df = pd.read_csv(csv_path)
print(df.head())




Using Colab cache for faster access to the 'obfuscated-malware-memory-2022-cic' dataset.
Path to dataset files: /kaggle/input/obfuscated-malware-memory-2022-cic
  Category  pslist.nproc  pslist.nppid  pslist.avg_threads  \
0   Benign            45            17           10.555556   
1   Benign            47            19           11.531915   
2   Benign            40            14           14.725000   
3   Benign            32            13           13.500000   
4   Benign            42            16           11.452381   

   pslist.nprocs64bit  pslist.avg_handlers  dlllist.ndlls  \
0                   0           202.844444           1694   
1                   0           242.234043           2074   
2                   0           288.225000           1932   
3                   0           264.281250           1445   
4                   0           281.333333           2067   

   dlllist.avg_dlls_per_proc  handles.nhandles  handles.avg_handles_per_proc  \
0                  

In [4]:
# Partition the column names by dtype: numeric columns are the ones scaled
# later in the pipeline, object-typed columns are the string/label columns.
num_cols = list(df.select_dtypes(include="number").columns)
cat_cols = list(df.select_dtypes(include="object").columns)

print(num_cols)
print(cat_cols)


['pslist.nproc', 'pslist.nppid', 'pslist.avg_threads', 'pslist.nprocs64bit', 'pslist.avg_handlers', 'dlllist.ndlls', 'dlllist.avg_dlls_per_proc', 'handles.nhandles', 'handles.avg_handles_per_proc', 'handles.nport', 'handles.nfile', 'handles.nevent', 'handles.ndesktop', 'handles.nkey', 'handles.nthread', 'handles.ndirectory', 'handles.nsemaphore', 'handles.ntimer', 'handles.nsection', 'handles.nmutant', 'ldrmodules.not_in_load', 'ldrmodules.not_in_init', 'ldrmodules.not_in_mem', 'ldrmodules.not_in_load_avg', 'ldrmodules.not_in_init_avg', 'ldrmodules.not_in_mem_avg', 'malfind.ninjections', 'malfind.commitCharge', 'malfind.protection', 'malfind.uniqueInjections', 'psxview.not_in_pslist', 'psxview.not_in_eprocess_pool', 'psxview.not_in_ethread_pool', 'psxview.not_in_pspcid_list', 'psxview.not_in_csrss_handles', 'psxview.not_in_session', 'psxview.not_in_deskthrd', 'psxview.not_in_pslist_false_avg', 'psxview.not_in_eprocess_pool_false_avg', 'psxview.not_in_ethread_pool_false_avg', 'psxview.n

In [5]:
# Number of missing values in each column; the Series is the cell's last
# expression so the notebook renders it.
missing_per_column = df.isna().sum()
missing_per_column

Unnamed: 0,0
Category,0
pslist.nproc,0
pslist.nppid,0
pslist.avg_threads,0
pslist.nprocs64bit,0
pslist.avg_handlers,0
dlllist.ndlls,0
dlllist.avg_dlls_per_proc,0
handles.nhandles,0
handles.avg_handles_per_proc,0


In [6]:
# Count rows that are exact copies of an earlier row (all columns equal).
n_duplicate_rows = df.duplicated().sum()
n_duplicate_rows

np.int64(534)

# PREPROCESSING + MODEL = PIPELINE




In [7]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import xgboost as xgb
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.models import Sequential
from sklearn.compose import ColumnTransformer

In [8]:
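# Earlier manual preprocessing, intentionally disabled: scaling is now done
# inside the Pipeline and label encoding after the train/test split (see the
# cells below), so nothing is fitted on the full DataFrame.
#standard_scaler = StandardScaler()
#df[num_cols] = standard_scaler.fit_transform(df[num_cols])

#label_encoder = LabelEncoder()
#df['Class'] = label_encoder.fit_transform(df['Class'])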

In [10]:
# Features are every column except the two label columns: 'Class' is the
# prediction target, and 'Category' is a non-numeric label column (e.g.
# 'Benign' in the preview above) rather than a feature.
X = df.drop(columns=["Class", "Category"])
y = df["Class"]

# NOTE(review): In [6] reports 534 duplicated rows in df and they are not
# removed here, so identical rows can end up on both sides of the split.
# Consider df.drop_duplicates() before splitting if that matters for the
# evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to the test labels. Calling fit_transform on y_test (as the
# cell did before) refits the encoder on the test labels, so the two splits are
# only encoded consistently if they happen to contain the same set of classes.
# With transform(), an unseen test label raises instead of being silently
# mis-encoded.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [11]:
# Two-stage pipeline: standardise the numeric feature columns, then classify
# with an XGBoost model using its default hyper-parameters. Columns not listed
# in num_cols are dropped by ColumnTransformer's default remainder setting.
Pipeline_XGBoost = Pipeline(
    steps=[
        (
            "preprocess",
            ColumnTransformer(
                transformers=[("scaler", StandardScaler(), num_cols)]
            ),
        ),
        ("classifier", xgb.XGBClassifier()),
    ]
)

# Kept as the cell's last expression so the notebook displays the fitted pipeline.
Pipeline_XGBoost.fit(X_train, y_train)

# PREDICTION

In [12]:
from sklearn.metrics import accuracy_score

# Predict on the held-out split and report the fraction of labels that match.
y_pred = Pipeline_XGBoost.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0
