In [None]:
import os
import zipfile
import pandas as pd
from google.colab import drive

# **Data Loading**

## **Read train_data**

In [None]:
def read_csv_files_from_zip(zip_file_path):
    """Load every CSV member of a ZIP archive stored on Google Drive.

    Mounts Google Drive at '/content/drive', opens the archive found at
    '/content/drive/MyDrive/' + zip_file_path and parses each member whose
    name ends with '.csv' straight from the archive (nothing is extracted
    to disk).

    Args:
        zip_file_path: Archive location relative to the Drive root
            ('/content/drive/MyDrive/').

    Returns:
        A list of (DataFrame, member name) tuples, in archive order. The
        list is empty when the archive holds no '.csv' members.
    """
    # Mount Google Drive
    drive.mount('/content/drive')

    # ZIP file path in Google Drive
    zip_file_path_drive = '/content/drive/MyDrive/' + zip_file_path

    # Collected (DataFrame, file name) tuples
    dfs = []

    with zipfile.ZipFile(zip_file_path_drive, 'r') as zip_ref:
        for file_name in zip_ref.namelist():
            # namelist() also reports directory entries (e.g. 'train_data/')
            # and any non-CSV members; feeding those to read_csv either
            # raises or produces a meaningless frame, so skip them.
            if not file_name.endswith('.csv'):
                continue

            with zip_ref.open(file_name) as file:
                dfs.append((pd.read_csv(file), file_name))

    return dfs

In [None]:
# Location of the training archive, relative to the Drive root:
# read_csv_files_from_zip prepends '/content/drive/MyDrive/' itself.
zip_file_path_drive = 'DataSource/train_data.zip'

# Load the archive into memory as a list of (DataFrame, file name) tuples.
dataframes = read_csv_files_from_zip(zip_file_path_drive)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)


In [None]:
# Preview the first loaded file: its archive member name and first rows.
# Raises IndexError if nothing was loaded (dataframes is empty).
first_df, first_file_name = dataframes[0]
print(f"First CSV File Name: {first_file_name}")
print("Summary of the First CSV File:")
print(first_df.head())
# NOTE: this is the number of loaded files (one tuple per file), not a row count.
print("Length of train data: ", len(dataframes))

First CSV File Name: train_data/00033604-f1a5-417c-8928-20dcf9515220.csv
Summary of the First CSV File:
   SYSCALL_timestamp SYSCALL_arch SYSCALL_syscall SYSCALL_success  \
0                  0      aarch64            mmap             yes   
1                  0      aarch64           clone             yes   
2                  0      aarch64          munmap             yes   
3                  0      aarch64           close             yes   
4                  0      aarch64          execve             yes   

   SYSCALL_exit PROCESS_comm        PROCESS_exe  \
0  5.481085e+11      apache2  /usr/sbin/apache2   
1  6.184840e+05      apache2  /usr/sbin/apache2   
2  0.000000e+00      apache2  /usr/sbin/apache2   
3  0.000000e+00      apache2  /usr/sbin/apache2   
4  0.000000e+00           sh      /usr/bin/dash   

                         PROCESS_PATH CUSTOM_openFiles CUSTOM_libs  ...  \
0                    >systemd>apache2               []          []  ...   
1                    >sy

## **Read attack_log_files**

In [None]:
def read_attack_files(attack_files_path):
    """Return the lines of a text file stored on Google Drive.

    Google Drive is mounted at '/content/drive' and the file is looked up
    at '/content/drive/MyDrive/' + attack_files_path. In this notebook the
    file lists the identifiers of the training logs that contain attacks,
    one per line.

    Args:
        attack_files_path: File location relative to the Drive root.

    Returns:
        A list with one string per line, line terminators removed. Blank
        lines are kept as empty strings.
    """
    drive.mount('/content/drive')

    listing_path = '/content/drive/MyDrive/' + attack_files_path

    with open(listing_path, 'r') as handle:
        # Read everything at once and let splitlines() strip the terminators.
        return handle.read().splitlines()

In [None]:
# Text file listing the training logs that contain attacks, given relative
# to the Drive root (read_attack_files prepends '/content/drive/MyDrive/').
attack_files_path = 'DataSource/train_files_containing_attacks.txt'

# Read the list into memory, one entry per line of the file.
attack_files = read_attack_files(attack_files_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Show the attack-file identifiers read above and how many there are.
print("Attack log files:")
print(attack_files)
print("Length of attack files: ", len(attack_files))

Attack log files:
['7e1cd5bf-cbe6-4fe6-a058-d7d6a48cc7a7', '5504d7c4-ae52-42c8-a11f-52c30a6f31af', '85763477-aeeb-4172-b992-c1546ea79df0', '8f03a607-3be1-4672-8e3b-360b67bc98f8', '731be796-6aea-450e-b27b-5b393ae59454', 'fabdc53f-1693-4b94-9307-630dc7637a13', '4ceaea10-740b-4c4f-9fd2-8ac993aa761d', 'dbf60467-8bc9-4e54-a8bb-968564695033', '08794406-035b-45f9-a6d2-6a06c4437f1f', 'a65a1a8a-8542-447c-adf7-6dbebf88827d', '7d999e9a-a22c-49f5-8561-21a197cf0141', '735776a6-d894-4750-9eef-dad852a5c18d', 'de2b8d5e-831f-4db2-af20-91b60cbc5f16', '3545d349-90e7-4f85-85c1-b6060de70628', '677f055c-a236-42be-bd75-3780f6973901', 'a3c9059e-d8c1-4927-8dcb-a31df6c2ee7c', '2c310eb0-92d3-4a36-b75d-116bde0539f6', 'b3f12643-07c5-4236-9223-c22c651a5f5d', '60e0a86d-1854-43fe-8ad6-f1399b2dee5f', '08fd3453-7996-455c-81d4-b5849012b843', '78f3e76e-95ae-4507-bee3-50e5600267e3', '380103e1-1dbc-475e-ab15-f92c8c163402', '21d471f0-3e5a-43f3-b4e9-763d8db0bab0', '24d274f1-27e7-4b14-bf2a-ec6b30f471c4', '287e7ea3-726f-4fcf-8

# **Data Cleaning**

In [None]:
def clean_csv_file(file_path):
    """Read one syscall-log CSV from disk and return a cleaned DataFrame.

    Steps, in order:
      1. Drop the columns that are not used downstream.
      2. Drop rows that still contain a missing value.
      3. Convert 'SYSCALL_timestamp' to datetime.
      4. One-hot encode the categorical columns.

    Unused columns are removed BEFORE rows with missing values are dropped.
    Doing it the other way round discards every row that has a NaN in a
    column which is about to be thrown away anyway.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A new DataFrame. The original row index is preserved (it is not
        reset after rows are dropped). Because the encoding is done per
        file, the dummy columns depend on the values present in that file.

    Raises:
        KeyError: If 'SYSCALL_timestamp' or one of the categorical columns
            is absent from the file.
    """
    irrelevant_columns = ['PROCESS_PATH', 'CUSTOM_openFiles', 'CUSTOM_libs', 'CUSTOM_openSockets',
                          'USER_ACTION_op', 'USER_ACTION_src', 'USER_ACTION_res', 'USER_ACTION_addr',
                          'PROCESS_name', 'KILL_process', 'KILL_uid']
    categorical_columns = ['SYSCALL_arch', 'SYSCALL_syscall', 'SYSCALL_success', 'PROCESS_comm', 'PROCESS_exe']

    # Let pandas open and close the file itself; no handle needs to stay
    # open while the frame is being transformed.
    df = pd.read_csv(file_path)

    # A column that is meant to be discarded being absent is harmless,
    # hence errors='ignore'.
    df = df.drop(columns=irrelevant_columns, errors='ignore')

    # Handle missing values in the columns that are actually kept.
    df = df.dropna()

    # Data type conversion (assuming the timestamp is expressed in seconds).
    df = df.assign(SYSCALL_timestamp=pd.to_datetime(df['SYSCALL_timestamp'], unit='s'))

    # Encode categorical variables.
    return pd.get_dummies(df, columns=categorical_columns)

In [None]:
def process_csv_files_from_zip(zip_file_path, attack_files_path):
    """Clean every CSV in a Drive-hosted ZIP, then print the attack-file list.

    Each '.csv' member is extracted under '/content', passed to
    clean_csv_file and deleted again, so only one extracted file exists on
    disk at a time. The cleaned frame is currently not used or returned;
    the per-file processing step is still a placeholder.

    Args:
        zip_file_path: Archive location relative to '/content/drive/MyDrive/'.
        attack_files_path: Attack-list text file, relative to the same root.

    Returns:
        None. The attack-file list is printed to stdout.
    """
    # Mount Google Drive
    drive.mount('/content/drive')

    # ZIP file path in Google Drive
    zip_file_path_drive = '/content/drive/MyDrive/' + zip_file_path

    with zipfile.ZipFile(zip_file_path_drive, 'r') as zip_ref:
        for file_name in zip_ref.namelist():
            # Only CSV members are processed (this also skips directory entries).
            if not file_name.endswith('.csv'):
                continue

            # extract() returns the path it actually wrote, which can differ
            # from a naive join because member names are normalised.
            file_path = zip_ref.extract(file_name, '/content')
            try:
                cleaned_df = clean_csv_file(file_path)

                # Process the cleaned data (placeholder)
                # Your processing logic here
            finally:
                # Remove the temporary file even when cleaning fails or the
                # run is interrupted, so stale extracts do not pile up.
                os.remove(file_path)

    # Read attack log files
    attack_files = read_attack_files(attack_files_path)
    print("Attack log files:")
    print(attack_files)

In [None]:
# Inputs for the cleaning run, both relative to the Drive root
# ('/content/drive/MyDrive/'): the training archive and the text file
# listing the logs that contain attacks.
zip_file_path_drive = 'DataSource/train_data.zip'
attack_files_path = 'DataSource/train_files_containing_attacks.txt'

In [None]:
# Clean every CSV in the archive, one file at a time, then print the attack list.
# The function returns nothing, so no cleaned data is kept after this cell.
process_csv_files_from_zip(zip_file_path_drive, attack_files_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  df = pd.read_csv(file)
  df = pd.read_csv(file)


KeyboardInterrupt: 