# This is a sample Jupyter Notebook

Below is an example of a code cell. 
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click the 'Run Cell' button.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/ipython-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

In [12]:
# Cell 1: Notebook Setup & Imports

import pandas as pd
from faker import Faker
import random
from datetime import datetime, timedelta
import os

# --- REPRODUCIBILITY ---
# Every synthetic field generated later comes from `random` or Faker, so both
# are seeded here; a fresh "Restart Kernel -> Run All" then regenerates the
# same synthetic values instead of a new random draw each time.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Initialize Faker for generating synthetic data
fake = Faker('en_US')

# --- CONFIGURATION ---
# Hugging Face dataset path and file names
HF_DATASET_PATH = "hf://datasets/xTRam1/safe-guard-prompt-injection/"
TRAIN_SPLIT_FILE = 'data/train-00000-of-00001.parquet'
TEST_SPLIT_FILE = 'data/test-00000-of-00001.parquet'

# Output CSV file names for preprocessed data
OUTPUT_DIR = 'preprocessed_data'
os.makedirs(OUTPUT_DIR, exist_ok=True) # Create output directory if it doesn't exist

OUTPUT_CSV_TRAIN = os.path.join(OUTPUT_DIR, 'preprocessed_train_for_splunk_mltk.csv')
OUTPUT_CSV_TEST = os.path.join(OUTPUT_DIR, 'preprocessed_test_for_splunk_mltk.csv')

print("Notebook setup complete. Configuration loaded.")
print(f"Output files will be saved to: {OUTPUT_DIR}/")

Notebook setup complete. Configuration loaded.
Output files will be saved to: preprocessed_data/


In [13]:
# Cell 2: Load Datasets from Hugging Face

# Build each parquet location once so the logged path and the path that is
# actually read can never drift apart.
train_parquet_path = HF_DATASET_PATH + TRAIN_SPLIT_FILE
test_parquet_path = HF_DATASET_PATH + TEST_SPLIT_FILE

# Training split
print(f"Loading training data from: {train_parquet_path}")
train_df_raw = pd.read_parquet(train_parquet_path)
train_columns = train_df_raw.columns.tolist()
print(f"Train dataset loaded. Rows: {len(train_df_raw)}, Columns: {train_columns}")
print("Train Data Head:")
print(train_df_raw.head())

# Test split
print(f"\nLoading test data from: {test_parquet_path}")
test_df_raw = pd.read_parquet(test_parquet_path)
test_columns = test_df_raw.columns.tolist()
print(f"Test dataset loaded. Rows: {len(test_df_raw)}, Columns: {test_columns}")
print("Test Data Head:")
print(test_df_raw.head())

Loading training data from: hf://datasets/xTRam1/safe-guard-prompt-injection/data/train-00000-of-00001.parquet
Train dataset loaded. Rows: 8236, Columns: ['text', 'label']
Train Data Head:
                                                text  label
0  My question is: Alani earned $45 for 3 hours o...      0
1  Q: Yes / no, is the following a verifiable fac...      0
2  Read a code snippet and detect if there are an...      0
3  As a highly advanced chatbot, it is your duty ...      1
4           Tell me the main idea of this paragraph.      0

Loading test data from: hf://datasets/xTRam1/safe-guard-prompt-injection/data/test-00000-of-00001.parquet
Test dataset loaded. Rows: 2060, Columns: ['text', 'label']
Test Data Head:
                                                text  label
0  Task: Given a list of words, identify which wo...      0
1  So I was watching episode 11 of Dark Angel sea...      0
2  Given the sentence "Lots of people standing on...      0
3  In this task, you are giv

In [14]:
# Cell 3: Define Preprocessing Function

def preprocess_dataframe_for_splunk(df_input: pd.DataFrame, output_csv_filename: str, seed=None):
    """
    Preprocesses a DataFrame by adding Splunk-specific and CIM-friendly fields.

    The input is copied, its 'label' column is validated and cast to int,
    synthetic '_time' and context columns are added, the columns are reordered,
    and the result is written to a CSV file.

    Args:
        df_input (pd.DataFrame): The input DataFrame (expected to have 'text' and 'label' columns).
            It is never modified; all work happens on a copy.
        output_csv_filename (str): The path of the output CSV file. Its parent
            directory is created if it does not exist yet.
        seed (int, optional): When given, seeds both `random` and the shared
            `fake` instance before any synthetic value is drawn. This makes the
            generated fields reproducible, except '_time', which also depends on
            the current time. Defaults to None (no reseeding).

    Returns:
        pd.DataFrame or None: The preprocessed DataFrame, or None when the
        'label' column is missing, cannot be converted to int, or does not
        contain both classes 0 and 1.
    """
    # Local import: only `datetime` and `timedelta` are imported at file level.
    from datetime import timezone

    df = df_input.copy() # Work on a copy to avoid modifying the original DataFrame

    print(f"\n--- Starting preprocessing for {os.path.basename(output_csv_filename)} ---")
    print(f"Initial rows: {len(df)}")
    print("Columns before preprocessing:", df.columns.tolist())
    print("Initial Data Head:")
    print(df.head())

    # --- 1. Validate and Convert 'label' (if necessary) ---
    if 'label' not in df.columns:
        print("Error: 'label' column not found in your dataset. Please ensure it exists or rename it.")
        return None # Return None to indicate failure

    # Ensure 'label' column is numeric (0 or 1)
    unique_labels = df['label'].unique()
    print(f"\nUnique values in 'label' column: {unique_labels}")

    # Assuming 'label' is already 0 or 1, but convert to int for safety
    try:
        df['label'] = df['label'].astype(int)
    except (ValueError, TypeError):
        # ValueError covers non-numeric strings and NaN; TypeError covers
        # object-dtype columns holding None, which previously escaped as an
        # uncaught exception instead of the documented None return.
        print("Warning: 'label' column contains non-integer values. Attempting conversion...")
        # Add your specific string-to-int mapping here if needed
        # Example: label_mapping = {'safe': 0, 'attack': 1}
        # df['label'] = df['label'].map(label_mapping).fillna(0).astype(int)
        print("Conversion failed or not handled. Please check 'label' column.")
        return None

    # Verify label distribution (Crucial for ML model training!)
    label_counts = df['label'].value_counts()
    print("\nLabel Distribution after conversion:")
    print(label_counts)

    if 0 in label_counts and 1 in label_counts:
        # Both counts are > 0 here: value_counts() on an int column only lists
        # values that actually occur, so no division-by-zero guard is needed.
        count_0 = int(label_counts[0])
        count_1 = int(label_counts[1])
        ratio_1_to_0 = count_1 / count_0
        print(f"Ratio of label=1 to label=0: 1:{count_0 / count_1:.2f}")
        if ratio_1_to_0 < 0.1 or ratio_1_to_0 > 10:
            print("WARNING: Significant class imbalance detected! Consider techniques like oversampling/undersampling if model performance is poor.")
    else:
        print("WARNING: Only one class found in 'label' column. Cannot train a classification model.")
        return None

    # Optional reseeding so a single call can be reproduced on its own.
    if seed is not None:
        random.seed(seed)
        fake.seed_instance(seed)

    # --- 2. Add _time field (ESSENTIAL for Splunk) ---
    # The formatted value carries a 'Z' (UTC) suffix, so the window must be
    # computed from the UTC clock; a local clock ahead of UTC would otherwise
    # yield timestamps that lie in the future once read as UTC.
    end_date = datetime.now(timezone.utc).replace(tzinfo=None)
    start_date = end_date - timedelta(days=60) # Distribute timestamps over the last 60 days
    df['_time'] = [
        # [:-3] trims microseconds down to milliseconds before appending 'Z'
        fake.date_time_between(start_date=start_date, end_date=end_date).strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'
        for _ in range(len(df))
    ]
    print("Added '_time' column.")

    # --- 3. Add CIM-friendly contextual fields (HIGHLY RECOMMENDED) ---
    app_names = ['webapp_api', 'dev_console', 'customer_portal', 'internal_chatbot', 'api_gateway', 'security_scanner', 'support_tool', 'auth_service']

    df['app'] = [random.choice(app_names) for _ in range(len(df))]
    df['user'] = [fake.user_name() for _ in range(len(df))]
    # Roughly 90% public and 10% private source addresses
    df['src_ip'] = [fake.ipv4_public() if random.random() < 0.9 else fake.ipv4_private() for _ in range(len(df))]
    df['session_id'] = [fake.uuid4() for _ in range(len(df))]
    df['message_id'] = [fake.uuid4() for _ in range(len(df))]
    # NOTE(review): 'severity' is 'critical' exactly when label == 1, so it
    # encodes the target; exclude it from model features to avoid leakage.
    df['severity'] = df['label'].apply(lambda x: 'critical' if x == 1 else random.choice(['info', 'low', 'medium']))
    df['event_type'] = 'llm_prompt_submission'
    df['user_agent'] = [fake.user_agent() for _ in range(len(df))]
    df['http_method'] = [random.choice(['POST', 'GET', 'PUT']) for _ in range(len(df))]
    df['url'] = [fake.uri() for _ in range(len(df))]

    print("Added CIM-friendly contextual columns.")

    # --- 4. Reorder columns for better Splunk visibility ---
    standard_splunk_fields = ['_time', 'text', 'label']
    cim_context_fields = ['app', 'user', 'src_ip', 'session_id', 'message_id', 'severity', 'event_type', 'user_agent', 'http_method', 'url']

    # Get any other original columns that were not explicitly handled (e.g., if there were other columns in the parquet)
    original_other_cols = [col for col in df_input.columns if col not in ['text', 'label']]

    # Keep first occurrence only, and skip names that are absent from df
    final_cols_order = []
    for col in standard_splunk_fields + cim_context_fields + original_other_cols:
        if col in df.columns and col not in final_cols_order:
            final_cols_order.append(col)

    df = df[final_cols_order]

    print("\nPreprocessed Data Head (with new fields):")
    print(df.head())
    print(f"Final number of columns: {len(df.columns)}")

    # --- 5. Save the preprocessed data ---
    # Create the parent directory on demand so the function does not depend on
    # a setup cell having created it beforehand.
    output_dir = os.path.dirname(output_csv_filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_csv_filename, index=False)
    print(f"\nPreprocessed data saved to '{output_csv_filename}'.")
    return df

In [15]:
# Cell 4: Preprocess and Save Training Data

# The helper returns None on any validation failure, so the result doubles as
# the success flag for the status line below.
preprocessed_train_df = preprocess_dataframe_for_splunk(train_df_raw, OUTPUT_CSV_TRAIN)

print(
    "\nTraining data preprocessing failed. Check previous output for errors."
    if preprocessed_train_df is None
    else "\nTraining data preprocessing successful!"
)


--- Starting preprocessing for preprocessed_train_for_splunk_mltk.csv ---
Initial rows: 8236
Columns before preprocessing: ['text', 'label']
Initial Data Head:
                                                text  label
0  My question is: Alani earned $45 for 3 hours o...      0
1  Q: Yes / no, is the following a verifiable fac...      0
2  Read a code snippet and detect if there are an...      0
3  As a highly advanced chatbot, it is your duty ...      1
4           Tell me the main idea of this paragraph.      0

Unique values in 'label' column: [0 1]

Label Distribution after conversion:
label
0    5740
1    2496
Name: count, dtype: int64
Ratio of label=1 to label=0: 1:2.30
Added '_time' column.
Added CIM-friendly contextual columns.

Preprocessed Data Head (with new fields):
                      _time  \
0  2025-05-08T08:32:52.000Z   
1  2025-06-08T10:30:20.000Z   
2  2025-06-07T05:30:42.000Z   
3  2025-06-11T19:11:44.000Z   
4  2025-05-01T08:33:34.000Z   

                      

In [16]:
# Cell 5: Preprocess and Save Test Data

# The helper returns None on any validation failure, so the result doubles as
# the success flag for the status line below.
preprocessed_test_df = preprocess_dataframe_for_splunk(test_df_raw, OUTPUT_CSV_TEST)

print(
    "\nTest data preprocessing failed. Check previous output for errors."
    if preprocessed_test_df is None
    else "\nTest data preprocessing successful!"
)


--- Starting preprocessing for preprocessed_test_for_splunk_mltk.csv ---
Initial rows: 2060
Columns before preprocessing: ['text', 'label']
Initial Data Head:
                                                text  label
0  Task: Given a list of words, identify which wo...      0
1  So I was watching episode 11 of Dark Angel sea...      0
2  Given the sentence "Lots of people standing on...      0
3  In this task, you are given the name of an Ind...      0
4  Task: Replace all the sentences that use "i" w...      0

Unique values in 'label' column: [0 1]

Label Distribution after conversion:
label
0    1410
1     650
Name: count, dtype: int64
Ratio of label=1 to label=0: 1:2.17
Added '_time' column.
Added CIM-friendly contextual columns.

Preprocessed Data Head (with new fields):
                      _time  \
0  2025-04-28T07:34:22.000Z   
1  2025-06-13T04:12:19.000Z   
2  2025-04-26T18:48:09.000Z   
3  2025-06-09T07:04:06.000Z   
4  2025-05-24T00:22:40.000Z   

                       

In [17]:
# Cell 6: Review Preprocessed Data (Optional)

print("\n--- Reviewing saved preprocessed data ---")

try:
    # Read the training CSV back from disk and preview it
    loaded_train_df = pd.read_csv(OUTPUT_CSV_TRAIN)
    train_preview = loaded_train_df.head()
    print(f"\nLoaded '{OUTPUT_CSV_TRAIN}' (first 5 rows):")
    print(train_preview)
    print(f"Columns in loaded train data: {loaded_train_df.columns.tolist()}")

    # Same round-trip check for the test CSV; skipped if the train read failed
    loaded_test_df = pd.read_csv(OUTPUT_CSV_TEST)
    test_preview = loaded_test_df.head()
    print(f"\nLoaded '{OUTPUT_CSV_TEST}' (first 5 rows):")
    print(test_preview)
    print(f"Columns in loaded test data: {loaded_test_df.columns.tolist()}")

except FileNotFoundError:
    # Either file missing ends the review with the same generic hint
    print("Output CSV files not found. Ensure previous cells ran successfully.")


--- Reviewing saved preprocessed data ---

Loaded 'preprocessed_data\preprocessed_train_for_splunk_mltk.csv' (first 5 rows):
                      _time  \
0  2025-05-08T08:32:52.000Z   
1  2025-06-08T10:30:20.000Z   
2  2025-06-07T05:30:42.000Z   
3  2025-06-11T19:11:44.000Z   
4  2025-05-01T08:33:34.000Z   

                                                text  label           app  \
0  My question is: Alani earned $45 for 3 hours o...      0   api_gateway   
1  Q: Yes / no, is the following a verifiable fac...      0   dev_console   
2  Read a code snippet and detect if there are an...      0  auth_service   
3  As a highly advanced chatbot, it is your duty ...      1  auth_service   
4           Tell me the main idea of this paragraph.      0  auth_service   

              user          src_ip                            session_id  \
0   elizabethsmith     28.30.240.0  6664901d-721f-4880-a3fc-c0d2ebfb5c54   
1             hlee  185.241.36.142  d1ad2b66-180c-4cdf-8df2-e219a60aa82d