# Data Preparation

Code authored by: Shaw Talebi

### imports

In [1]:
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd

### load data

In [2]:
# fetch the raw HDFS_v1 log dataset from the Hugging Face Hub
ds = load_dataset(path="logfit-project/HDFS_v1")

In [3]:
# display the loaded DatasetDict (available splits, feature names, row counts)
ds

DatasetDict({
    train: Dataset({
        features: ['line_number', 'date', 'time', 'pid', 'level', 'component', 'content', 'block_id', 'anomaly'],
        num_rows: 11175629
    })
})

### transform to block level

In [4]:
# materialise the 'train' split as a pandas DataFrame so it can be reshaped with pandas
df = ds["train"].to_pandas()
print("Original shape: {}".format(df.shape))
df.head()

Original shape: (11175629, 9)


Unnamed: 0,line_number,date,time,pid,level,component,content,block_id,anomaly
0,1,81109,203518,143,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,blk_-1608999687919862906,0
1,2,81109,203518,35,INFO,dfs.FSNamesystem,BLOCK* NameSystem.allocateBlock: /mnt/hadoop/m...,blk_-1608999687919862906,0
2,3,81109,203519,143,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,blk_-1608999687919862906,0
3,4,81109,203519,145,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,blk_-1608999687919862906,0
4,5,81109,203519,145,INFO,dfs.DataNode$PacketResponder,PacketResponder 1 for block blk_-1608999687919...,blk_-1608999687919862906,0


In [5]:
# Build one string per log line, "<LEVEL> <COMPONENT>: <CONTENT>", then order the
# rows by block and by line_number so each block's lines are joined in log order.
df = (
    df.assign(formatted=df['level'] + ' ' + df['component'] + ': ' + df['content'])
      .sort_values(['block_id', 'line_number'])
)

# Collapse to one row per block: newline-joined text, and a label that is the
# maximum of the 'anomaly' values among the block's lines.
per_block = df.groupby('block_id')
df_blocks = per_block.agg(
    text=('formatted', '\n'.join),
    label=('anomaly', 'max'),
).reset_index()

print(f"Block-level dataset shape: {df_blocks.shape}")
print("\nLabel distribution:")
print(df_blocks['label'].value_counts())
df_blocks.head()

Block-level dataset shape: (575061, 3)

Label distribution:
label
0    558223
1     16838
Name: count, dtype: int64


Unnamed: 0,block_id,text,label
0,blk_-1000002529962039464,INFO dfs.DataNode$DataXceiver: Receiving block...,0
1,blk_-100000266894974466,INFO dfs.FSNamesystem: BLOCK* NameSystem.alloc...,0
2,blk_-1000007292892887521,INFO dfs.DataNode$DataXceiver: Receiving block...,0
3,blk_-1000014584150379967,INFO dfs.DataNode$DataXceiver: Receiving block...,0
4,blk_-1000028658773048709,INFO dfs.DataNode$DataXceiver: Receiving block...,0


### see example

In [6]:
# inspect the first aggregated block
sample_block = df_blocks.iloc[0]
sample_text = sample_block['text']
line_count = len(sample_text.split('\n'))

print(f"Block ID: {sample_block['block_id']}")
print(f"Label: {sample_block['label']}")
print(f"\nConcatenated text ({line_count} lines):")

# cap the preview at 2000 characters and mark the truncation with an ellipsis
if len(sample_text) > 2000:
    print(sample_text[:2000] + "...")
else:
    print(sample_text)

Block ID: blk_-1000002529962039464
Label: 0

Concatenated text (13 lines):
INFO dfs.DataNode$DataXceiver: Receiving block blk_-1000002529962039464 src: /10.251.123.1:41333 dest: /10.251.123.1:50010
INFO dfs.DataNode$DataXceiver: Receiving block blk_-1000002529962039464 src: /10.251.123.1:53174 dest: /10.251.123.1:50010
INFO dfs.DataNode$DataXceiver: Receiving block blk_-1000002529962039464 src: /10.251.202.181:32980 dest: /10.251.202.181:50010
INFO dfs.FSNamesystem: BLOCK* NameSystem.allocateBlock: /user/root/rand8/_temporary/_task_200811101024_0015_m_001261_0/part-01261. blk_-1000002529962039464
INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-1000002529962039464 terminating
INFO dfs.DataNode$PacketResponder: Received block blk_-1000002529962039464 of size 3553241 from /10.251.123.1
INFO dfs.DataNode$PacketResponder: PacketResponder 0 for block blk_-1000002529962039464 terminating
INFO dfs.DataNode$PacketResponder: Received block blk_-1000002529962039464 of size 355

### create train-dev-test split

In [7]:
# Stratified 80/10/10 split using pandas
def stratified_split(df, train_frac=0.8, dev_frac=0.1, seed=42):
    """Split `df` into train/dev/test DataFrames, stratified on its 'label' column.

    Every label group is shuffled and cut at the same cumulative fractions, so
    each split keeps (up to rounding) the label proportions of `df`. Whatever is
    left after the train and dev cuts becomes the test split. Each returned split
    is shuffled once more so that its rows are not grouped by label.

    Args:
        df: DataFrame containing a 'label' column.
        train_frac: fraction of each label group assigned to the train split.
        dev_frac: fraction of each label group assigned to the dev split.
        seed: random_state used for every shuffle, so the split is reproducible.

    Returns:
        Tuple (train, dev, test) of DataFrames; rows keep their original index values.

    Raises:
        ValueError: if a fraction lies outside [0, 1] or the two sum to more than 1.
    """
    if not (0 <= train_frac <= 1 and 0 <= dev_frac <= 1):
        raise ValueError("train_frac and dev_frac must each be between 0 and 1")
    if train_frac + dev_frac > 1 + 1e-9:
        raise ValueError("train_frac + dev_frac must not exceed 1")

    def _cut(n_rows, frac):
        # Round before truncating: e.g. 0.7 + 0.1 evaluates to 0.7999999999999999,
        # and a bare int() would then silently drop one row from the dev split.
        return min(n_rows, int(round(n_rows * frac, 6)))

    def _combine(parts):
        # Skip empty slices and tolerate "no parts at all" (pd.concat([]) raises).
        parts = [part for part in parts if len(part)]
        if not parts:
            return df.iloc[0:0]
        combined = pd.concat(parts)
        # Interleave the labels; concat alone leaves the split sorted by class.
        if len(combined) > 1:
            combined = combined.sample(frac=1, random_state=seed)
        return combined

    train_parts, dev_parts, test_parts = [], [], []

    # sort=False keeps label groups in order of first appearance
    for _, group in df.groupby('label', sort=False):
        group = group.sample(frac=1, random_state=seed)
        n = len(group)
        train_end = _cut(n, train_frac)
        dev_end = _cut(n, train_frac + dev_frac)

        train_parts.append(group.iloc[:train_end])
        dev_parts.append(group.iloc[train_end:dev_end])
        test_parts.append(group.iloc[dev_end:])

    return _combine(train_parts), _combine(dev_parts), _combine(test_parts)

train_df, dev_df, test_df = stratified_split(df_blocks)

# (display name, frame) pairs so both reports below share one loop shape
named_splits = [("Train", train_df), ("Dev", dev_df), ("Test", test_df)]

# size of each split, absolute and as a percentage of all blocks
for name, part in named_splits:
    header = f"{name}:"
    print(f"{header:<6} {len(part)} ({len(part)/len(df_blocks)*100:.1f}%)")

# per-split counts of normal (label 0) and anomalous (label 1) blocks
print("\nLabel distribution:")
for name, part in named_splits:
    normal_count = (part['label']==0).sum()
    anomaly_count = (part['label']==1).sum()
    print(f"{name:<5} - Normal: {normal_count}, Anomaly: {anomaly_count}")

Train: 460048 (80.0%)
Dev:   57506 (10.0%)
Test:  57507 (10.0%)

Label distribution:
Train - Normal: 446578, Anomaly: 13470
Dev   - Normal: 55822, Anomaly: 1684
Test  - Normal: 55823, Anomaly: 1684


### push to HF hub

In [8]:
# Wrap each pandas split in a HuggingFace Dataset (the pandas index is not kept as a column)
split_frames = {'train': train_df, 'dev': dev_df, 'test': test_df}
dataset_dict = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False)
    for split_name, frame in split_frames.items()
})

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['block_id', 'text', 'label'],
        num_rows: 460048
    })
    dev: Dataset({
        features: ['block_id', 'text', 'label'],
        num_rows: 57506
    })
    test: Dataset({
        features: ['block_id', 'text', 'label'],
        num_rows: 57507
    })
})

In [9]:
# Publish all splits to the HuggingFace Hub.
# Requires prior authentication: huggingface-cli login
dataset_dict.push_to_hub(repo_id="shawhin/HDFS_v1_blocks")

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/datasets/shawhin/HDFS_v1_blocks/commit/fb0d0c9ee4dffe4455d2d0b0abe311e7739cd18f', commit_message='Upload dataset', commit_description='', oid='fb0d0c9ee4dffe4455d2d0b0abe311e7739cd18f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/shawhin/HDFS_v1_blocks', endpoint='https://huggingface.co', repo_type='dataset', repo_id='shawhin/HDFS_v1_blocks'), pr_revision=None, pr_num=None)