# Initial Exploration of the parsed BGL logs

In [4]:
# Set display option to show the full length of the column
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

## Raw Data

In [5]:
import os
import gzip

# Determine the base directory based on the effective user ID
if os.geteuid() == 0:  # root user
    base_dir = '/root'
else:
    base_dir = '/home/sagemaker-user'

# Set the environment variable
os.environ['BASE_DIR'] = base_dir

# Verify the base directory
print(f"Base directory set to: {base_dir}")

# Define the file path
file_path = os.path.join(base_dir, '11.Data/01.BGL/01.Raw_CFDR/bgl2.gz')

# Load and read the first 10 lines of the data
with gzip.open(file_path, 'rt') as f:
    lines = [next(f).strip() for _ in range(10)]

# Print the first 10 lines to understand the structure
for i, line in enumerate(lines, start=1):
    print(f"Line {i}: {line}")


Base directory set to: /root
Line 1: - 1117838570 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.50.363779 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected
Line 2: - 1117838570 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.50.527847 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected
Line 3: - 1117838570 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.50.675872 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected
Line 4: - 1117838570 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.50.823719 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected
Line 5: - 1117838570 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.50.982731 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected
Line 6: - 1117838571 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.51.131467 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected
Line 7: - 1117838571 2005.06.03 R02

## Template File

In [6]:
import os

# Determine the base directory based on the effective user ID
if os.geteuid() == 0:  # root user
    base_dir = '/root'
else:
    base_dir = '/home/sagemaker-user'

# Set the environment variable
os.environ['BASE_DIR'] = base_dir

# Verify the base directory
print(f"Base directory set to: {base_dir}")

# Define the file path for BGL_templates.csv
template_file_path = os.path.join(base_dir, '08.GIT_Repos_REF/02.ait-aecid/templates/BGL_templates.csv')

# Load and read the first 10 lines of the template file
with open(template_file_path, 'r') as f:
    lines = [next(f).strip() for _ in range(10)]

# Print the first 10 lines to understand the structure
for i, line in enumerate(lines, start=1):
    print(f"Line {i}: {line}")

Base directory set to: /root
Line 1: <*>:<*> <*>:<*> <*>:<*> <*>:<*>
Line 2: <*> <*> <*> BGLERR_IDO_PKT_TIMEOUT connection lost to node/link/service card
Line 3: <*> correctable errors exceeds threshold (iar <*> lr <*>
Line 4: <*> ddr error(s) detected and corrected on rank <*> symbol <*> over <*> seconds
Line 5: <*> ddr errors(s) detected and corrected on rank <*> symbol <*>, bit <*>
Line 6: <*> double-hummer alignment exceptions
Line 7: <*> exited abnormally due to signal: Aborted
Line 8: <*> exited normally with exit code <*>
Line 9: <*> floating point alignment exceptions
Line 10: <*> L3 <*> error(s) (dcr <*> detected and corrected over <*> seconds


In [7]:
import os
import pandas as pd

# Determine the base directory based on the effective user ID
if os.geteuid() == 0:  # root user
    base_dir = '/root'
else:
    base_dir = '/home/sagemaker-user'

# Set the environment variable
os.environ['BASE_DIR'] = base_dir

# Verify the base directory
print(f"Base directory set to: {base_dir}")

# Define the file path for BGL_templates.csv
template_file_path = os.path.join(base_dir, '08.GIT_Repos_REF/02.ait-aecid/templates/BGL_templates.csv')

# Read the file line by line into a list
with open(template_file_path, 'r') as file:
    lines = file.readlines()

# Create a DataFrame from the list
BGL_templates_df = pd.DataFrame(lines, columns=['template'])

# Display the first few lines of the DataFrame to verify the content
print(BGL_templates_df.head())


Base directory set to: /root
                                                                            template
0                                                  <*>:<*> <*>:<*> <*>:<*> <*>:<*>\n
1     <*> <*> <*> BGLERR_IDO_PKT_TIMEOUT connection lost to node/link/service card\n
2                         <*> correctable errors exceeds threshold (iar <*> lr <*>\n
3  <*> ddr error(s) detected and corrected on rank <*> symbol <*> over <*> seconds\n
4         <*> ddr errors(s) detected and corrected on rank <*> symbol <*>, bit <*>\n


In [19]:
# Set display option to show the full length of the column
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)


BGL_templates_df.head(200)

Unnamed: 0,template
0,<*>:<*> <*>:<*> <*>:<*> <*>:<*>\n
1,<*> <*> <*> BGLERR_IDO_PKT_TIMEOUT connection lost to node/link/service card\n
2,<*> correctable errors exceeds threshold (iar <*> lr <*>\n
3,<*> ddr error(s) detected and corrected on rank <*> symbol <*> over <*> seconds\n
4,"<*> ddr errors(s) detected and corrected on rank <*> symbol <*>, bit <*>\n"
5,<*> double-hummer alignment exceptions\n
6,<*> exited abnormally due to signal: Aborted\n
7,<*> exited normally with exit code <*>\n
8,<*> floating point alignment exceptions\n
9,<*> L3 <*> error(s) (dcr <*> detected and corrected over <*> seconds\n


## Parsed Data

In [8]:
import pandas as pd

# Define the file path
# parsed_file_path = '/home/sagemaker-user/11.Data/01.BGL/03.Parsed_CFDR/parsed.csv'
parsed_file_path = '/root/11.Data/01.BGL/03.Parsed_CFDR/parsed.csv'

# Load the data into a DataFrame
df = pd.read_csv(parsed_file_path, sep=';')

# Display basic information about the DataFrame
print("Basic Information:")
print(df.info())

Basic Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4747963 entries, 0 to 4747962
Data columns (total 6 columns):
 #   Column      Dtype  
---  ------      -----  
 0   id          int64  
 1   event_type  int64  
 2   seq_id      object 
 3   time        float64
 4   label       object 
 5   eventlabel  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 217.3+ MB
None


In [9]:
# Display the first few lines of the DataFrame
print("\nFirst few lines:")
print(df.head())


First few lines:
   id  event_type               seq_id          time    label eventlabel
0   1         189  R02-M1-N0-C:J12-U11  1.117813e+09  Anomaly     Normal
1   2         189  R02-M1-N0-C:J12-U11  1.117813e+09  Anomaly     Normal
2   3         189  R02-M1-N0-C:J12-U11  1.117813e+09  Anomaly     Normal
3   4         189  R02-M1-N0-C:J12-U11  1.117813e+09  Anomaly     Normal
4   5         189  R02-M1-N0-C:J12-U11  1.117813e+09  Anomaly     Normal


In [10]:
df.head(20)

Unnamed: 0,id,event_type,seq_id,time,label,eventlabel
0,1,189,R02-M1-N0-C:J12-U11,1117813000.0,Anomaly,Normal
1,2,189,R02-M1-N0-C:J12-U11,1117813000.0,Anomaly,Normal
2,3,189,R02-M1-N0-C:J12-U11,1117813000.0,Anomaly,Normal
3,4,189,R02-M1-N0-C:J12-U11,1117813000.0,Anomaly,Normal
4,5,189,R02-M1-N0-C:J12-U11,1117813000.0,Anomaly,Normal
5,6,189,R02-M1-N0-C:J12-U11,1117813000.0,Anomaly,Normal
6,7,189,R02-M1-N0-C:J12-U11,1117813000.0,Anomaly,Normal
7,8,189,R02-M1-N0-C:J12-U11,1117813000.0,Anomaly,Normal
8,9,189,R02-M1-N0-C:J12-U11,1117813000.0,Anomaly,Normal
9,10,189,R02-M1-N0-C:J12-U11,1117813000.0,Anomaly,Normal


In [11]:
# Display summary statistics
print("\nSummary Statistics:")
print(df.describe())


Summary Statistics:
                 id    event_type          time
count  4.747963e+06  4.747963e+06  4.747963e+06
mean   2.373982e+06  1.271027e+02  1.123975e+09
std    1.370619e+06  8.594136e+01  5.226260e+06
min    1.000000e+00  1.000000e+00  1.117813e+09
25%    1.186992e+06  3.800000e+01  1.120121e+09
50%    2.373982e+06  1.680000e+02  1.121574e+09
75%    3.560972e+06  1.680000e+02  1.128723e+09
max    4.747963e+06  4.010000e+02  1.136362e+09


In [12]:
# Count unique values in each column
print("\nUnique Values Count:")
print(df.nunique())


Unique Values Count:
id            4747963
event_type        394
seq_id          69251
time          4747955
label               2
eventlabel         44
dtype: int64
