In [1]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import pickle



In [None]:

def unzip_file(zip_path, extract_path=None):
    """
    Extracts all files from a zip archive.

    Failures are reported on stdout instead of being raised, so the
    notebook keeps running past a missing or corrupt archive.

    Args:
        zip_path (str): The path to the zip file.
        extract_path (str, optional): The directory to extract the files to.
            If None (the default), files are extracted to the current
            working directory.

    Returns:
        None
    """
    # Used for the success message only; ZipFile.extractall itself treats
    # None as "current working directory".
    destination = '.' if extract_path is None else extract_path
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
        print(f"Successfully extracted '{zip_path}' to '{destination}'")
    except FileNotFoundError:
        print(f"Error: Zip file '{zip_path}' not found.")
    except zipfile.BadZipFile:
        print(f"Error: '{zip_path}' is not a valid zip file.")
    except Exception as e:
        # Deliberate best-effort: anything else (e.g. permission problems)
        # is reported rather than propagated.
        print(f"An error occurred: {e}")
        
# Archive path and destination folder; both are relative, so they resolve
# against the working directory the notebook is run from.
zip_file_path = '../Dataset/alarms_log_data.zip'
extract_location = 'dataset'
# unzip_file prints its outcome (success or error) instead of raising.
unzip_file(zip_file_path, extract_location)

Error: Zip file './Dataset/alarms_log_data.zip' not found.


In [None]:
# Load the alarm log CSV from under the extraction folder.
# NOTE(review): assumes the archive unpacks to alarms_log_data/raw/alarms.csv — confirm.
df = pd.read_csv("dataset/alarms_log_data/raw/alarms.csv")
# Bare expression: the notebook renders the frame as the cell output.
df

In [None]:

# Tally the null entries per column and print them under a heading
missing_values = df.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")


In [None]:
# Print the frame summary: columns, non-null counts, dtypes and memory usage
df.info()

In [None]:
# Number of rows per distinct 'alarm' value, most frequent first
df['alarm'].value_counts()

In [None]:
# Number of rows per distinct 'serial' value, most frequent first
df['serial'].value_counts()

In [None]:
# Change timestamp to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d %H:%M:%S.%f')
df.head()


In [None]:
# Order the rows chronologically
df = df.sort_values(by='timestamp')
# Round down to the nearest second. The lowercase 's' alias is used because
# the uppercase 'S' alias is deprecated since pandas 2.2 (FutureWarning).
df['timestamp'] = df['timestamp'].dt.floor('s')
df.head()

In [None]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old
# index instead of keeping it as a column.
df = df.reset_index(drop=True)
df.head()

# Feature Engineering

In [None]:
# Extract date-based features
df['date'] = df['timestamp'].dt.date
df['time'] = df['timestamp'].dt.time
df['hour'] = df['timestamp'].dt.hour
df['minute'] = df['timestamp'].dt.minute
df['second'] = df['timestamp'].dt.second
df['day_of_week'] = df['timestamp'].dt.day_name()
df['day_of_year'] = df['timestamp'].dt.dayofyear
df['week_of_year'] = df['timestamp'].dt.isocalendar().week
df['month'] = df['timestamp'].dt.month
df['year'] = df['timestamp'].dt.year
df['quarter'] = df['timestamp'].dt.quarter

In [None]:
# Preview the first rows, now including the date-based feature columns
df.head()

In [None]:
# Encode weekend & weekday
df['is_weekend'] = df['day_of_week'].isin(['Saturday', 'Sunday']).astype(int)
df['is_weekday'] = df['day_of_week'].isin(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']).astype(int)

# Time-of-day encoding
df['is_morning'] = df['hour'].between(5, 11).astype(int)
df['is_afternoon'] = df['hour'].between(12, 17).astype(int)
df['is_evening'] = df['hour'].between(18, 21).astype(int)
df['is_night'] = ((df['hour'] >= 22) | (df['hour'] <= 4)).astype(int)

In [None]:
# Preview the first rows, now including the weekend/weekday and time-of-day flags
df.head()

In [None]:
# Summary statistics (count, mean, spread, quantiles) for the columns
# describe() includes by default
df.describe()