In [11]:
from google.colab import files
uploaded = files.upload()

Saving short_main.dat to short_main.dat
Saving big_main.dat to big_main (1).dat


# Load the datasets

In [16]:
import pandas as pd

# load small dataset
print("Short dataset:\n")
short_df = pd.read_csv("short_main.dat", sep=" ", header=None, names=["timestamp", "active_power", "apparent_power", "mains_voltage"])
print(short_df)
print("Short dataset shape:", short_df.shape)
print("\n")

# load big data
print("Big dataset:\n")
big_df = pd.read_csv("big_main.dat", sep=" ", header=None, names=["timestamp", "active_power", "apparent_power", "mains_voltage"])
print(big_df)
print("Big dataset shape:", big_df.shape)


Short dataset:

          timestamp  active_power  apparent_power  mains_voltage
0      1.404059e+09        702.57          805.69         243.69
1      1.404059e+09        697.15          803.02         243.61
2      1.404059e+09        689.04          797.65         243.62
3      1.404059e+09        693.41          801.12         243.71
4      1.404059e+09        701.05          803.49         243.75
...             ...           ...             ...            ...
50701  1.404110e+09       1435.88         1524.99         239.34
50702  1.404110e+09       1434.17         1523.25         239.41
50703  1.404110e+09       2111.41         2281.27         239.21
50704  1.404110e+09       1571.57         1704.69         239.31
50705  1.404110e+09       1431.11         1520.67         239.23

[50706 rows x 4 columns]
Short dataset shape: (50706, 4)


Big dataset:

           timestamp  active_power  apparent_power  mains_voltage
0       1.404059e+09        702.57          805.69         243.6

# Clean the data

In [17]:
# Remove missing values (NaN)
short_df = short_df.dropna()
big_df = big_df.dropna()

# Remove duplicate rows
short_df = short_df.drop_duplicates()
big_df = big_df.drop_duplicates()

print("Short dataset cleaned:", short_df.shape)
print("Big dataset cleaned:", big_df.shape)


Short dataset cleaned: (50706, 4)
Big dataset cleaned: (522117, 4)


# Normalize values

In [18]:
def normalize(series):
    return (series - series.min()) / (series.max() - series.min())

# Apply normalization column by column
short_df["active_power_norm"]   = normalize(short_df["active_power"])
short_df["apparent_power_norm"] = normalize(short_df["apparent_power"])
short_df["mains_voltage_norm"]  = normalize(short_df["mains_voltage"])

big_df["active_power_norm"]   = normalize(big_df["active_power"])
big_df["apparent_power_norm"] = normalize(big_df["apparent_power"])
big_df["mains_voltage_norm"]  = normalize(big_df["mains_voltage"])

print(short_df[["active_power_norm", "apparent_power_norm", "mains_voltage_norm"]])
print(big_df[["active_power_norm", "apparent_power_norm", "mains_voltage_norm"]])


       active_power_norm  apparent_power_norm  mains_voltage_norm
0               0.104577             0.088793            0.779817
1               0.103051             0.088015            0.770642
2               0.100767             0.086452            0.771789
3               0.101998             0.087462            0.782110
4               0.104149             0.088152            0.786697
...                  ...                  ...                 ...
50701           0.311065             0.298170            0.280963
50702           0.310583             0.297664            0.288991
50703           0.501283             0.518312            0.266055
50704           0.349273             0.350478            0.277523
50705           0.309722             0.296913            0.268349

[50706 rows x 3 columns]
        active_power_norm  apparent_power_norm  mains_voltage_norm
0                0.095454             0.105530            0.866271
1                0.094434             0.105027  

# Filter the signal

In [20]:
from scipy.signal import butter, filtfilt

# Low-pass filter to remove high-frequency noise
def lowpass_filter(data, cutoff=50, fs=16000, order=5):
    nyq = 0.5 * fs
    normal_cutoff = cutoff / nyq
    b, a = butter(order, normal_cutoff, btype='low', analog=False)
    return filtfilt(b, a, data)

# Apply filter to normalized active power
short_df["filtered_active_power"] = lowpass_filter(short_df["active_power_norm"])
big_df["filtered_active_power"] = lowpass_filter(big_df["active_power_norm"])

short_df["filtered_apparent_power"] = lowpass_filter(short_df["apparent_power_norm"])
big_df["filtered_apparent_power"] = lowpass_filter(big_df["apparent_power_norm"])

short_df["filtered_mains_voltage"] = lowpass_filter(short_df["mains_voltage_norm"])
big_df["filtered_mains_voltage"] = lowpass_filter(big_df["mains_voltage_norm"])

print(short_df[["filtered_active_power", "filtered_apparent_power", "filtered_mains_voltage"]])
print(big_df[["filtered_active_power", "filtered_apparent_power", "filtered_mains_voltage"]])

       filtered_active_power  filtered_apparent_power  filtered_mains_voltage
0                   0.167526                 0.153661                0.750772
1                   0.168909                 0.155073                0.750421
2                   0.170296                 0.156490                0.750066
3                   0.171686                 0.157910                0.749706
4                   0.173081                 0.159334                0.749342
...                      ...                      ...                     ...
50701               0.139830                 0.130285                0.277293
50702               0.139830                 0.130285                0.277293
50703               0.139830                 0.130285                0.277293
50704               0.139830                 0.130285                0.277293
50705               0.139830                 0.130286                0.277293

[50706 rows x 3 columns]
        filtered_active_power  filtere

# Segment into windows



In [28]:
import numpy as np

# Function to split data into fixed-size windows
def segment_windows(series, window_size=100):
    total_samples = len(series)
    num_windows = total_samples // window_size   # integer division
    return series.values[:num_windows*window_size].reshape(num_windows, window_size)


# Segment both datasets into 100-sample windows
short_active_windows   = segment_windows(short_df["filtered_active_power"])
big_active_windows     = segment_windows(big_df["filtered_active_power"])

short_apparent_windows = segment_windows(short_df["filtered_apparent_power"])
big_apparent_windows   = segment_windows(big_df["filtered_apparent_power"])

short_voltage_windows  = segment_windows(short_df["filtered_mains_voltage"])
big_voltage_windows    = segment_windows(big_df["filtered_mains_voltage"])

print("First Active Power Window:\n", short_active_windows)
print("First Apparent Power Window:\n", short_apparent_windows)
print("First Voltage Window:\n", short_voltage_windows)

print("Active power windows:", short_active_windows.shape, big_active_windows.shape)
print("Apparent power windows:", short_apparent_windows.shape, big_apparent_windows.shape)
print("Voltage windows:", short_voltage_windows.shape, big_voltage_windows.shape)



First Active Power Window:
 [[0.16752604 0.16890894 0.17029581 ... 0.28770047 0.28850211 0.28929103]
 [0.2900672  0.29083059 0.29158118 ... 0.31239806 0.31223632 0.31207031]
 [0.31190016 0.31172595 0.3115478  ... 0.28868437 0.28847065 0.28825858]
 ...
 [0.07526465 0.07531891 0.07537601 ... 0.09852966 0.09894147 0.09935501]
 [0.09977017 0.10018688 0.10060503 ... 0.13412244 0.13431268 0.13449876]
 [0.13468072 0.13485856 0.13503233 ... 0.13982878 0.1398291  0.13982936]]
First Apparent Power Window:
 [[0.15366064 0.15507322 0.15648981 ... 0.27610878 0.2769209  0.27771995]
 [0.27850591 0.27927875 0.28003846 ... 0.30052105 0.30034912 0.30017293]
 [0.29999261 0.29980827 0.29962001 ... 0.27662174 0.27642105 0.27622239]
 ...
 [0.067455   0.06750449 0.06755666 ... 0.0897641  0.09016445 0.09056656]
 [0.09097035 0.09137574 0.09178263 ... 0.12463872 0.12482654 0.12501027]
 [0.12518995 0.12536559 0.12553721 ... 0.13028417 0.13028448 0.13028474]]
First Voltage Window:
 [[0.75077151 0.75042091 0.75006

# Feature extraction

In [36]:
import numpy as np

# Fonction d'extraction : mean, median, variance
def extract_features(window):
    mean = float(np.mean(window))
    median = float(np.median(window))
    var = float(np.var(window))
    return mean, median, var

# Appliquer à toutes les fenêtres pour un signal
def extract_all_features(windows, label):
    features = []
    for i, w in enumerate(windows):
        mean, median, var = extract_features(w)
        features.append({
            "window": i,
            "signal": label,
            "mean": mean,
            "median": median,
            "variance": var
        })
    return features

# Initialiser les listes
short_features_Active_Power   = []
short_features_Apparent_Power = []
short_features_Voltage        = []

big_features_Active_Power     = []
big_features_Apparent_Power   = []
big_features_Voltage          = []

# Features pour le dataset court
short_features_Active_Power   += extract_all_features(short_active_windows, "Active Power")
short_features_Apparent_Power += extract_all_features(short_apparent_windows, "Apparent Power")
short_features_Voltage        += extract_all_features(short_voltage_windows, "Voltage")

# Features pour le dataset long
big_features_Active_Power     += extract_all_features(big_active_windows, "Active Power")
big_features_Apparent_Power   += extract_all_features(big_apparent_windows, "Apparent Power")
big_features_Voltage          += extract_all_features(big_voltage_windows, "Voltage")

# Exemple : afficher les 5 premières lignes de chaque
print(short_features_Active_Power)
print(short_features_Apparent_Power)
print(short_features_Voltage)

print(big_features_Active_Power)
print(big_features_Apparent_Power)
print(big_features_Voltage)


[{'window': 0, 'signal': 'Active Power', 'mean': 0.2335900865906784, 'median': 0.23637441503441092, 'variance': 0.0013279905845116754}, {'window': 1, 'signal': 'Active Power', 'mean': 0.30888808784765875, 'median': 0.31244676678778166, 'variance': 4.879091686097245e-05}, {'window': 2, 'signal': 'Active Power', 'mean': 0.30026897259784563, 'median': 0.3003178850163846, 'variance': 5.1257834357087415e-05}, {'window': 3, 'signal': 'Active Power', 'mean': 0.28018539307618817, 'median': 0.2796844838967387, 'variance': 1.5804582303096065e-05}, {'window': 4, 'signal': 'Active Power', 'mean': 0.2707850505216769, 'median': 0.2704615717715353, 'variance': 2.7104772406357988e-06}, {'window': 5, 'signal': 'Active Power', 'mean': 0.26668955120222004, 'median': 0.2670046393264498, 'variance': 1.968877464649264e-06}, {'window': 6, 'signal': 'Active Power', 'mean': 0.2593316580148506, 'median': 0.2592105547924829, 'variance': 5.967532281196659e-06}, {'window': 7, 'signal': 'Active Power', 'mean': 0.25