In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [10]:
from urllib.request import urlretrieve
import os

# --- Fetch -----------------------------------------------------------------
# Both files are required below, so check each one individually: a previous
# run may have fetched the measurements but not the labels.
SECOM_BASE_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/secom/'
SECOM_FILES = ('secom.data', 'secom_labels.data')

missing_files = [name for name in SECOM_FILES if not os.path.exists(name)]
if missing_files:
    print("Downloading SECOM dataset...")
    for name in missing_files:
        urlretrieve(SECOM_BASE_URL + name, name)

# --- Load ------------------------------------------------------------------
# Files are whitespace-delimited with no header row. The separator is a regex,
# so it is written as a raw string to avoid the invalid "\s" escape warning.
data = pd.read_csv('secom.data', sep=r'\s+', header=None)
labels = pd.read_csv('secom_labels.data', sep=r'\s+', header=None)

print(f"Data shape: {data.shape}")
print(f"Labels shape: {labels.shape}")

# --- Drop uninformative columns ---------------------------------------------
# nunique() ignores NaN, so a column holding a single repeated value counts 1
# and an entirely-missing column counts 0. Both carry no signal, and an
# all-NaN column has no median to impute with, so remove both kinds.
constant_mask = data.nunique() <= 1
constant_cols = data.columns[constant_mask]
print(f"Constant columns to remove: {len(constant_cols)}")

if len(constant_cols) > 0:
    data = data.drop(columns=constant_cols)

print(f"Shape after removal: {data.shape}")

# --- Impute ----------------------------------------------------------------
# Fill each column's missing entries with that column's median; fillna aligns
# the Series of medians to the frame by column label.
print(f"NaN values before imputation: {data.isna().sum().sum()}")
data = data.fillna(data.median())
print(f"NaN values after imputation: {data.isna().sum().sum()}")

# --- Standardize -----------------------------------------------------------
# fit_transform returns a bare ndarray; rebuild the frame with the original
# column labels and row index so rows stay aligned with `labels`.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)
data_scaled = pd.DataFrame(data_scaled, columns=data.columns, index=data.index)

print(f"Final data shape: {data_scaled.shape}")
print(f"Data range - Min: {data_scaled.min().min():.4f}, Max: {data_scaled.max().max():.4f}")

Data shape: (1567, 590)
Labels shape: (1567, 2)
Constant columns to remove: 116
Shape after removal: (1567, 474)
NaN values before imputation: 41136
NaN values after imputation: 0
Final data shape: (1567, 474)
Data range - Min: -37.9235, Max: 39.5727
