In [1]:
import numpy as np
import pandas as pd
import pywt
from scipy import signal
#from PyEMD import EMD
from sklearn.decomposition import PCA, FastICA
import librosa
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime
from python_speech_features import mfcc

In [2]:
def load_and_analyze_data(file_path):
    """Load a CSV file and print a quick diagnostic report.

    The report covers the frame's shape, ``DataFrame.info()``, per-column
    missing-value counts, ``describe()`` statistics, per-column counts of
    infinite values, and the first rows.

    Args:
        file_path: Path of the CSV file, passed to ``pd.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``, unmodified.
    """
    print(f"Loading data from {file_path}...")
    data = pd.read_csv(file_path)
    print(f"Data loaded successfully. Shape: {data.shape}")

    # Display basic information about the dataset.
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    print("\nDataset Info:")
    data.info()

    # Check for missing values
    print("\nMissing Values:")
    print(data.isnull().sum())

    # Display basic statistics
    print("\nBasic Statistics:")
    print(data.describe())

    # Check for infinite values. The first column is skipped, as before, and
    # the check is limited to numeric columns because np.isinf raises
    # TypeError on object/string columns.
    print("\nInfinite Values:")
    numeric_features = data.iloc[:, 1:].select_dtypes(include=[np.number])
    print(np.isinf(numeric_features).sum())

    # Display first few rows
    print("\nFirst Few Rows:")
    print(data.head())

    return data

def clean_data(data, output_path='cleaned_dataset.csv'):
    """Drop rows containing NaN or infinite values and save the result as CSV.

    Args:
        data: Input ``pandas.DataFrame``. It is not modified; filtering
            produces new frames.
        output_path: Destination CSV path. The file is written without the
            index column.

    Returns:
        The filtered ``pandas.DataFrame``. Its index keeps the labels of the
        surviving rows (it is not reset).
    """
    original_shape = data.shape

    # Remove rows with NaN values
    data = data.dropna()

    # Remove rows with infinite values. The first column is skipped, as
    # before, and only numeric columns are inspected because np.isinf raises
    # TypeError on object/string columns.
    numeric_features = data.iloc[:, 1:].select_dtypes(include=[np.number])
    has_infinite = np.isinf(numeric_features).any(axis=1)
    data = data[~has_infinite]

    cleaned_shape = data.shape

    print("\nData Cleaning:")
    print(f"Original shape: {original_shape}")
    print(f"Cleaned shape: {cleaned_shape}")
    print(f"Rows removed: {original_shape[0] - cleaned_shape[0]}")

    # Save the cleaned dataset
    data.to_csv(output_path, index=False)
    print(f"Cleaned dataset saved to {output_path}")

    return data

def visualize_data_distribution(data):
    """Plot a histogram with KDE for every column after the first and save it.

    The subplot grid has two columns and as many rows as needed, so any
    number of feature columns can be drawn (a fixed 2x2 grid would raise
    IndexError beyond four). Panels without a feature are hidden. The figure
    is written to ``data_distribution.png`` in the working directory and then
    closed.

    Args:
        data: ``pandas.DataFrame`` whose columns from index 1 onward are
            plotted.
    """
    feature_columns = list(data.columns[1:])
    n_cols = 2
    # Ceiling division; keep at least one row so plt.subplots stays valid
    # even when there are no feature columns.
    n_rows = max(1, (len(feature_columns) + n_cols - 1) // n_cols)

    # squeeze=False keeps `axes` two-dimensional for every grid size.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    fig.suptitle("Data Distribution")

    # Row-major flattening matches the previous axes[i // 2, i % 2] placement.
    flat_axes = axes.ravel()
    for ax, column in zip(flat_axes, feature_columns):
        sns.histplot(data[column], kde=True, ax=ax)
        ax.set_title(column)

    # Hide any panel that did not receive a feature.
    for ax in flat_axes[len(feature_columns):]:
        ax.set_visible(False)

    fig.tight_layout()
    fig.savefig("data_distribution.png")
    plt.close(fig)
    print("Data distribution visualization saved as 'data_distribution.png'")

def main():
    """Produce the cleaned dataset (or reuse it) and plot its distributions.

    If ``cleaned_dataset.csv`` already exists it is read directly; otherwise
    the raw CSV is loaded, analysed, cleaned and saved under that name.
    """
    raw_csv = '../dataset/dataset.csv'
    cleaned_csv = 'cleaned_dataset.csv'

    if not os.path.exists(cleaned_csv):
        # No cached result yet: run the full load -> analyse -> clean path.
        frame = load_and_analyze_data(raw_csv)
        frame = clean_data(frame, cleaned_csv)
    else:
        print(f"Loading existing cleaned dataset from {cleaned_csv}")
        frame = pd.read_csv(cleaned_csv)

    # Visualize whichever frame was obtained above.
    visualize_data_distribution(frame)

# Run the cleaning pipeline when this code executes as the top-level module
# (which is the case for a notebook cell or a directly executed script).
if __name__ == "__main__":
    main()

Loading existing cleaned dataset from cleaned_dataset.csv
Data distribution visualization saved as 'data_distribution.png'


In [3]:

# Transform settings shared by the apply_* functions below; several of them
# are also written to the 'Transform_Parameters' sheet of the output workbook.
DWT_PARAMS = dict(wavelet='db4', level=3)      # read by apply_dwt
STFT_PARAMS = dict(nperseg=256, noverlap=128)  # read by apply_stft
EMD_PARAMS = dict(max_imf=10)                  # read by apply_emd
PCA_PARAMS = dict(n_components=2)              # read by apply_pca
ICA_PARAMS = dict(n_components=2)              # read by apply_ica
WPT_PARAMS = dict(wavelet='db4', level=3)      # read by apply_wpt
MFCC_PARAMS = dict(n_mfcc=13)                  # read by apply_mfcc

def load_data(file_path):
    """Read a CSV file into a DataFrame and report its shape.

    Args:
        file_path: Path of the CSV file, passed to ``pd.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    frame = pd.read_csv(file_path)
    dimensions = frame.shape
    print(f"Data loaded successfully. Shape: {dimensions}")
    return frame

def preprocess_data(data):
    """Standardise every column after the first to zero mean and unit std.

    The first column is carried over unchanged. Feature columns are converted
    to float before scaling, so integer inputs are handled without writing
    floats into integer columns. A column whose standard deviation is zero or
    undefined (constant column, single row) is only centred: its divisor is
    replaced by 1 so the result is 0 rather than NaN or inf.

    Args:
        data: Input ``pandas.DataFrame``. It is NOT modified; a new frame is
            returned, so callers must use the return value.

    Returns:
        A new ``pandas.DataFrame`` with the same columns, column order and
        index as ``data``.
    """
    first_column = data.iloc[:, :1]
    features = data.iloc[:, 1:].astype(float)

    center = features.mean()
    spread = features.std()
    # Keep strictly positive deviations; replace 0 and NaN by 1 so the
    # division below cannot produce NaN or inf for degenerate columns.
    spread = spread.where(spread > 0, 1.0)

    normalized = (features - center) / spread
    result = pd.concat([first_column, normalized], axis=1)

    print("Data preprocessing completed.")
    return result

def apply_dwt(data):
    """Run a multilevel discrete wavelet decomposition on each feature column.

    Every column after the first is decomposed with ``pywt.wavedec`` using
    the wavelet and level from ``DWT_PARAMS``; the coefficient arrays of one
    column are concatenated into a single vector.

    Args:
        data: ``pandas.DataFrame``; column 0 is skipped.

    Returns:
        ``numpy.ndarray`` holding one column of concatenated coefficients per
        input feature column.
    """
    wavelet_name = DWT_PARAMS['wavelet']
    depth = DWT_PARAMS['level']
    per_column = [
        np.concatenate(pywt.wavedec(data[name], wavelet_name, level=depth))
        for name in data.columns[1:]
    ]
    print("DWT applied.")
    return np.array(per_column).T

def apply_stft(data):
    """Compute flattened STFT magnitudes for each feature column.

    Every column after the first is passed to ``scipy.signal.stft`` with the
    segment length and overlap from ``STFT_PARAMS``; the magnitude of the
    complex result is flattened to one vector per column.

    Args:
        data: ``pandas.DataFrame``; column 0 is skipped.

    Returns:
        ``numpy.ndarray`` holding one column of flattened magnitudes per
        input feature column.
    """
    segment_length = STFT_PARAMS['nperseg']
    overlap = STFT_PARAMS['noverlap']
    magnitudes = []
    for name in data.columns[1:]:
        # Only the complex spectrogram is used; the frequency and time axes
        # returned alongside it are discarded.
        _, _, spectrum = signal.stft(data[name], nperseg=segment_length, noverlap=overlap)
        magnitudes.append(np.abs(spectrum).flatten())
    print("STFT applied.")
    return np.array(magnitudes).T

def apply_emd(data):
    """Apply Empirical Mode Decomposition to each feature column.

    ``EMD`` is imported lazily inside the function because the notebook's
    top-level ``from PyEMD import EMD`` is commented out; without a local
    import any call would fail with ``NameError``. Each column after the
    first is decomposed with at most ``EMD_PARAMS['max_imf']`` IMFs and the
    resulting IMF array is flattened to one vector.

    Args:
        data: ``pandas.DataFrame``; column 0 is skipped.

    Returns:
        ``numpy.ndarray`` holding one column of flattened IMFs per input
        feature column.

    Raises:
        ImportError: If the optional PyEMD package is not installed.
    """
    try:
        from PyEMD import EMD
    except ImportError as exc:
        raise ImportError(
            "apply_emd requires the optional PyEMD package (pip install EMD-signal)."
        ) from exc

    transformed_data = []
    emd = EMD()
    for col in data.columns[1:]:
        # Hand over a plain NumPy array so positional indexing inside the
        # decomposition cannot be affected by the Series' index labels.
        imfs = emd(data[col].to_numpy(), max_imf=EMD_PARAMS['max_imf'])
        transformed_data.append(imfs.flatten())
    print("EMD applied.")
    return np.array(transformed_data).T

def apply_pca(data):
    """Project the feature columns onto their principal components.

    Args:
        data: ``pandas.DataFrame``; column 0 is skipped and the remaining
            columns form the input matrix.

    Returns:
        The array returned by ``PCA.fit_transform``, with
        ``PCA_PARAMS['n_components']`` components requested.
    """
    reducer = PCA(n_components=PCA_PARAMS['n_components'])
    feature_matrix = data.iloc[:, 1:]
    scores = reducer.fit_transform(feature_matrix)
    print("PCA applied.")
    return scores

def apply_ica(data):
    """Apply FastICA to the feature columns with a reproducible seed.

    FastICA starts from a random initialisation, so an unseeded estimator
    gives different results on every run. The estimator is therefore built
    with ``random_state=0`` unless ``ICA_PARAMS`` supplies its own
    ``random_state``; every key in ``ICA_PARAMS`` is forwarded to the
    ``FastICA`` constructor.

    Args:
        data: ``pandas.DataFrame``; column 0 is skipped and the remaining
            columns form the input matrix.

    Returns:
        The array returned by ``FastICA.fit_transform``.
    """
    # Defaults first, so any matching key in ICA_PARAMS overrides them.
    ica_kwargs = {'random_state': 0, **ICA_PARAMS}
    ica = FastICA(**ica_kwargs)
    transformed_data = ica.fit_transform(data.iloc[:, 1:])
    print("ICA applied.")
    return transformed_data

def apply_wpt(data):
    """Collect wavelet-packet coefficients for each feature column.

    For every column after the first, a ``pywt.WaveletPacket`` is built with
    the wavelet and maximum level from ``WPT_PARAMS``; the data of all nodes
    at that level (requested in 'natural' order) is concatenated into one
    vector.

    Args:
        data: ``pandas.DataFrame``; column 0 is skipped.

    Returns:
        ``numpy.ndarray`` holding one column of concatenated node data per
        input feature column.
    """
    wavelet_name = WPT_PARAMS['wavelet']
    depth = WPT_PARAMS['level']
    per_column = []
    for name in data.columns[1:]:
        packet = pywt.WaveletPacket(data[name], wavelet=wavelet_name, maxlevel=depth)
        level_nodes = packet.get_level(depth, 'natural')
        per_column.append(np.concatenate([leaf.data for leaf in level_nodes]))
    print("WPT applied.")
    return np.array(per_column).T

def apply_s_transform(data):
    """Compute flattened STFT magnitudes under the "S-transform" label.

    Despite its name, this function does not compute a Stockwell transform:
    it calls ``scipy.signal.stft`` with a fixed ``nperseg=256`` (all other
    arguments left at their defaults) on every column after the first and
    flattens the magnitude of the result.

    Args:
        data: ``pandas.DataFrame``; column 0 is skipped.

    Returns:
        ``numpy.ndarray`` holding one column of flattened magnitudes per
        input feature column.
    """
    magnitudes = [
        # Index 2 of the stft result is the complex spectrogram.
        np.abs(signal.stft(data[name], nperseg=256)[2]).flatten()
        for name in data.columns[1:]
    ]
    print("S-transform applied.")
    return np.array(magnitudes).T

def apply_mfcc(data):
    """Compute flattened MFCCs for each feature column.

    Every column after the first is passed as a NumPy array to
    ``python_speech_features.mfcc`` with ``samplerate=1000`` and
    ``numcep=MFCC_PARAMS['n_mfcc']``; the result is flattened to one vector
    per column.

    Args:
        data: ``pandas.DataFrame``; column 0 is skipped.

    Returns:
        ``numpy.ndarray`` holding one column of flattened coefficients per
        input feature column.
    """
    cepstra = []
    for name in data.columns[1:]:
        series_values = data[name].values
        coefficients = mfcc(series_values, samplerate=1000, numcep=MFCC_PARAMS['n_mfcc'])
        cepstra.append(coefficients.flatten())
    print("MFCC applied.")
    return np.array(cepstra).T

def extract_features(data):
    """Summarise each feature column with six statistics.

    For every column after the first, the statistics are computed in this
    order: ``np.mean``, ``np.std``, ``skew``, ``kurtosis``, ``np.max``,
    ``np.min``.

    Args:
        data: ``pandas.DataFrame``; column 0 is skipped.

    Returns:
        ``numpy.ndarray`` with one row per statistic (in the order above) and
        one column per input feature column.
    """
    # Order matters: it defines the row order of the returned array.
    summarizers = (np.mean, np.std, skew, kurtosis, np.max, np.min)
    table = [
        [summarize(data[name]) for summarize in summarizers]
        for name in data.columns[1:]
    ]
    print("Statistical features extracted.")
    return np.array(table).T

def save_all_transformed_data(original_data, transformed_data_dict, output_file):
    """Write the original data and all transform outputs to one Excel workbook.

    The workbook gets two sheets: 'Combined_Data' (the original columns
    followed by one block of ``<method>_<i>`` columns per transform) and
    'Transform_Parameters' (the parameter dictionaries as strings).

    Blocks are placed side by side by row position. The original frame's
    index is reset first so that a non-default index cannot misalign it
    against the freshly built transform blocks. Blocks with fewer rows than
    the longest one are padded with NaN by ``pd.concat``.

    Args:
        original_data: ``pandas.DataFrame`` with the untransformed data. It
            is not modified.
        transformed_data_dict: Mapping of method name to array-like output.
            2-D inputs contribute one column per array column; a 1-D input
            is treated as a single column.
        output_file: Path of the ``.xlsx`` file to create (written with the
            openpyxl engine).
    """
    blocks = [original_data.reset_index(drop=True)]
    for method, values in transformed_data_dict.items():
        values = np.asarray(values)
        if values.ndim == 1:
            # A flat vector has no shape[1]; store it as one column.
            values = values.reshape(-1, 1)
        column_names = [f"{method}_{i}" for i in range(values.shape[1])]
        blocks.append(pd.DataFrame(values, columns=column_names))

    # Concatenate once instead of growing the frame inside the loop, which
    # would copy the accumulated data on every iteration.
    combined_data = pd.concat(blocks, axis=1)

    # Save combined data to a single sheet
    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        combined_data.to_excel(writer, sheet_name='Combined_Data', index=False)

        # Save parameters
        params_df = pd.DataFrame([
            {'Transform': 'DWT', 'Parameters': str(DWT_PARAMS)},
            {'Transform': 'STFT', 'Parameters': str(STFT_PARAMS)},
            {'Transform': 'PCA', 'Parameters': str(PCA_PARAMS)},
            {'Transform': 'ICA', 'Parameters': str(ICA_PARAMS)},
            {'Transform': 'WPT', 'Parameters': str(WPT_PARAMS)},
            {'Transform': 'MFCC', 'Parameters': str(MFCC_PARAMS)}
        ])
        params_df.to_excel(writer, sheet_name='Transform_Parameters', index=False)

    print(f"All transformed data saved to {output_file}")


def main():
    """Load the cleaned dataset, run every transform and save the results.

    The transforms run in the order listed below; their outputs are written,
    together with the preprocessed data, to a timestamped
    ``transformed_data_<YYYYmmdd_HHMMSS>.xlsx`` workbook.
    """
    data = preprocess_data(load_data('cleaned_dataset.csv'))

    # (label, function) pairs; the order fixes both execution order and the
    # column order in the output workbook.
    pipeline = (
        ('DWT', apply_dwt),
        ('STFT', apply_stft),
        ('PCA', apply_pca),
        ('ICA', apply_ica),
        ('WPT', apply_wpt),
        ('S_Transform', apply_s_transform),
        ('MFCC', apply_mfcc),
        # ('Statistical', extract_features),
    )
    transformed_data = {label: transform(data) for label, transform in pipeline}

    # Timestamp the filename so repeated runs do not overwrite each other.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = "transformed_data_" + stamp + ".xlsx"

    save_all_transformed_data(data, transformed_data, output_file)

    print("All transformations applied and saved successfully.")

# Run the transform pipeline when this code executes as the top-level module
# (which is the case for a notebook cell or a directly executed script).
if __name__ == "__main__":
    main()

Data loaded successfully. Shape: (479232, 4)
Data preprocessing completed.
DWT applied.
STFT applied.
PCA applied.
ICA applied.
WPT applied.
S-transform applied.
MFCC applied.
Statistical features extracted.
All transformed data saved to transformed_data_20240804_170250.xlsx
All transformations applied and saved successfully.
