In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from datetime import datetime, timedelta

In [2]:
def preprocess_data(df):
    """Return a copy of ``df`` whose 'Timestamp' column holds Unix epoch seconds.

    The input frame is left untouched; callers must use the returned frame
    (as ``main`` does with ``df = preprocess_data(df)``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Timestamp' column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with 'Timestamp' replaced by whole seconds since
        1970-01-01 (floored).
    """
    out = df.copy()
    stamps = pd.to_datetime(out['Timestamp'])

    # Timezone-aware values are moved to UTC and made naive so they can be
    # subtracted from the (naive) epoch below.
    if stamps.dt.tz is not None:
        stamps = stamps.dt.tz_convert('UTC').dt.tz_localize(None)

    # Subtracting the epoch and floor-dividing by one second does not depend on
    # the storage resolution of the datetime column, unlike casting to int64
    # and dividing by 10**9 (which is only right for nanosecond storage).
    out['Timestamp'] = (stamps - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
    return out

In [3]:
def random_sampler(df, target_size=10000):
    """Resample ``df`` by drawing ``target_size`` rows with replacement.

    Row positions come from ``np.random.choice`` (global NumPy random state),
    so the same row may appear several times. The result carries a fresh
    0..target_size-1 index.
    """
    row_count = len(df)
    picked_positions = np.random.choice(row_count, size=target_size, replace=True)
    resampled = df.iloc[picked_positions]
    return resampled.reset_index(drop=True)

In [4]:
def gaussian_noise_sampling(df, target_size=10000, noise_factor=0.05, seed=None):
    """Resample rows of ``df`` and perturb them with multiplicative Gaussian noise.

    Each output row is a randomly chosen source row (with replacement) in
    which every column except 'Timestamp' is multiplied by ``1 + e`` with
    ``e ~ Normal(0, noise_factor)``. 'Timestamp' is copied unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows; all columns other than 'Timestamp' must be convertible
        to float.
    target_size : int
        Number of rows to generate. Values <= 0 give an empty frame.
    noise_factor : float
        Standard deviation of the relative noise.
    seed : int or None
        When given, draws come from a private ``np.random.RandomState(seed)``;
        when None the global ``np.random`` state is used, so
        ``np.random.seed`` still controls the output.

    Returns
    -------
    pandas.DataFrame
        ``target_size`` rows, same columns as ``df``, index 0..target_size-1.

    Raises
    ------
    ValueError
        If ``df`` has no rows and ``target_size`` is positive.
    """
    if target_size <= 0:
        return pd.DataFrame(columns=df.columns)
    if len(df) == 0:
        raise ValueError("gaussian_noise_sampling requires a non-empty DataFrame")

    rng = np.random if seed is None else np.random.RandomState(seed)

    # Draw every row position and every noise term in one call each instead of
    # building 10k pandas Series one element at a time.
    picks = rng.randint(0, len(df), size=target_size)
    sampled = df.iloc[picks]
    noisy_cols = [col for col in df.columns if col != 'Timestamp']
    noise = rng.normal(0, noise_factor, size=(target_size, len(noisy_cols)))

    # Plain arrays drop the (duplicated) source row labels, so the result gets
    # a clean RangeIndex like random_sampler's output.
    data = {
        col: sampled[col].astype(float).to_numpy() * (1.0 + noise[:, j])
        for j, col in enumerate(noisy_cols)
    }
    if 'Timestamp' in df.columns:
        data['Timestamp'] = sampled['Timestamp'].to_numpy()

    # columns= restores the source column order.
    return pd.DataFrame(data, columns=df.columns)

In [5]:
def interpolation_sampling(df, target_size=10000, seed=None):
    """Generate rows by linear interpolation between random pairs of source rows.

    For each output row two source positions ``i`` and ``j`` and a weight
    ``alpha`` in [0, 1) are drawn, and every column is set to
    ``row_i + alpha * (row_j - row_i)``. The 'Timestamp' column is truncated
    toward zero after interpolation and stored as int64 when all values are
    finite.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows; every column must be convertible to float.
    target_size : int
        Number of rows to generate. Values <= 0 give an empty frame.
    seed : int or None
        When given, draws come from a private ``np.random.RandomState(seed)``;
        when None the global ``np.random`` state is used.

    Returns
    -------
    pandas.DataFrame
        ``target_size`` rows, same columns as ``df``, index 0..target_size-1.

    Raises
    ------
    ValueError
        If ``df`` has no rows and ``target_size`` is positive.
    """
    if target_size <= 0:
        return pd.DataFrame(columns=df.columns)
    if len(df) == 0:
        raise ValueError("interpolation_sampling requires a non-empty DataFrame")

    rng = np.random if seed is None else np.random.RandomState(seed)

    pairs = rng.randint(0, len(df), size=(target_size, 2))
    alpha = rng.random_sample(size=target_size)

    # Materialise both endpoint frames once; the per-column work below is then
    # plain array arithmetic instead of repeated df.iloc[...] row lookups.
    first = df.iloc[pairs[:, 0]]
    second = df.iloc[pairs[:, 1]]

    data = {}
    for col in df.columns:
        start = first[col].astype(float).to_numpy()
        stop = second[col].astype(float).to_numpy()
        blended = start + alpha * (stop - start)
        if col == 'Timestamp':
            # Same truncation as int(), kept as an integer column whenever
            # there is no NaN/inf that would make the cast meaningless.
            blended = np.trunc(blended)
            if np.isfinite(blended).all():
                blended = blended.astype(np.int64)
        data[col] = blended

    return pd.DataFrame(data, columns=df.columns)

In [6]:
def bootstrap_sampling(df, target_size=10000, window_size=5, seed=None):
    """Generate rows from noisy means of random contiguous windows of ``df``.

    Each output row is the column-wise mean (NaN values skipped) of
    ``window_size`` consecutive source rows starting at a random position.
    Every column except 'Timestamp' is then multiplied by ``1 + e`` with
    ``e ~ Normal(0, 0.01)``; 'Timestamp' keeps the plain window mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows; every column must be convertible to float.
    target_size : int
        Number of rows to generate. Values <= 0 give an empty frame.
    window_size : int
        Number of consecutive rows averaged per sample (>= 1).
    seed : int or None
        When given, draws come from a private ``np.random.RandomState(seed)``;
        when None the global ``np.random`` state is used.

    Returns
    -------
    pandas.DataFrame
        ``target_size`` rows, same columns as ``df``, index 0..target_size-1.

    Raises
    ------
    ValueError
        If ``window_size`` is < 1 or larger than ``len(df)``.
    """
    if target_size <= 0:
        return pd.DataFrame(columns=df.columns)
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    # Valid start positions are 0 .. len(df) - window_size inclusive.
    window_count = len(df) - window_size + 1
    if window_count < 1:
        raise ValueError(
            f"window_size ({window_size}) exceeds the number of rows ({len(df)})"
        )

    rng = np.random if seed is None else np.random.RandomState(seed)

    # randint's upper bound is exclusive, so window_count makes the last full
    # window reachable (and a frame of exactly window_size rows usable).
    starts = rng.randint(0, window_count, size=target_size)
    offsets = starts[:, None] + np.arange(window_size)

    values = df.astype(float).to_numpy()
    block = values[offsets]  # shape: (target_size, window_size, n_columns)

    # NaN-skipping mean, matching DataFrame.mean(); an all-NaN window yields NaN.
    valid = ~np.isnan(block)
    counts = valid.sum(axis=1)
    totals = np.where(valid, block, 0.0).sum(axis=1)
    with np.errstate(invalid='ignore', divide='ignore'):
        means = totals / counts

    noise = rng.normal(0, 0.01, size=means.shape)
    is_timestamp = np.array([col == 'Timestamp' for col in df.columns], dtype=bool)
    noise[:, is_timestamp] = 0.0  # Timestamp is never perturbed

    return pd.DataFrame(means * (1.0 + noise), columns=df.columns)

In [7]:
def evaluate_correlation(original_df, generated_df, method_name):
    """Score how closely ``generated_df`` reproduces the correlations of ``original_df``.

    The score is the mean over columns of the per-column mean absolute
    difference between the two correlation matrices (diagonal included).
    The score is printed, labelled with ``method_name``, and returned.
    """
    corr_gap = (original_df.corr() - generated_df.corr()).abs()
    per_column_gap = corr_gap.mean()
    correlation_diff = per_column_gap.mean()
    print(f"{method_name} - Mean Correlation Difference: {correlation_diff}")
    return correlation_diff

In [8]:
def main(input_path='/content/BABY.csv', output_prefix='BABY', seed=None):
    """Generate synthetic datasets with each sampler, save them and rank them.

    Reads ``input_path``, preprocesses it, runs the four samplers with their
    default settings, writes one CSV per sampler named
    ``<output_prefix>_<suffix>.csv`` and prints each sampler's mean
    correlation difference followed by the best (lowest) one.

    Parameters
    ----------
    input_path : str
        CSV file to read; defaults to the original Colab location.
    output_prefix : str
        Prefix of the four output CSV files.
    seed : int or None
        When given, the global NumPy random state is seeded first so a run
        can be repeated.
    """
    if seed is not None:
        np.random.seed(seed)

    # Read original data
    df = pd.read_csv(input_path)
    df = preprocess_data(df)

    # (display label, output file suffix, sampler) -- one entry per method so
    # generation, saving and scoring stay in sync.
    methods = [
        ('Random Sampling', 'randsam', random_sampler),
        ('Gaussian Noise', 'gaussian', gaussian_noise_sampling),
        ('Interpolation', 'interp', interpolation_sampling),
        ('Bootstrap', 'bootstrap', bootstrap_sampling),
    ]

    # Generate synthetic data (all methods first, so nothing is written if a
    # sampler fails)
    generated = {label: sampler(df) for label, _, sampler in methods}

    # Save generated datasets
    for label, suffix, _ in methods:
        generated[label].to_csv(f'{output_prefix}_{suffix}.csv', index=False)

    # Evaluate correlations
    scores = {
        label: evaluate_correlation(df, generated[label], label)
        for label, _, _ in methods
    }

    best_method = min(scores.items(), key=lambda x: x[1])[0]
    print(f"\nBest performing method: {best_method}")

if __name__ == "__main__":
    main()

Random Sampling - Mean Correlation Difference: 0.002065961389369605
Gaussian Noise - Mean Correlation Difference: 0.1501243921456168
Interpolation - Mean Correlation Difference: 0.005398021380687992
Bootstrap - Mean Correlation Difference: 0.024399172272119844

Best performing method: Random Sampling
