# Step 1: Data Loading 

In [1]:
def split_data(scaled_data, train_ratio=0.6):
    # Extract features and labels for stratified splitting
    features = scaled_data[:, :-2]
    labels = df.iloc[:, -2:]  # assuming D and phi are the last two columns
    
    # Create a single label for stratified splitting
    strat_labels = labels.astype(str).agg('-'.join, axis=1)
    
    # Split the data
    train_indices, test_indices = train_test_split(np.arange(len(features)), test_size=0.4, stratify=strat_labels, random_state=42)
    train = scaled_data[train_indices]
    test = scaled_data[test_indices]
    return train, test

In [5]:
scaled_data = preprocess_data2(df)

In [6]:
scaled_data.shape

(699, 1313)

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from utils import load_data, preprocess_data, preprocess_data2, reshape_data

fileloc = "/Users/joshhuang/MATLAB/ONR/Localization/"
#filename = "36_Locations_2.csv"
filename = "35_Locations_1.csv"

# Load and preprocess the data
df = load_data(fileloc, filename)
scaled_data = preprocess_data2(df)
train, test = split_data(scaled_data)
train_x, train_y, test_x, test_y = reshape_data(train, test)

# Define the mapping from distances and angles to labels
points = np.array([
    [-10, 0], [-8, 0], [-6, 0], [-4, 0], [-2, 0],
    [-10, -2], [-8, -2], [-6, -2], [-4, -2], [-2, -2], [0, -2],
    [-10, -4], [-8, -4], [-6, -4], [-4, -4], [-2, -4], [0, -4],
    [-10, -6], [-8, -6], [-6, -6], [-4, -6], [-2, -6], [0, -6],
    [-10, -8], [-8, -8], [-6, -8], [-4, -8], [-2, -8], [0, -8],
    [-10, -10], [-8, -10], [-6, -10], [-4, -10], [-2, -10], [0, -10]
])
distances = np.sqrt(np.sum(points**2, axis=1))
phis = np.arctan2(points[:, 1], points[:, 0])
fixed_mapping = {tuple([d, phi]): i + 1 for i, (d, phi) in enumerate(zip(distances, phis))}

# Convert train_y to labels using the mapping
def find_label_with_tolerance(pair, mapping, tolerance=1e-2):
    distance, phi = pair
    for key in mapping.keys():
        if abs(key[0] - distance) < tolerance and abs(key[1] - phi) < tolerance:
            return mapping[key]
    raise KeyError(f"No match found for {pair}")

labels = np.array([find_label_with_tolerance(pair, fixed_mapping) for pair in train_y])

# Flatten train_x and concatenate with labels
train_x_flattened = train_x.reshape(train_x.shape[0], -1)  # Flatten train_x if it's not already in 2D
data_to_save = np.hstack((labels.reshape(-1, 1), train_x_flattened))

# Save the original data to a CSV file without augmentation
original_data_filename = "/Users/joshhuang/PythonFolder/ONR/Hybrid/TS-GAN/Training.csv"
pd.DataFrame(data_to_save).to_csv(original_data_filename, index=False, header=False)
print(f"Saved original data to {original_data_filename}")

# Define label ranges and target counts for augmentation
label_ranges = [(1, 35)]
target_count = 50  # Target number of records per label range

def augment_data_with_noise(data, target_count, noise_level=0.01):
    replicates = target_count // len(data)
    remainder = target_count % len(data)
    augmented_data = pd.concat([data] * replicates + [data.iloc[:remainder]])

    # Identifying feature columns (assuming label is in the first column)
    feature_columns = augmented_data.columns[40:]  # Adjust if label is not the first column

    # Adding random noise only to feature columns
    noise = np.random.normal(loc=0.0, scale=noise_level, size=(augmented_data.shape[0], len(feature_columns)))
    augmented_data[feature_columns] += noise

    return augmented_data


# Function to replicate data to reach a target count
def augment_data(data, target_count):
    replicates = target_count // len(data)
    remainder = target_count % len(data)
    return pd.concat([data] * replicates + [data.iloc[:remainder]])


def interpolate_samples(data, target_count):
    additional_samples_needed = target_count - len(data)
    new_samples = []
    for _ in range(additional_samples_needed):
        idx1, idx2 = np.random.choice(len(data), 2, replace=False)
        interpolation = data.iloc[idx1] + (data.iloc[idx2] - data.iloc[idx1]) * np.random.random(size=data.shape[1])
        new_samples.append(interpolation)
    augmented_data = pd.concat([data] + new_samples, ignore_index=True)
    return augmented_data


# Process each label range
for start, end in label_ranges:
    # Find all unique labels within the current range
    unique_labels_in_range = np.unique(data_to_save[:, 0][(data_to_save[:, 0] >= start) & (data_to_save[:, 0] <= end)])

    # Initialize an empty DataFrame to hold all augmented data for this range
    augmented_data_range = pd.DataFrame()

    # Iterate through each unique label in the current range
    for label in unique_labels_in_range:
        # Filter data for the specific label
        mask = (data_to_save[:, 0] == label)
        filtered_data = data_to_save[mask]

        # Augment data by replication if necessary
        if len(filtered_data) < target_count:
            filtered_data = augment_data_with_noise(pd.DataFrame(filtered_data), target_count)
        else:
            filtered_data = pd.DataFrame(filtered_data)

        # Append the augmented data for this label to the range DataFrame
        augmented_data_range = pd.concat([augmented_data_range, filtered_data], ignore_index=True)

    # Save the aggregated augmented data for this label range to a CSV file
    output_filename = "/Users/joshhuang/PythonFolder/ONR/Hybrid/TS-GAN/Training1.csv"
    augmented_data_range.to_csv(output_filename, index=False, header=False)
    print(f"Saved augmented data for labels {start} to {end} to {output_filename}")


InvalidIndexError: (slice(None, None, None), slice(None, -2, None))

## Back code

In [4]:
import numpy as np
import pandas as pd

from utils import load_data, preprocess_data, preprocess_data1, reshape_data

fileloc = "/Users/joshhuang/MATLAB/ONR/Localization/"
filename = "36_Locations_1.csv"

# Load and preprocess the data
df = load_data(fileloc, filename)
scaled_data = preprocess_data1(df)
train, test = split_data(scaled_data)
train_x, train_y, test_x, test_y = reshape_data(train, test)

# Define the mapping from distances and angles to labels
points = np.array([
    [-10, 0], [-8, 0], [-6, 0], [-4, 0], [-2, 0],
    [-10, -2], [-8, -2], [-6, -2], [-4, -2], [-2, -2], [0, -2],
    [-10, -4], [-8, -4], [-6, -4], [-4, -4], [-2, -4], [0, -4],
    [-10, -6], [-8, -6], [-6, -6], [-4, -6], [-2, -6], [0, -6],
    [-10, -8], [-8, -8], [-6, -8], [-4, -8], [-2, -8], [0, -8],
    [-10, -10], [-8, -10], [-6, -10], [-4, -10], [-2, -10], [0, -10]
])
distances = np.sqrt(np.sum(points**2, axis=1))
phis = np.arctan2(points[:, 1], points[:, 0])
fixed_mapping = {tuple([d, phi]): i + 1 for i, (d, phi) in enumerate(zip(distances, phis))}

# Convert train_y to labels using the mapping
def find_label_with_tolerance(pair, mapping, tolerance=1e-2):
    distance, phi = pair
    for key in mapping.keys():
        if abs(key[0] - distance) < tolerance and abs(key[1] - phi) < tolerance:
            return mapping[key]
    raise KeyError(f"No match found for {pair}")

labels = np.array([find_label_with_tolerance(pair, fixed_mapping) for pair in train_y])

# Flatten train_x and concatenate with labels
train_x_flattened = train_x.reshape(train_x.shape[0], -1)  # Flatten train_x if it's not already in 2D
data_to_save = np.hstack((labels.reshape(-1, 1), train_x_flattened))

# Define label ranges and target counts for augmentation
label_ranges = [(1, 35)]
target_count = 100  # Target number of records per label range

# Function to replicate data to reach a target count
def augment_data(data, target_count):
    replicates = target_count // len(data)
    remainder = target_count % len(data)
    return pd.concat([data] * replicates + [data.iloc[:remainder]])

# Process each label range
for start, end in label_ranges:
    # Find all unique labels within the current range
    unique_labels_in_range = np.unique(data_to_save[:, 0][(data_to_save[:, 0] >= start) & (data_to_save[:, 0] <= end)])

    # Initialize an empty DataFrame to hold all augmented data for this range
    augmented_data_range = pd.DataFrame()

    # Iterate through each unique label in the current range
    for label in unique_labels_in_range:
        # Filter data for the specific label
        mask = (data_to_save[:, 0] == label)
        filtered_data = data_to_save[mask]

        # Augment data by replication if necessary
        if len(filtered_data) < target_count:
            filtered_data = augment_data(pd.DataFrame(filtered_data), target_count)
        else:
            filtered_data = pd.DataFrame(filtered_data)

        # Append the augmented data for this label to the range DataFrame
        augmented_data_range = pd.concat([augmented_data_range, filtered_data], ignore_index=True)

    # Save the aggregated augmented data for this label range to a CSV file
    output_filename = f"Training_labels_{start}_to_{end}.csv"
    augmented_data_range.to_csv(output_filename, index=False, header=False)
    print(f"Saved augmented data for labels {start} to {end} to {output_filename}")


Saved augmented data for labels 1 to 35 to Training_labels_1_to_35.csv
