In [5]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

# Read the Iris data and build a DataFrame: one column per entry in
# iris.feature_names, plus a 'species' column holding iris.target.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    species=iris.target
)

# Generate synthetic data by adding Gaussian noise
def generate_synthetic_data(df, n_records, noise_factor=0.05, random_state=None):
    """Create synthetic rows by resampling ``df`` and jittering its features.

    Rows are drawn uniformly at random, with replacement, from ``df``.
    Zero-mean Gaussian noise is added to every column except ``'species'``;
    the label of each synthetic row is copied from the row it was drawn from,
    so it stays valid no matter how large ``noise_factor`` is.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. All columns other than ``'species'`` must be numeric.
    n_records : int
        Number of synthetic rows to produce. ``0`` yields an empty frame.
    noise_factor : float, default 0.05
        Standard deviation of the Gaussian noise added to each feature value.
    random_state : int, numpy.random.RandomState or None, default None
        Seed (or ready-made ``RandomState``) for reproducible output. With
        ``None`` the global NumPy random state is used, so a prior
        ``np.random.seed(...)`` call still controls the result.

    Returns
    -------
    pandas.DataFrame
        ``n_records`` rows with the same columns, in the same order, as
        ``df``. Feature columns are floats; ``'species'`` (when present) is
        rounded and cast to ``int``.

    Raises
    ------
    ValueError
        If ``n_records`` is negative, or if ``df`` is empty while
        ``n_records`` is positive.
    """
    if n_records < 0:
        raise ValueError("n_records must be non-negative")

    if random_state is None:
        rng = np.random  # module-level functions share the global state
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    label_col = 'species'
    feature_cols = [col for col in df.columns if col != label_col]

    # One vectorized draw replaces n_records separate df.sample() calls.
    positions = rng.randint(0, len(df), size=n_records)
    sampled = df.iloc[positions].reset_index(drop=True)

    # Noise is generated for the feature columns only: perturbing the class
    # label and rounding it back corrupts labels once noise_factor grows.
    noise = rng.normal(
        loc=0, scale=noise_factor, size=(n_records, len(feature_cols))
    )
    noisy_features = sampled[feature_cols].to_numpy(dtype=float) + noise

    synthetic_df = pd.DataFrame(noisy_features, columns=feature_cols)
    if label_col in df.columns:
        # Ensure species are integers
        synthetic_df[label_col] = (
            sampled[label_col].round().astype(int).to_numpy()
        )

    # Restore the caller's column order.
    return synthetic_df[list(df.columns)]

# Create 150 synthetic rows from the original frame
new_records = generate_synthetic_data(df, n_records=150)

# Stack the original rows on top of the synthetic ones, renumbering the index
augmented_df = pd.concat((df, new_records), ignore_index=True)

# Report the (rows, columns) shape of the combined dataset
print(f"New dataset shape: {augmented_df.shape}")


New dataset shape: (300, 5)
