In [1]:
import pandas as pd
import numpy as np
# Load the source student records; later cells sample rows from this frame
# and append the generated rows to it. The path is relative to the kernel's
# working directory.
df=pd.read_csv('data/stud1.csv')

# Function to augment numeric scores by +/- 3 and clip between 0 and 100
def augment_scores(df, num_augmentations=3000,
                   score_cols=('math_score', 'reading_score', 'writing_score'),
                   max_noise=3, random_state=None):
    """Generate synthetic rows by jittering the scores of randomly sampled rows.

    Rows are drawn from ``df`` with replacement. Every column listed in
    ``score_cols`` then receives independent integer noise drawn uniformly
    from ``[-max_noise, max_noise]`` and is clipped to ``[0, 100]``. All other
    columns are copied through unchanged. ``df`` itself is never modified, and
    the original rows are NOT part of the result -- concatenate them yourself
    if a combined dataset is needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows; must contain every column named in ``score_cols``.
    num_augmentations : int, default 3000
        Number of rows to generate. ``0`` returns an empty frame that has the
        same columns as ``df``.
    score_cols : sequence of str
        Numeric columns that receive noise.
    max_noise : int, default 3
        Largest absolute perturbation applied to a score (inclusive).
    random_state : int or None, default None
        Seed for a reproducible run. With ``None`` the global ``np.random``
        state is used, so a prior ``np.random.seed(...)`` is still honoured.

    Returns
    -------
    pandas.DataFrame
        ``num_augmentations`` rows, indexed ``0..num_augmentations-1``.

    Raises
    ------
    ValueError
        If ``num_augmentations`` is negative, or if rows are requested from an
        empty ``df``.
    """
    if num_augmentations < 0:
        raise ValueError("num_augmentations must be non-negative")
    if num_augmentations == 0:
        # pd.concat([]) would raise; an empty frame is the natural answer.
        return df.iloc[0:0].reset_index(drop=True)
    if len(df) == 0:
        raise ValueError("cannot sample rows from an empty DataFrame")

    # The np.random module and RandomState both expose randint(), so the same
    # code path serves the seeded and the global-state case.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Pick all source rows in one shot instead of building one-row frames in a loop.
    positions = rng.randint(0, len(df), size=num_augmentations)
    augmented_df = df.iloc[positions].reset_index(drop=True)

    for col in score_cols:
        # randint's upper bound is exclusive, hence max_noise + 1.
        noise = rng.randint(-max_noise, max_noise + 1, size=num_augmentations)
        augmented_df[col] = np.clip(augmented_df[col].to_numpy() + noise, 0, 100)

    # Only the generated rows are returned; the original data is not included.
    return augmented_df

# Example usage: generate 3000 jittered rows from the original dataframe `df`
augmented_df = augment_scores(df, num_augmentations=3000)

# augmented_df holds ONLY the 3000 generated rows; the original rows are
# appended separately when final_df is built.


In [4]:
# Append the generated rows to the original dataset; ignore_index=True
# renumbers the combined frame 0..N-1 so index labels do not repeat.
final_df = pd.concat([df, augmented_df], ignore_index=True)


In [5]:
# Persist the combined dataset to the working directory.
# NOTE(review): to_csv writes the row index as an extra unnamed first column
# by default -- pass index=False if the file should keep only the data columns.
final_df.to_csv('stud_new.csv')

In [7]:
# Preview the first five rows of the combined dataset.
final_df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [9]:
import numpy as np

# Function to augment continuous scores
def augment_scores(df, noise_range=(-1, 1)):
    """Return a copy of ``df`` with integer noise added to the score columns.

    Each value in ``math_score``, ``reading_score`` and ``writing_score``
    receives an independent integer drawn uniformly from ``noise_range``
    (both ends inclusive) and is then clipped to ``[0, 100]``. All other
    columns and the index are left as they are; ``df`` is not modified.

    NOTE: this shares its name with the row-sampling
    ``augment_scores(df, num_augmentations)`` defined in an earlier cell and
    replaces it once this cell has run.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three score columns.
    noise_range : (int, int), default (-1, 1)
        Inclusive ``(low, high)`` bounds of the noise.

    Returns
    -------
    pandas.DataFrame
        A noisy copy with the same shape and index as ``df``.

    Raises
    ------
    ValueError
        Raised by numpy when ``low > high``.
    """
    augmented_df = df.copy()

    # Add random noise to the numeric score columns
    for col in ['math_score', 'reading_score', 'writing_score']:
        # np.random.randint excludes its upper bound; without the + 1 the
        # default range could only produce -1 or 0, biasing scores downward.
        noise = np.random.randint(noise_range[0], noise_range[1] + 1, size=len(df))
        augmented_df[col] = np.clip(augmented_df[col] + noise, 0, 100)

    return augmented_df


In [10]:
import pandas as pd
import numpy as np

# Function to augment categorical features
def augment_categoricals(df):
    """Return a copy of ``df`` whose categorical columns are resampled.

    For each of ``lunch``, ``test_preparation_course``, ``gender``,
    ``race_ethnicity`` and ``parental_level_of_education`` the column is
    replaced by ``len(df)`` values drawn with ``np.random.choice`` from that
    same column of the input frame. The draws are made per column, so values
    are not kept aligned with the row they came from. Other columns are
    copied unchanged and ``df`` itself is not modified.
    """
    categorical_cols = (
        'lunch',
        'test_preparation_course',
        'gender',
        'race_ethnicity',
        'parental_level_of_education',
    )
    n_rows = len(df)

    # Always draw from the input frame, one column at a time, in this order.
    resampled = {
        name: np.random.choice(df[name], size=n_rows)
        for name in categorical_cols
    }

    return df.copy().assign(**resampled)


In [11]:
def generate_augmented_data(df, num_augmentations=3000):
    """Create ``num_augmentations`` synthetic rows from ``df``.

    A batch of rows is sampled from ``df`` with replacement, then the
    notebook's ``augment_scores`` and ``augment_categoricals`` helpers are
    applied to the whole batch at once.

    Working on the batch matters for the categorical step: that helper draws
    replacement values from the frame it is given, so calling it on a single
    row (as the earlier per-row loop did) can only ever return the row's own
    values and the categoricals are never changed.

    Parameters
    ----------
    df : pandas.DataFrame
        Original data; it is not modified.
    num_augmentations : int, default 3000
        Number of rows to generate. ``0`` returns an empty frame with the
        same columns as ``df``.

    Returns
    -------
    pandas.DataFrame
        Only the generated rows (the original rows are not included),
        indexed ``0..num_augmentations-1``.

    Raises
    ------
    ValueError
        If ``num_augmentations`` is negative, or if rows are requested from an
        empty ``df`` (raised by ``DataFrame.sample``).
    """
    if num_augmentations < 0:
        raise ValueError("num_augmentations must be non-negative")
    if num_augmentations == 0:
        # pd.concat([]) used to raise here; an empty frame is the natural answer.
        return df.iloc[0:0].reset_index(drop=True)

    # One vectorised draw replaces thousands of one-row sample/concat steps.
    sampled = df.sample(n=num_augmentations, replace=True).reset_index(drop=True)

    # Augment the score columns with noise
    sampled = augment_scores(sampled)

    # Augment categorical features by random sampling within the batch
    sampled = augment_categoricals(sampled)

    return sampled

# Example usage: generate 3000 augmented rows from the original data.
# NOTE: this rebinds augmented_df (and final_df below), replacing the values
# computed in the earlier cells.
augmented_df = generate_augmented_data(df, num_augmentations=3000)

# Combine with the original data, renumbering the index 0..N-1
final_df = pd.concat([df, augmented_df], ignore_index=True)


In [12]:
# Sanity check: row count should be len(df) + 3000, columns unchanged.
final_df.shape

(4000, 8)

In [13]:
# Overwrites the stud_new.csv written by an earlier cell with the re-generated dataset.
# NOTE(review): to_csv writes the row index as an extra unnamed first column
# by default -- pass index=False if the file should keep only the data columns.
final_df.to_csv('stud_new.csv')