In [1]:
import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal
import random

In [2]:
# Read the original character table from disk and preview its first five rows.
file_path = r"C:\Users\Pawel\OneDrive - University of Gdansk (for Students)\Teaching\Statystyka\Statystyka\Dane\Wprowadzenie\Frieren.csv"
data = pd.read_csv(filepath_or_buffer=file_path)
data.head(5)

Unnamed: 0,Character,Class,Race,Mana,PowerLevel,TrainingHours,Age
0,Demon King,Mage,Demon,168.12,131.38,9,880
1,Heiter,Ranger,Human,51.18,39.59,4,73
2,Frieren,Mage,Elf,151.27,120.36,10,821
3,Schnee,Ranger,Dwarf,64.25,72.88,16,287
4,Eisen,Warrior,Dwarf,56.87,88.14,16,54


In [3]:
def generate_data(sample_size, data, group_var, random_state=None):
    """
    Generate synthetic data that mimics the per-group structure of `data`.

    For every level of `group_var`, the means, standard deviations and
    correlations of Mana, PowerLevel, TrainingHours and Age are estimated from
    `data`, and new rows are drawn from the matching multivariate normal
    distribution. Groups are represented in proportion to their frequency in
    `data`; leftover rows caused by rounding are handed to the groups with the
    largest fractional share, so the result has exactly `sample_size` rows.

    After sampling, Mana and PowerLevel are made non-negative, TrainingHours is
    clipped to [0, 20] and rounded, and Age is clipped to a minimum of 10 and
    rounded.

    Parameters:
    - sample_size: Total number of samples to generate for the dataset
      (non-negative; converted with int()).
    - data: The original DataFrame from which to infer parameters. It must
      contain the four numerical columns listed above and `group_var`.
    - group_var: The variable name for grouping (e.g., 'Class' or 'Race').
    - random_state: Optional seed, numpy Generator or numpy RandomState that
      makes the draws reproducible. With the default (None) the draws come
      from NumPy's global random state, exactly as before.

    Returns:
    - A DataFrame with the generated data: the four numerical columns followed
      by `group_var`, with a fresh 0..n-1 index. Groups appear in the order of
      `data[group_var].value_counts()`.

    Raises:
    - ValueError: if `sample_size` is negative or `group_var` holds no
      non-null values.
    """
    columns = ['Mana', 'PowerLevel', 'TrainingHours', 'Age']

    sample_size = int(sample_size)
    if sample_size < 0:
        raise ValueError("sample_size must be non-negative")

    group_counts = data[group_var].value_counts()
    total_count = int(group_counts.sum())
    if total_count == 0:
        raise ValueError(f"no non-null values found in '{group_var}' to infer groups from")

    # Resolve the random source once. Handing the same integer seed to every
    # rvs() call would restart the stream for each group and make the groups'
    # draws copies of one another.
    if random_state is None or isinstance(random_state, (np.random.Generator, np.random.RandomState)):
        rng = random_state
    else:
        rng = np.random.default_rng(random_state)

    # Proportional allocation in exact integer arithmetic (largest-remainder
    # method): floor every share, then give the rows lost to flooring to the
    # groups with the biggest remainders.
    scaled = group_counts.to_numpy(dtype=np.int64) * sample_size
    group_sizes = scaled // total_count
    shortfall = sample_size - int(group_sizes.sum())
    if shortfall > 0:
        # A stable sort keeps the value_counts order among equal remainders.
        neediest = np.argsort(-(scaled % total_count), kind='stable')[:shortfall]
        group_sizes[neediest] += 1

    generated_samples = []
    for group, group_sample_size in zip(group_counts.index, group_sizes):
        if group_sample_size == 0:
            continue  # nothing to draw for this group

        # Extract the numerical data for this group
        group_values = data.loc[data[group_var] == group, columns]

        # Means, standard deviations and correlation matrix of the group
        means = group_values.mean().to_numpy()
        stds = group_values.std().to_numpy()
        correlation_matrix = group_values.corr().to_numpy()

        # Covariance matrix from the correlation matrix and standard deviations.
        # A single-row group or a constant column yields NaN entries; treat
        # those as zero (co)variance so that dimension stays at its mean
        # instead of making the sampler fail.
        cov_matrix = correlation_matrix * np.outer(stds, stds)
        cov_matrix = np.where(np.isnan(cov_matrix), 0.0, cov_matrix)

        # Generate samples using the multivariate normal distribution
        samples = multivariate_normal.rvs(mean=means, cov=cov_matrix,
                                          size=int(group_sample_size), random_state=rng)
        # rvs() squeezes a single draw down to a 1-D vector; restore the
        # (rows, variables) layout so the DataFrame constructor accepts it.
        samples = np.reshape(samples, (-1, len(columns)))

        group_df = pd.DataFrame(samples, columns=columns)

        # Correct for any negative values, round as specified, and enforce ranges
        group_df['Mana'] = group_df['Mana'].abs()
        group_df['PowerLevel'] = group_df['PowerLevel'].abs()
        group_df['TrainingHours'] = group_df['TrainingHours'].clip(lower=0, upper=20).round()
        group_df['Age'] = group_df['Age'].clip(lower=10).round()  # Enforce minimum age of 10

        # Assign group label
        group_df[group_var] = group

        generated_samples.append(group_df)

    if not generated_samples:
        # sample_size == 0: return an empty frame with the usual columns
        return pd.DataFrame(columns=columns + [group_var])

    # Combine all generated samples into a single DataFrame
    full_generated_data = pd.concat(generated_samples).reset_index(drop=True)

    return full_generated_data

In [4]:
# Usage
# Fix the seeds so that "Restart Kernel -> Run All" regenerates the same dataset.
# Both generators need seeding: the sample size is drawn with the stdlib
# `random` module, while generate_data samples through scipy, which falls back
# to NumPy's global random state when no random_state is passed.
SEED = 42  # change this value to obtain a different (but still reproducible) dataset
random.seed(SEED)
np.random.seed(SEED)

sample_size = random.randint(1_000_000, 3_000_000)  # Dataset size drawn from 1-3 million (inclusive)
generated_data = generate_data(sample_size, data, 'Race')  # Group proportions follow the Race counts in `data`
generated_data.head()

Unnamed: 0,Mana,PowerLevel,TrainingHours,Age,Race
0,46.380432,25.363807,5.0,58.0,Human
1,45.053015,62.675728,14.0,48.0,Human
2,86.272901,90.460964,12.0,33.0,Human
3,54.006144,89.222216,19.0,54.0,Human
4,76.35457,67.035221,10.0,65.0,Human


In [5]:
# Write the generated dataset to disk as CSV, leaving out the row index.
file_path = r"C:\Users\Pawel\OneDrive - University of Gdansk (for Students)\Teaching\Statystyka\Statystyka\Dane\Wprowadzenie\Frieren_census.csv"
generated_data.to_csv(path_or_buf=file_path, index=False)