In [None]:
import pandas as pd
import numpy as np
import os

# --- Configuration -----------------------------------------------------------
# Directory scanned for source CSV files, and directory the synthetic CSV files
# are written to.
input_folder = "input"
output_folder = "synthetic_data"

# Columns that are carried over as-is instead of being re-sampled.
identifier_columns = [
    'MatchID',
    'Team',
    'Opponent',
    'StartFrameID',
    'EndFrameID',
    'Intervall',
]

# Columns sampled as continuous values; every other non-identifier column is
# sampled as a discrete integer.
float_columns = ['Danger']

# Columns for which a "<name>_diff" column (own team minus the other team) is
# derived after synthesis.
diff_columns = [
    'Shot', 'BP', 'BP3rd', 'BPBox', 'Goal', 'Cross',
    'PassBox', 'Pass3rd', 'Corner', 'TackWon', 'OutpOpp',
    'EntrBox', 'Entr3rd', 'Danger',
]

def synthesize_float_column(col_data):
    """Draw a synthetic float column resembling the distribution of ``col_data``.

    Samples are drawn from a Gaussian kernel density estimate fitted to the
    non-missing values and clipped to the observed ``[min, max]`` range. If
    SciPy is unavailable, or the KDE cannot be fitted, values are bootstrapped
    (sampled with replacement) from the observed data instead.

    Parameters
    ----------
    col_data : pandas.Series
        Source column; may contain NaN.

    Returns
    -------
    numpy.ndarray
        One synthetic value per input row (``len(col_data)`` values, counted
        *including* NaN rows), so the result can be assigned next to the other
        columns of the same frame. All zeros when the column holds no
        non-missing value.
    """
    # Take the target length BEFORE dropping NaNs: the caller needs one value
    # per original row, otherwise columns of one frame end up with different
    # lengths.
    n_samples = len(col_data)
    observed = col_data.dropna()
    if observed.empty:
        return np.zeros(n_samples)

    min_val = observed.min()
    max_val = observed.max()

    # A KDE needs at least two distinct values (the covariance is singular
    # otherwise); a constant column can only ever produce that constant.
    if min_val == max_val:
        return np.full(n_samples, min_val, dtype=float)

    try:
        # Imported lazily on purpose: SciPy is an optional dependency here.
        from scipy.stats import gaussian_kde
        kde = gaussian_kde(observed)
        samples = kde.resample(size=n_samples).flatten()
        # The KDE has infinite support; keep samples inside the observed range.
        samples = np.clip(samples, min_val, max_val)
    except (ImportError, ValueError, np.linalg.LinAlgError):
        # No SciPy, or the KDE fit failed on degenerate data: bootstrap.
        samples = np.random.choice(observed, size=n_samples, replace=True)
    return samples

def synthesize_int_column(col_data):
    """Draw a synthetic integer column from the empirical value frequencies.

    Non-missing values are cast to ``int`` and each synthetic value is drawn
    independently, with probability equal to the relative frequency of that
    value in the source column.

    Parameters
    ----------
    col_data : pandas.Series
        Source column; may contain NaN.

    Returns
    -------
    numpy.ndarray
        One synthetic integer per input row (``len(col_data)`` values, counted
        *including* NaN rows). All zeros when the column holds no non-missing
        value.
    """
    # Take the target length BEFORE dropping NaNs so the result always has one
    # value per original row and lines up with the other columns.
    n_samples = len(col_data)
    observed = col_data.dropna().astype(int)
    if observed.empty:
        return np.zeros(n_samples, dtype=int)

    # Relative frequency of every distinct value = its sampling probability.
    value_counts = observed.value_counts(normalize=True)
    values = value_counts.index.values
    probabilities = value_counts.values

    return np.random.choice(values, size=n_samples, p=probabilities)

In [None]:
# Running numbers used to build the anonymised "Match_synth_NN" and
# "Team_synth_NN" labels; both start at 1.
match_counter = team_id_counter = 1

# original team name -> synthetic team name, shared across all processed files
# so a team keeps the same synthetic label in every match.
team_name_map = dict()

def generate_TeamData(df_team):
    """Build a synthetic copy of one team's rows.

    The columns listed in ``identifier_columns`` are copied through unchanged.
    Every remaining column is re-sampled independently: columns named in
    ``float_columns`` via ``synthesize_float_column``, all others via
    ``synthesize_int_column``.

    Parameters
    ----------
    df_team : pandas.DataFrame
        Rows of a single team; must contain every column in
        ``identifier_columns``.

    Returns
    -------
    pandas.DataFrame
        Identifier columns followed by the synthetic columns, on a fresh
        0..n-1 index.
    """
    id_frame = df_team[identifier_columns].copy().reset_index(drop=True)
    stats_frame = df_team.drop(columns=identifier_columns).reset_index(drop=True)

    synthetic = pd.DataFrame()
    for name in stats_frame.columns:
        # Pick the sampler by column type, then fill the column in one step.
        if name in float_columns:
            sampler = synthesize_float_column
        else:
            sampler = synthesize_int_column
        synthetic[name] = sampler(stats_frame[name])

    synthetic = synthetic.reset_index(drop=True)
    return pd.concat([id_frame, synthetic], axis=1)


# Create the output directory up front: it does not exist on a fresh checkout,
# and to_csv does not create missing parent directories.
os.makedirs(output_folder, exist_ok=True)

# Process every CSV in the input folder in a stable (sorted) order so the
# synthetic match numbering is the same on every run.
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(input_folder, filename)
    df = pd.read_csv(file_path, delimiter=';', decimal=',')

    # Each file is processed as one match between two teams.
    team_ids = df['Team'].unique()
    if len(team_ids) < 2:
        raise ValueError(
            f"{filename}: expected two teams in column 'Team', found {len(team_ids)}"
        )
    if len(team_ids) > 2:
        # Keep going with the first two teams, but do not drop data silently.
        print(
            f"Warning: {filename} contains {len(team_ids)} teams; "
            "only the first two are synthesized."
        )

    # reset_index (not the no-op reindex()) gives both frames a 0..n-1 index.
    df_team1 = df[df['Team'] == team_ids[0]].copy().reset_index(drop=True)
    df_team2 = df[df['Team'] == team_ids[1]].copy().reset_index(drop=True)

    df_team1_synth = generate_TeamData(df_team1)
    df_team2_synth = generate_TeamData(df_team2)

    # Derive "<col>_diff" as own value minus the other team's value. Rows are
    # paired by position; if the two teams have different row counts, the
    # unpaired rows get NaN.
    for col in diff_columns:
        col_diff = f"{col}_diff"
        df_team1_synth[col_diff] = df_team1_synth[col] - df_team2_synth[col]
        df_team2_synth[col_diff] = df_team2_synth[col] - df_team1_synth[col]

    df_combined = pd.concat([df_team1_synth, df_team2_synth], ignore_index=True)

    # Replace the real match id with a sequential synthetic one.
    match_id_new = f"Match_synth_{match_counter:02d}"
    df_combined['MatchID'] = match_id_new
    match_counter += 1

    # Assign every team a synthetic name the first time it is seen; the map is
    # shared across files so a team is renamed consistently.
    for original_team in team_ids:
        if original_team not in team_name_map:
            team_name_map[original_team] = f"Team_synth_{team_id_counter:02d}"
            team_id_counter += 1

    df_combined['Team'] = df_combined['Team'].map(team_name_map)
    df_combined['Opponent'] = df_combined['Opponent'].map(team_name_map)

    # Write with the same separator / decimal convention the input uses.
    output_filename = f'{match_id_new}.csv'
    output_path = os.path.join(output_folder, output_filename)
    df_combined.to_csv(output_path, index=False, sep=';', decimal=',')