In [3]:
import pandas as pd
import numpy as np

# =============================================================================
# DATA PREPARATION SCRIPT: Table Tennis Neuromuscular & Technical Performance
# Version: 2.0 (Standardized for Open Science)
# Goal: Standardize headers, handle missing values, and ensure anonymity.
# =============================================================================

# 1. Load the raw dataset
# The raw export is semicolon-delimited, so the separator is passed explicitly
# instead of relying on the comma default of read_csv.
input_file = 'Data_TableTennis.csv'
df = pd.read_csv(input_file, sep=';')

# 2. Handle missing values and numeric formatting
# Previously, missing values were marked as "-". 
# We convert them to np.nan (standard for statistical analysis in Python).
def clean_and_parse_numeric(value):
    """Convert a single raw cell to a float where possible.

    String cells are interpreted as follows:

    * a lone ``-`` (ignoring surrounding whitespace) is the missing-value
      placeholder and becomes ``np.nan``;
    * otherwise a decimal comma is treated as a decimal point and the text
      is parsed as a float (``"1,75"`` -> ``1.75``).

    Non-string cells are passed straight to ``float()``.

    Args:
        value: The raw cell content (string, number, ``None``, ...).

    Returns:
        The parsed ``float``; ``np.nan`` for the ``-`` placeholder; or the
        original ``value``, unmodified, when it cannot be parsed as a number.
    """
    candidate = value
    if isinstance(value, str):
        candidate = value.strip()
        # "-" marks a missing measurement in the raw file.
        if candidate == '-':
            return np.nan
        # Decimal comma -> decimal point, applied only to the parse candidate
        # so that free-text cells are never altered.
        candidate = candidate.replace(',', '.')
    try:
        return float(candidate)
    except (ValueError, TypeError):
        # Not numeric: return the ORIGINAL cell. Returning the comma-replaced
        # text would corrupt labels such as "Amputee, lower limb".
        return value

# Run the cell-level cleaner over every column, one column at a time, so that
# placeholders and decimal commas are handled the same way across the table.
for column_name in df.columns:
    df[column_name] = df[column_name].map(clean_and_parse_numeric)

# 3. Standardize Column Names (Global Cleaning)
# Each header has its spaces replaced by underscores, is lowercased, and has
# any leading/trailing underscores trimmed (e.g. " Sprint 5m Best" ->
# "sprint_5m_best"). Characters other than spaces are left as they are.
df.columns = (
    df.columns
    .str.replace(' ', '_', regex=False)
    .str.lower()
    .str.strip('_')
)

# 4. Specific Mapping for International Publication
# Maps standardized (lowercase, underscore) headers to the names used in the
# manuscript. The mapping is assembled from thematic groups; key order is the
# same as a single flat literal would give.
# NOTE(review): the unit suffixes in the target names (kg, m, cm, sec, kmh)
# are not checked against the data here -- confirm against the raw file.

# Participant descriptors
_descriptor_names = {
    'group': 'group',
    'disability': 'impairment_type',
    'class': 'functional_class',
}

# Anthropometric measures
_anthropometric_names = {
    'mass': 'body_mass_kg',
    'height': 'height_m',
    'wingspan': 'wingspan_m',
}

# Jump tests (best-trial columns only)
_jump_names = {
    'sj_best': 'squat_jump_max_cm',
    'cmj_best': 'cmj_max_cm',
    'drop_15_rsi_best': 'rsi_15cm_max',
    'drop_30_rsi_best': 'rsi_30cm_max',
    'drop_45_rsi_best': 'rsi_45cm_max',
}

# Sprint, ball velocity and target accuracy
_sprint_and_stroke_names = {
    'sprint_5m_best': 'sprint_5m_sec',
    'radar_vel_best': 'ball_velocity_max_kmh',
    'target_efficiency': 'technical_efficiency_index',
}

english_mapping = {
    **_descriptor_names,
    **_anthropometric_names,
    **_jump_names,
    **_sprint_and_stroke_names,
}

# 5. Data Transformation
# Binarize group: 0 = Conventional, 1 = Para-athlete.
# Text labels are matched case-insensitively and ignoring surrounding
# whitespace; numeric codes 0/1 are accepted as they are. Both the
# "Convencional" spelling and the English "Conventional" map to 0.
group_codes = {
    0.0: 0,
    1.0: 1,
    'convencional': 0,
    'conventional': 0,
    'para-athlete': 1,
}
raw_group = df['group']
normalized_group = raw_group.map(
    lambda label: label.strip().casefold() if isinstance(label, str) else label
)
encoded_group = normalized_group.map(group_codes)

# Series.map yields NaN for any value absent from the dict. A value that was
# present in the input but is NaN after mapping is therefore an unrecognized
# label; fail loudly rather than silently losing group membership.
unrecognized = raw_group[raw_group.notna() & encoded_group.isna()].unique()
if len(unrecognized) > 0:
    raise ValueError(
        f"Unrecognized values in 'group' column: {list(unrecognized)}"
    )
df['group'] = encoded_group

# 6. Privacy: remove the 'birth' column
# Only 'birth' is dropped here; errors='ignore' makes this a no-op when the
# column is not present in the dataset.
# NOTE(review): no other columns (identifying or empty) are removed by this
# step -- confirm that is sufficient for anonymization.
df = df.drop(columns=['birth'], errors='ignore')

# 7. Finalize and Export
# Headers listed in english_mapping get their manuscript name; every other
# column keeps its standardized name. The result is written comma-separated,
# without the index column.
output_file = 'Data_TableTennis_Ready.csv'
df_ready = df.rename(columns=english_mapping)
df_ready.to_csv(output_file, index=False, sep=',')

# Summary for the notebook output; the last line shows the first 15 cleaned
# headers as an example.
summary_lines = [
    "--- SUCCESS ---",
    "Missing values ('-') have been converted to NaN.",
    "All column spaces removed and standardized to lowercase.",
    f"Dataset saved as: {output_file}",
    f"Sample headers: {list(df_ready.columns[:15])}",
]
print("\n".join(summary_lines))

--- SUCCESS ---
Missing values ('-') have been converted to NaN.
All column spaces removed and standardized to lowercase.
Dataset saved as: Data_TableTennis_Ready.csv
Sample headers: ['id', 'group', 'impairment_type', 'functional_class', 'body_mass_kg', 'height_m', 'wingspan_m', 'sj_1', 'sj_2', 'squat_jump_max_cm', 'cmj_1', 'cmj_2', 'cmj_max_cm', 'cmj_uni_d_1', 'cmj_uni_d_2']
