# Synthetic Data Generation for TabPFN Fine-Tuning
This notebook demonstrates how to generate synthetic tabular data for use in the finetuning pipeline, following the PRD requirements.

## 1. Define Data Schema
Specify the features, target, and types.

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification, make_regression

In [2]:
# Example schema: 10 features (8 numeric, 2 categorical), 1 target (binary)
n_samples = 1000
n_numeric = 8
n_categorical = 2
n_features = n_numeric + n_categorical

# Generate numeric features
X_numeric = np.random.randn(n_samples, n_numeric)

# Generate categorical features
X_categorical = np.random.choice(['A', 'B', 'C'], size=(n_samples, n_categorical))

# Combine features
X = np.concatenate([X_numeric, X_categorical.astype(str)], axis=1)

# Create DataFrame
columns = [f'num_{i}' for i in range(n_numeric)] + [f'cat_{i}' for i in range(n_categorical)]
df = pd.DataFrame(X, columns=columns)

# Generate binary target
y = np.random.randint(0, 2, size=n_samples)
df['target'] = y
df.head()

Unnamed: 0,num_0,num_1,num_2,num_3,num_4,num_5,num_6,num_7,cat_0,cat_1,target
0,0.4434751296528315,1.0015557267372348,-0.8748565456380197,0.0789315523964376,-0.9501835220551428,2.509368021099972,-0.6634607095165929,1.1461304620994412,A,C,0
1,-0.3148968597402959,-0.6927515051124514,-0.4648577730233012,0.3950561731323628,-0.1963836766349201,-0.3337605434469746,0.4299340782120139,-0.6730024883737376,A,A,0
2,0.6575786809615809,1.0495877705619734,-0.0292213340895228,0.5698000198255928,-0.0488040121222156,-1.2886524407493514,-0.5461042740412737,1.4122460130072478,A,B,1
3,-1.9114963112643124,-0.6031884569066248,0.5210008631400757,0.3575781009347846,-1.9874526629903744,-0.5835861166635888,-1.144097320353737,0.1261305173899266,A,C,1
4,-0.0625431280778106,0.2536536674067324,1.613552836593198,0.4634262014924583,-2.660828815997048,-1.4072385214350829,-0.1572734035519005,-0.7382648812265631,C,C,1


## 2. Generate Data for Other Tasks
You can adapt the code to generate multiclass or regression targets as needed.

In [3]:
# Multiclass target example
y_multi = np.random.randint(0, 4, size=n_samples)
df_multi = df.copy()
df_multi['target'] = y_multi
df_multi.head()

Unnamed: 0,num_0,num_1,num_2,num_3,num_4,num_5,num_6,num_7,cat_0,cat_1,target
0,0.4434751296528315,1.0015557267372348,-0.8748565456380197,0.0789315523964376,-0.9501835220551428,2.509368021099972,-0.6634607095165929,1.1461304620994412,A,C,3
1,-0.3148968597402959,-0.6927515051124514,-0.4648577730233012,0.3950561731323628,-0.1963836766349201,-0.3337605434469746,0.4299340782120139,-0.6730024883737376,A,A,1
2,0.6575786809615809,1.0495877705619734,-0.0292213340895228,0.5698000198255928,-0.0488040121222156,-1.2886524407493514,-0.5461042740412737,1.4122460130072478,A,B,3
3,-1.9114963112643124,-0.6031884569066248,0.5210008631400757,0.3575781009347846,-1.9874526629903744,-0.5835861166635888,-1.144097320353737,0.1261305173899266,A,C,0
4,-0.0625431280778106,0.2536536674067324,1.613552836593198,0.4634262014924583,-2.660828815997048,-1.4072385214350829,-0.1572734035519005,-0.7382648812265631,C,C,0


In [4]:
# Regression target example
y_reg = np.random.randn(n_samples)
df_reg = df.copy()
df_reg['target'] = y_reg
df_reg.head()

Unnamed: 0,num_0,num_1,num_2,num_3,num_4,num_5,num_6,num_7,cat_0,cat_1,target
0,0.4434751296528315,1.0015557267372348,-0.8748565456380197,0.0789315523964376,-0.9501835220551428,2.509368021099972,-0.6634607095165929,1.1461304620994412,A,C,0.682553
1,-0.3148968597402959,-0.6927515051124514,-0.4648577730233012,0.3950561731323628,-0.1963836766349201,-0.3337605434469746,0.4299340782120139,-0.6730024883737376,A,A,-0.734809
2,0.6575786809615809,1.0495877705619734,-0.0292213340895228,0.5698000198255928,-0.0488040121222156,-1.2886524407493514,-0.5461042740412737,1.4122460130072478,A,B,0.730582
3,-1.9114963112643124,-0.6031884569066248,0.5210008631400757,0.3575781009347846,-1.9874526629903744,-0.5835861166635888,-1.144097320353737,0.1261305173899266,A,C,1.275016
4,-0.0625431280778106,0.2536536674067324,1.613552836593198,0.4634262014924583,-2.660828815997048,-1.4072385214350829,-0.1572734035519005,-0.7382648812265631,C,C,0.129379


## 3. Save Synthetic Data
Save the generated data for use in the finetuning pipeline.

In [5]:
df.to_csv('../models/synthetic_binary.csv', index=False)
df_multi.to_csv('../models/synthetic_multiclass.csv', index=False)
df_reg.to_csv('../models/synthetic_regression.csv', index=False)

## 4. Documentation
- **Schema:** 8 numeric, 2 categorical, 1 target.
- **Distributions:** Numeric: standard normal; Categorical: uniform over A/B/C; Target: binary/multiclass/regression.
- **Reproducibility:** Set random seeds as needed.
- **Usage:** Data is ready for use in TabPFN finetuning pipeline.