In [9]:
print("Hello World")

Hello World


In [10]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

In [11]:
def generate_synthetic_data(n=100):
    """Generate synthetic data for treatment and control groups."""
    np.random.seed(42)
    data = pd.DataFrame({
        'id': np.arange(n),
        'age': np.random.randint(20, 70, n),
        'risk_score': np.random.rand(n),
        'treatment': np.random.choice([0, 1], n)  # 0 = control, 1 = treated
    })
    return data

In [12]:
data = generate_synthetic_data()
print(data.head())

   id  age  risk_score  treatment
0   0   58    0.423401          1
1   1   48    0.394882          0
2   2   34    0.293488          1
3   3   62    0.014080          0
4   4   27    0.198842          1


In [13]:
print(data.describe())

               id         age  risk_score   treatment
count  100.000000  100.000000  100.000000  100.000000
mean    49.500000   44.070000    0.484004    0.560000
std     29.011492   14.447575    0.270784    0.498888
min      0.000000   20.000000    0.000520    0.000000
25%     24.750000   33.000000    0.255530    0.000000
50%     49.500000   43.000000    0.463418    1.000000
75%     74.250000   58.000000    0.694968    1.000000
max     99.000000   69.000000    0.997740    1.000000


In [14]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          100 non-null    int64  
 1   age         100 non-null    int32  
 2   risk_score  100 non-null    float64
 3   treatment   100 non-null    int64  
dtypes: float64(1), int32(1), int64(2)
memory usage: 2.9 KB
None


In [15]:
def balanced_risk_set_matching(data):
    """Perform Balanced Risk Set Matching (BRSM)."""
    treated = data[data['treatment'] == 1]  
    control = data[data['treatment'] == 0] 
    
    matched_pairs = []

    nbrs = NearestNeighbors(n_neighbors=1, metric='euclidean').fit(control[['age', 'risk_score']])
    distances, indices = nbrs.kneighbors(treated[['age', 'risk_score']])

    for i, index in enumerate(indices.flatten()):
        matched_pairs.append((treated.iloc[i]['id'], control.iloc[index]['id']))

    return matched_pairs


In [16]:
if __name__ == "__main__":
    data = generate_synthetic_data() 
    matches = balanced_risk_set_matching(data)
    
    print("Matched Pairs (Treated ID, Control ID):")
    print(matches)


Matched Pairs (Treated ID, Control ID):
[(np.float64(0.0), np.float64(6.0)), (np.float64(2.0), np.float64(76.0)), (np.float64(4.0), np.float64(48.0)), (np.float64(8.0), np.float64(16.0)), (np.float64(9.0), np.float64(10.0)), (np.float64(11.0), np.float64(89.0)), (np.float64(14.0), np.float64(89.0)), (np.float64(15.0), np.float64(22.0)), (np.float64(17.0), np.float64(22.0)), (np.float64(18.0), np.float64(89.0)), (np.float64(21.0), np.float64(6.0)), (np.float64(24.0), np.float64(77.0)), (np.float64(27.0), np.float64(55.0)), (np.float64(28.0), np.float64(45.0)), (np.float64(31.0), np.float64(93.0)), (np.float64(32.0), np.float64(52.0)), (np.float64(33.0), np.float64(60.0)), (np.float64(34.0), np.float64(76.0)), (np.float64(36.0), np.float64(19.0)), (np.float64(37.0), np.float64(50.0)), (np.float64(39.0), np.float64(99.0)), (np.float64(41.0), np.float64(48.0)), (np.float64(42.0), np.float64(6.0)), (np.float64(43.0), np.float64(87.0)), (np.float64(46.0), np.float64(76.0)), (np.float64(53.0)