In [1]:
import numpy as np
import pandas as pd


## 🔹 1. Simple Randomization (Coin Flip)

- Each unit (user/cluster) is independently randomized.
    * May result in imbalance (e.g., 7 treatment, 3 control).
	* Fine when large samples are available.

In [2]:

n_clusters = 10
assignment = np.random.choice([0, 1], size=n_clusters)
print(assignment)


[1 0 1 1 0 0 0 1 1 0]


## 🔹 2. Complete Randomization (Fixed Split)

- Ensure exact balance between groups.
- Guarantees equal treatment/control sizes (5 and 5 here).
- Common in A/B tests to avoid imbalance.

In [3]:
n_clusters = 5
half = n_clusters // 2

# Randomly decide which label gets the extra cluster
labels = [0, 1]
np.random.shuffle(labels)  # randomize label order

assignment = np.array([labels[0]]*half + [labels[1]]*(n_clusters - half))
np.random.shuffle(assignment)  # shuffle to randomize cluster assignment
print(assignment)

[0 0 1 1 0]


## 🔹 3. Blocked Randomization

Randomize within small blocks to maintain balance over time or within subgroups.
- Every 4 clusters → 2 control, 2 treatment.
- Prevents imbalance if experiment stops early.


In [4]:
n_clusters = 12
block_size = 4   # each block has 2 control, 2 treatment
assignment = []

for _ in range(n_clusters // block_size):
    block = np.array([0] * (block_size//2) + [1] * (block_size//2))
    np.random.shuffle(block)
    assignment.extend(block)

print(assignment)

[np.int64(1), np.int64(1), np.int64(0), np.int64(0), np.int64(1), np.int64(0), np.int64(1), np.int64(0), np.int64(1), np.int64(1), np.int64(0), np.int64(0)]


## 🔹 4. Stratified Randomization

First divide clusters into strata (based on covariates), then randomize within each stratum.

Example: Stratify clusters into “small” vs “large” based on size.
- Ensures treatment/control are balanced within strata.
- Useful if cluster size (or baseline metric) strongly influences outcome.

In [5]:


np.random.seed(42)
cluster_sizes = np.random.randint(50, 100, size=10)  # cluster sizes

df = pd.DataFrame({"cluster": range(10), "size": cluster_sizes})

df["stratum"] = np.where(df["size"] < 75, "small", "large")

assignments = []
for stratum, group in df.groupby("stratum"):
    n = len(group)
    assign = np.array([0]*(n//2) + [1]*(n - n//2))
    np.random.shuffle(assign)
    assignments.extend(assign)

df["treatment"] = assignments
print(df)

   cluster  size stratum  treatment
0        0    88   large          1
1        1    78   large          0
2        2    64   small          0
3        3    92   large          1
4        4    57   small          0
5        5    70   small          1
6        6    88   large          1
7        7    68   small          0
8        8    72   small          1
9        9    60   small          0


## 🔹 5. Re-Randomization / Constrained Randomization

### Simple re-randomization of the cluster-sizes
Randomize, then check balance on covariates. If imbalance is too high, randomize again.
- Guarantees covariate balance.
- Used in constrained randomization designs.

In [6]:
def randomize_until_balanced(cluster_sizes, threshold=5):
    n = len(cluster_sizes)
    while True:
        assignment = np.random.choice([0, 1], size=n)
        mean_diff = abs(cluster_sizes[assignment==1].mean() -
                        cluster_sizes[assignment==0].mean())
        if mean_diff < threshold:  # accept only if balanced
            return assignment

np.random.seed(42)
sizes = np.random.randint(50, 100, size=10)
assign = randomize_until_balanced(sizes, threshold=3)
print("Cluster sizes:", sizes)
print("Assignment:", assign)

Cluster sizes: [88 78 64 92 57 70 88 68 72 60]
Assignment: [1 1 1 0 0 0 0 0 0 1]


### Multi-variate re-rendomization
🧠 1️⃣ What it is

In single-covariate rerandomization (like your code), you only check one balance condition:

|mean(cluster size in treatment) − mean(cluster size in control)| ≤ threshold

But in real experiments, you often have several baseline covariates to balance:
- cluster size
- baseline engagement rate
- region
- average income
- etc.

You want all of them to be “well balanced” between treatment and control.

So you extend the rerandomization criterion to a vector of covariates.

⸻

⚙️ 2️⃣ How it’s done: the Mahalanobis distance approach

You compute a single multivariate balance metric that summarizes how different the two groups are across all covariates simultaneously.

That metric is the Mahalanobis distance between the treatment and control covariate means:

M = (\bar{x}_T - \bar{x}_C)’ \Sigma^{-1} (\bar{x}_T - \bar{x}_C)

where
- \bar{x}_T, \bar{x}_C = vectors of mean covariate values in treatment and control
- \Sigma = covariance matrix of covariates across all units

Then you:
1.	Randomize assignments.
2.	Compute M.
3.	If M < \text{threshold}, accept; otherwise, rerandomize.

This ensures overall covariate balance across all dimensions, not just one.

⸻


In [7]:
import numpy as np
import pandas as pd
from scipy.spatial import distance

# --------------------------
# Step 1: Define Mahalanobis distance
# --------------------------
def mahalanobis_distance(df_treat, df_ctrl, covariate_cols):
    # Convert covariates to numeric arrays
    x_t = df_treat[covariate_cols].astype(float).to_numpy()
    x_c = df_ctrl[covariate_cols].astype(float).to_numpy()
    
    # Mean vectors
    mean_t = x_t.mean(axis=0)
    mean_c = x_c.mean(axis=0)
    
    # Combine treatment + control for covariance estimation
    combined = np.vstack([x_t, x_c])
    cov = np.cov(combined, rowvar=False)
    inv_cov = np.linalg.inv(cov)
    
    # Use SciPy's mahalanobis() — requires two 1D vectors + inverse covariance
    M = distance.mahalanobis(mean_t, mean_c, inv_cov)
    return M

# --------------------------
# Step 2: Rerandomization loop
# --------------------------
def rerandomize_df(df, covariate_cols, threshold=0.3, max_iter=10000):
    n = len(df)
    for _ in range(max_iter):
        df["treatment"] = np.random.choice([0, 1], size=n)
        df_treat = df[df["treatment"] == 1]
        df_ctrl = df[df["treatment"] == 0]
        
        M = mahalanobis_distance(df_treat, df_ctrl, covariate_cols)
        
        if M < threshold:
            return df.copy(), M
    raise RuntimeError("Failed to find balanced randomization")

# --------------------------
# Step 3: Simulate cluster data
# --------------------------
np.random.seed(42)
df = pd.DataFrame({
    "cluster_id": range(100),
    "cluster_size": np.random.randint(50, 100, size=100),
    "baseline_rate": np.random.uniform(0.05, 0.15, size=100),
    "region": np.random.choice(["NA", "EU", "APAC"], size=100)
})

# Convert categorical variable 'region' to numeric dummies
df = pd.get_dummies(df, columns=["region"], drop_first=True)

# --------------------------
# Step 4: Run rerandomization
# --------------------------
covariates = ["cluster_size", "baseline_rate", "region_EU", "region_NA"]

balanced_df, M_final = rerandomize_df(df, covariates, threshold=0.3)
print(f"Accepted assignment with Mahalanobis distance: {M_final:.4f}")

# --------------------------
# Step 5: Check covariate balance
# --------------------------
summary = balanced_df.groupby("treatment")[covariates].mean().T
summary["abs_diff"] = abs(summary[0] - summary[1])
print(summary)

Accepted assignment with Mahalanobis distance: 0.2479
treatment              0          1  abs_diff
cluster_size   72.826087  75.129630  2.303543
baseline_rate   0.096849   0.099722  0.002872
region_EU       0.326087   0.277778  0.048309
region_NA       0.391304   0.351852  0.039452


 ## 🔹 6. Cluster Randomization 

Instead of randomizing individuals, you randomize entire clusters (groups of users).
- Example: In a social network, you don’t want friends in different arms because of interference. So you randomize at the cluster (friend group) level.- - Another example: Randomize whole schools into “new curriculum” vs “old curriculum.”

Steps:
1.	Identify clusters (e.g., schools, groups).
2.	Randomly assign whole clusters to treatment/control.
3.	All individuals in a cluster follow the cluster’s assignment.

In [8]:
import numpy as np
import pandas as pd

# Suppose we have 10 clusters (schools), each with multiple students
np.random.seed(42)
students = pd.DataFrame({
    "student_id": np.arange(1, 51),
    "cluster_id": np.repeat(np.arange(1, 11), 5)  # 10 clusters, 5 students each
})

# Randomly assign clusters to treatment/control
clusters = students["cluster_id"].unique()
treatment_clusters = np.random.choice(clusters, size=len(clusters)//2, replace=False)

# Mark assignment at cluster level
students["assignment"] = students["cluster_id"].apply(
    lambda c: "Treatment" if c in treatment_clusters else "Control"
)

print(students.head(15))

    student_id  cluster_id assignment
0            1           1  Treatment
1            2           1  Treatment
2            3           1  Treatment
3            4           1  Treatment
4            5           1  Treatment
5            6           2  Treatment
6            7           2  Treatment
7            8           2  Treatment
8            9           2  Treatment
9           10           2  Treatment
10          11           3    Control
11          12           3    Control
12          13           3    Control
13          14           3    Control
14          15           3    Control


## Stratified Block Randomization

This combines:
- Stratification: ensures balance across important covariates (e.g., gender, age group).
- Blocking: ensures balance within small blocks.

This is especially common in clinical trials where you want guaranteed balance across multiple dimensions.

Example:
- Suppose we want equal Treatment/Control assignment within each gender group.
- Within each stratum (Male/Female), we also randomize in blocks of size 4 to keep things balanced.

In [9]:
import numpy as np
import pandas as pd

np.random.seed(123)
# Simulated participants
participants = pd.DataFrame({
    "id": np.arange(1, 21),
    "gender": np.random.choice(["Male", "Female"], size=20)
})

def stratified_block_randomization(df, stratify_col, block_size=4):
    assignments = []
    for stratum, group in df.groupby(stratify_col):
        group_ids = group["id"].tolist()
        np.random.shuffle(group_ids)
        
        # Break into blocks
        for i in range(0, len(group_ids), block_size):
            block = group_ids[i:i+block_size]
            # Assign half to treatment, half to control
            half = len(block) // 2
            assignments.extend([(pid, "Treatment") for pid in block[:half]])
            assignments.extend([(pid, "Control") for pid in block[half:]])
    
    return pd.DataFrame(assignments, columns=["id", "assignment"])

assignments = stratified_block_randomization(participants, "gender", block_size=4)
participants = participants.merge(assignments, on="id")

print(participants.sort_values("id"))

    id  gender assignment
0    1    Male    Control
1    2  Female    Control
2    3    Male    Control
3    4    Male    Control
4    5    Male  Treatment
5    6    Male    Control
6    7    Male  Treatment
7    8  Female    Control
8    9  Female  Treatment
9   10    Male  Treatment
10  11  Female  Treatment
11  12  Female  Treatment
12  13    Male    Control
13  14  Female    Control
14  15    Male    Control
15  16  Female  Treatment
16  17    Male  Treatment
17  18  Female    Control
18  19  Female    Control
19  20    Male  Treatment


## 7. 🎯 Matched-Pair Randomization

This is often the final step up in sophistication after stratified randomization — and yes, Meta and similar companies use it frequently in clustered or small-sample experiments.

⸻

🧠 Intuition
- You pair units (or clusters) that are very similar across covariates.
- Then, randomly assign one unit from each pair to treatment and the other to control.
- This ensures tight covariate balance, even with small samples or heterogeneous clusters.

It’s like a “1-to-1” stratification, but using similarity across multiple features instead of a single categorical stratum.

⸻

⚙️ When it’s used
- When you have a small number of clusters (e.g., schools, cities, regions).
- When each cluster has rich metadata (e.g., population, revenue, baseline activity rate).
- When covariate balance is critical.


In [10]:

from sklearn.metrics import pairwise_distances

# Step 1: Simulate cluster data
np.random.seed(42)
df = pd.DataFrame({
    "cluster_id": range(1, 21),
    "size": np.random.randint(50, 200, 20),
    "baseline_engagement": np.random.uniform(0.1, 0.5, 20),
    "region_code": np.random.choice([0, 1, 2], 20)  # NA, EU, APAC (encoded)
})

# Step 2: Normalize covariates for distance computation
covariates = ["size", "baseline_engagement", "region_code"]
X = (df[covariates] - df[covariates].mean()) / df[covariates].std()

# Step 3: Compute pairwise distances and find nearest pairs
dist_matrix = pairwise_distances(X)
np.fill_diagonal(dist_matrix, np.inf)

pairs = []
unpaired = set(df.index)

while unpaired:
    i = unpaired.pop()
    j = np.argmin(dist_matrix[i])
    if j in unpaired:
        unpaired.remove(j)
        pairs.append((i, j))
print(pairs)

# Step 4: Randomly assign one in each pair to treatment, one to control
assignments = {}
for i, j in pairs:
    if np.random.rand() < 0.5:
        assignments[i], assignments[j] = "treatment", "control"
    else:
        assignments[i], assignments[j] = "control", "treatment"

df["group"] = df.index.map(assignments)

# Step 5: Check balance
summary = df.groupby("group")[covariates].mean().T
summary["abs_diff"] = abs(summary["treatment"] - summary["control"])
print(summary)

[(0, np.int64(12)), (1, np.int64(17)), (2, np.int64(18)), (3, np.int64(13)), (4, np.int64(6)), (7, np.int64(10))]
group                   control  treatment  abs_diff
size                 142.666667  137.50000  5.166667
baseline_engagement    0.228145    0.22406  0.004085
region_code            1.000000    1.00000  0.000000


## Social Media effect and handling of network effects through network clustering
1️⃣ What is network interference?
- Standard A/B tests assume the Stable Unit Treatment Value Assumption (SUTVA):
The outcome of one user is independent of the treatment assignment of other users.
- Network interference breaks this assumption:
- A user’s reaction can be influenced by friends, followers, or peers who are in treatment or control.
- Example: A new “reaction button” might spread via social influence, not just individual exposure.

Consequences:
- Standard randomization estimates can be biased.
- Treatment effect estimates ignore spillover effects, leading to over- or underestimation.



2️⃣ Key approaches to handle network interference

A. Cluster / Group Randomization
- Randomize at the group / network cluster level instead of individual users.
- Example: Randomize entire social circles or communities.
- Pros: Reduces cross-treatment contamination.
- Cons: Fewer clusters → lower statistical power.

B. Exposure modeling
- Model each user’s effective treatment as a combination of:
- Their own treatment
- Fraction of treated neighbors
- Creates continuous treatment intensity (0–1).
- Analysis requires regression or mixed models.

Conceptual formula:
$$
Y_i = \beta_0 + \beta_1 T_i + \beta_2 \frac{\text{number of treated neighbors}}{\text{number of neighbors}} + \epsilon_i
$$



C. Graph-based randomization
- Use the network graph structure to assign treatment:
- Maximize separation between treated and control neighbors.
- Avoid direct neighbors being in different groups.
- Techniques:
- Graph coloring or community detection.
- Assign entire detected communities to treatment or control.

⸻

D. Randomized saturation design
- Randomize treatment probability within clusters, not absolute treatment.
- Example:
- Cluster A: 70% treated, 30% control
- Cluster B: 30% treated, 70% control
- Allows estimation of spillover effects using different exposure levels.

In [11]:
import networkx as nx
from scipy import stats

np.random.seed(42)

# --------------------------
# Step 1: Create a network
# --------------------------
n_users = 50

# Create a random graph (Erdos-Renyi)
p_edge = 0.1  # probability of connection
G = nx.erdos_renyi_graph(n_users, p_edge, seed=42)
print(G)
# Convert to dataframe for easier handling
df = pd.DataFrame({
    "user_id": list(G.nodes),
})

# Compute neighbors for each user
df['neighbors'] = df['user_id'].apply(lambda u: list(G.neighbors(u)))
df['n_neighbors'] = df['neighbors'].apply(len)

# --------------------------
# Step 2: Individual randomization
# --------------------------
df['treatment_individual'] = np.random.choice([0,1], size=n_users)

# Define outcome with spillover: 
# Base rate + 0.2 * own treatment + 0.1 * fraction of treated neighbors + noise
def compute_outcome(row, treatment_col='treatment_individual'):
    frac_treated_neighbors = np.mean([df.loc[n, treatment_col] for n in row['neighbors']]) if row['n_neighbors'] > 0 else 0
    return 0.05 + 0.2*row[treatment_col] + 0.1*frac_treated_neighbors + np.random.normal(0,0.01)

df['outcome_individual'] = df.apply(compute_outcome, axis=1)

# Estimate naive treatment effect (ignores spillover)
treat_mean = df[df['treatment_individual']==1]['outcome_individual'].mean()
control_mean = df[df['treatment_individual']==0]['outcome_individual'].mean()
naive_effect = treat_mean - control_mean
print(f"Naive treatment effect (individual randomization): {naive_effect:.3f}")

# --------------------------
# Step 3: Cluster randomization
# --------------------------
# Assign users into 5 clusters (randomly)
n_clusters = 5
cluster_ids = np.random.choice(range(n_clusters), size=n_users)
df['cluster_id'] = cluster_ids

# Randomize treatment at cluster level
treat_clusters = np.random.choice(range(n_clusters), size=n_clusters//2, replace=False)
df['treatment_cluster'] = df['cluster_id'].apply(lambda c: 1 if c in treat_clusters else 0)

# Compute outcome again using spillover
df['outcome_cluster'] = df.apply(lambda row: compute_outcome(row, treatment_col='treatment_cluster'), axis=1)

# Estimate effect using cluster-level treatment
treat_mean_cl = df[df['treatment_cluster']==1]['outcome_cluster'].mean()
control_mean_cl = df[df['treatment_cluster']==0]['outcome_cluster'].mean()
cluster_effect = treat_mean_cl - control_mean_cl
print(f"Estimated treatment effect (cluster randomization): {cluster_effect:.3f}")

# --------------------------
# Step 4: Observations
# --------------------------
print("\nNumber of treated vs control users")
print(df.groupby('treatment_cluster')['user_id'].count())

Graph with 50 nodes and 120 edges
Naive treatment effect (individual randomization): 0.203
Estimated treatment effect (cluster randomization): 0.189

Number of treated vs control users
treatment_cluster
0    27
1    23
Name: user_id, dtype: int64


# --------------------------------------------------------------------------------
# Practice

## 🔹 1. Simple Randomization (Coin Flip)

- Each unit (user/cluster) is independently randomized.
    * May result in imbalance (e.g., 7 treatment, 3 control).
	* Fine when large samples are available.

In [12]:
sample_size = 10
labels = np.random.choice([0, 1], size=sample_size)
labels

array([1, 0, 0, 1, 0, 1, 0, 1, 0, 1])

## 🔹 2. Complete Randomization (Fixed Split)

- Ensure exact balance between groups.
- Guarantees equal treatment/control sizes (5 and 5 here).
- Common in A/B tests to avoid imbalance.

In [13]:
sample_size = 10
half = sample_size // 2

labels = np.array([0 , 1])
np.random.shuffle(labels)

assignments = np.array([labels[0]]* half + [labels[1]]* (sample_size - half))
np.random.shuffle(assignments)
assignments

array([0, 0, 1, 0, 1, 0, 0, 1, 1, 1])

## 🔹 3. Blocked Randomization

Randomize within small blocks to maintain balance over time or within subgroups.
- Every 4 clusters → 2 control, 2 treatment.
- Prevents imbalance if experiment stops early.


In [14]:
sample_size = 100
block_size = 10
labels = np.array([0,1])
half = block_size // 2
assignments = []
for _ in range(sample_size // block_size):
    np.random.shuffle(labels)
    block = np.array([labels[0]] * half + [labels[1]] * (block_size - half))
    np.random.shuffle(block)
    assignments.extend(block)
assignments[0:10]


[np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(1),
 np.int64(1),
 np.int64(1),
 np.int64(1)]

## 🔹 4. Stratified Randomization

First divide clusters into strata (based on covariates), then randomize within each stratum.

Example: Stratify clusters into “small” vs “large” based on size.
- Ensures treatment/control are balanced within strata.
- Useful if cluster size (or baseline metric) strongly influences outcome.

In [15]:
np.random.seed(42)
num_clusters = 30
cluster_sizes = np.random.randint(50, 100,  size=num_clusters)
df = pd.DataFrame({"cluster":range(num_clusters),
                   "size": cluster_sizes
                   })
df["stratum"] = np.where(df['size'] > 75, "large", "small" )
grp_labels = [0, 1]
assigments = []
for startum, group in df.groupby("stratum"):
    np.random.shuffle(grp_labels)
    grp_size = len(group)
    print(grp_size)
    half = grp_size // 2
    labels = np.array([grp_labels[0]] * half + [grp_labels[1]] * (grp_size - half) )
    np.random.shuffle(labels)
    assigments.extend(labels)
print(assigments)
df["group"] = assigments
df

12
18
[np.int64(1), np.int64(0), np.int64(0), np.int64(1), np.int64(1), np.int64(0), np.int64(1), np.int64(1), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(1), np.int64(1), np.int64(0), np.int64(1), np.int64(1), np.int64(1), np.int64(0), np.int64(1), np.int64(1), np.int64(0), np.int64(0), np.int64(0)]


Unnamed: 0,cluster,size,stratum,group
0,0,88,large,1
1,1,78,large,0
2,2,64,small,0
3,3,92,large,1
4,4,57,small,1
5,5,70,small,0
6,6,88,large,1
7,7,68,small,1
8,8,72,small,0
9,9,60,small,1


### 🎯 What stratified randomization means

Stratified randomization ensures balance within key subgroups (strata).
You divide your population into homogeneous groups (strata) based on an important feature — then randomize within each stratum.

⸻

🧠 Two ways to form strata
1.	Derived strata (like cluster size)
- You compute a stratum variable (e.g., cluster_size > median → "large" else "small").
- Used when you don’t already have a clear categorical variable.
2.	Existing strata (like age, gender, region)
- You use an existing feature directly.
- For instance, if your dataset has an age column, you can stratify by it:
- Continuous → convert into bins (e.g., 18–25, 26–40, 41–60, 60+).
- Categorical → use as-is (e.g., region, gender).

In [16]:
import numpy as np
import pandas as pd

# Simulated data
np.random.seed(42)
df = pd.DataFrame({
    "user_id": range(1, 501),
    "age": np.random.randint(18, 70, 500),
    "region": np.random.choice(["NA", "EU", "APAC"], size=500)
})

# Step 1: Create age strata (existing feature -> binned)
df["age_group"] = pd.cut(df["age"], bins=[17, 25, 40, 60, 100],
                         labels=["18-25", "26-40", "41-60", "60+"])

# Step 2: Stratified randomization
df["group"] = None
for stratum, group_data in df.groupby("age_group"):
    treated = np.random.choice(group_data.index, size=len(group_data)//2, replace=False)
    df.loc[group_data.index, "group"] = "control"
    df.loc[treated, "group"] = "treatment"

# Step 3: Verify balance by stratum
print(df.groupby(["age_group", "group"]).size().unstack())

group      control  treatment
age_group                    
18-25           41         40
26-40           59         59
41-60          105        104
60+             46         46


  for stratum, group_data in df.groupby("age_group"):
  print(df.groupby(["age_group", "group"]).size().unstack())


In [17]:
df[df["group"] == 0 ].size

0

In [18]:
df[df["group"] == 1 ].size

0

In [19]:
(df[(df["group"] == 0) & (df['stratum'] == 'small')].size)/(df[(df["group"] == 0) & (df['stratum'] == 'large')].size) 

KeyError: 'stratum'

In [None]:
(df[df['stratum'] == 'small'].size) / df[df['stratum'] == 'large'].size 

1.5

## 🔹 5. Re-Randomization / Constrained Randomization

Randomize, then check balance on covariates. If imbalance is too high, randomize again.
- Guarantees covariate balance.
- Used in constrained randomization designs.

In [None]:
n_clusters = 100
cluster_sizes = np.random.randint(50, 100, size = n_clusters)
threshold = 3

while True:
    assignments = np.random.choice([0, 1], size=n_clusters)
    label_1_mean = cluster_sizes[assignments == 1].mean()
    label_0_mean = cluster_sizes[assignments == 0].mean()
    if abs(label_1_mean -label_0_mean) <= threshold:
        break;
        

print(cluster_sizes)
print(assignments)



[94 83 96 89 72 73 78 87 63 88 97 98 67 98 82 88 66 86 60 79 55 69 76 73
 72 94 55 64 99 79 60 61 96 88 80 61 55 78 79 96 56 66 57 57 52 94 79 94
 99 69 51 59 59 90 70 85 73 51 52 51 82 81 78 56 76 64 97 96 74 65 91 55
 92 82 63 67 78 50 59 72 51 79 58 65 88 96 72 77 76 66 99 59 62 68 91 89
 51 90 60 85]
[1 1 1 0 1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 1 1 0 1
 1 1 0 1 1 1 1 0 0 0 0 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 1 0 0 0 0 0 0 0 0 1 0
 1 1 0 0 1 1 0 1 0 1 0 0 0 1 0 0 1 1 1 0 1 1 1 0 0 0]


### Multi-variate re-randomization

In [None]:
##-------TO-DO--------------

 ## 🔹 6. Cluster Randomization 

In [None]:
n_cluster = 20
sample_size = 500
df = pd.DataFrame({
        "student_id": np.arange(1, sample_size + 1),
        "cluster_id": np.repeat(np.arange(1, n_cluster + 1) , sample_size // n_cluster)
})
cluster_ids = df['cluster_id'].unique()
treatment_clusters = np.random.choice(cluster_ids, size=len(cluster_ids)//2, replace=False)
print(treatment_clusters)
df['group'] = df.cluster_id.apply(lambda c: "treatment" if c in treatment_clusters else "control")
print(df.head(100))


[10  1  8 12  5 11 13 20  3  7]
    student_id  cluster_id      group
0            1           1  treatment
1            2           1  treatment
2            3           1  treatment
3            4           1  treatment
4            5           1  treatment
..         ...         ...        ...
95          96           4    control
96          97           4    control
97          98           4    control
98          99           4    control
99         100           4    control

[100 rows x 3 columns]


array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20])

## Social Media effect and handling of network effects through net

In [35]:
#lets first create a network
n_users = 500
p_connection = 0.1

network = nx.erdos_renyi_graph(n=n_users, p=p_connection, seed=42)

# save the graph information in a dataframe
df = pd.DataFrame({
                    "user_ids": list(network.nodes)
                    })
df["neighbours"] = df['user_ids'].apply(lambda n: list(network.neighbors(n)))
df["n_neighbours"] = df['neighbours'].apply(len)

# assign treatment and control at the user level
df['individual_treatment'] = np.random.choice([0,1], n_users)

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
subset_nodes = list(range(30))
subgraph = network.subgraph(subset_nodes)
pos = nx.spring_layout(subgraph, seed=42)  # spring layout = visually appealing placement
nx.draw(
    subgraph,
    pos,
    with_labels=True,
    node_color="skyblue",
    node_size=600,
    edge_color="gray",
    font_size=9,
)
plt.title("Erdős–Rényi Random Graph (n=30, p=0.1)")
plt.show()


In [66]:
# calculate the outcome of the spill over effect using
# outcome = Base rate + independent_treatment_effect_rate * own treatment + neighbor_treatment_effect_rate * fraction of treated neighbors + noise
# Assume:  base rate = 0.05, independent_treatment_effect_rate= 0.2, neighbor_treatment_effect_rate = 0.1

def compute_outcome(df, row, treatment_col):
    frac_treated_neighbors = np.mean([df.loc[n, treatment_col] for n in row['neighbours']]) if row['n_neighbours'] > 0 else 0
    return 0.05 + 0.2 * row[treatment_col] + 0.1 * frac_treated_neighbors + np.random.normal(0, 0.01)



In [67]:
df['outcome_individual'] = df.apply(lambda row: compute_outcome(df, row, treatment_col='individual_treatment'), axis=1)
# Calculate the difference in means of the treatment and control groups
treat_ind_mean = np.mean(df[df['individual_treatment'] == 1]['outcome_individual'])
ctrl_ind_mean = np.mean(df[df['individual_treatment'] == 0]['outcome_individual'])
naive_randomization_effect = treat_ind_mean - ctrl_ind_mean
print(f"Naive randomization mean difference: {naive_randomization_effect}")

Naive randomization mean difference: 0.20106006823711992


In [68]:
# lets do random cluster asssigments
n_clusters = 10
cluster_ids = np.random.choice(range(1, n_clusters), size=n_users)
np.random.shuffle(cluster_ids)
df['cluster_ids'] = cluster_ids

In [73]:
treatment_clusters = np.random.choice(range(1,n_clusters), size=n_clusters//2)
df['cluster_treatment'] = df['cluster_ids'].apply(lambda c_id: 1 if c_id in treatment_clusters else 0)
df['cluster_outcome'] = df.apply(lambda row: compute_outcome(df=df, row=row, treatment_col='cluster_treatment'), axis = 1)


In [74]:
treat_clust_mean = np.mean(df[df['cluster_treatment'] == 1]['cluster_outcome'])
ctrl_clust_mean = np.mean(df[df['cluster_treatment'] == 0]['cluster_outcome'])
cluster_randomization_effect = treat_clust_mean - ctrl_clust_mean
print(f"Naive randomization mean difference: {naive_randomization_effect}")
print(f"Cluster randomization mean difference: {cluster_randomization_effect}")

Naive randomization mean difference: 0.20106006823711992
Cluster randomization mean difference: 0.19823547269830577
