# Creating a simple synthetic dataset using random normal and random gamma distributions

In [5]:
# ============================================
# Dependent Samples Dataset (Before-After Environmental Data)
# 1000 rows by default, saved to the current working directory (normally the notebook's folder)
# ============================================

import os
import numpy as np
import pandas as pd

def make_before_after_csv(
    output_name: str = "Forest_Study_raw.csv",
    n: int = 1000,
    seed: int = 1442,
) -> pd.DataFrame:
    """Generate a synthetic paired (before/after) forest dataset and save it as CSV.

    Each row is one site with categorical descriptors (region, vegetation
    type, treatment arm), static covariates (elevation, slope, canopy cover)
    and four pre/post measurement pairs (biomass, PM2.5, soil moisture,
    temperature).  The post value of every pair is derived from its pre
    value plus a shift and noise, so the two columns are dependent samples.

    Args:
        output_name: File name of the CSV.  It is joined onto the current
            working directory; an absolute path is used unchanged because
            ``os.path.join`` discards the preceding components in that case.
        n: Number of rows (sites) to generate.
        seed: Seed for the random stream; the same seed always yields the
            same table.

    Returns:
        The generated ``DataFrame`` (the same data that was written to disk,
        which is saved without the index column).
    """
    # Draw from a private legacy RandomState rather than calling
    # np.random.seed(): the stream is the same as the globally seeded one,
    # but the process-wide RNG state used by other code is left untouched.
    rng = np.random.RandomState(seed)
    output_path = os.path.join(os.getcwd(), output_name)

    # NOTE: the order of the rng calls below determines the generated values;
    # do not reorder them if the dataset must stay reproducible.

    # --- Categorical descriptors -------------------------------------------
    regions = rng.choice(['NorthForest', 'SouthForest', 'EastForest', 'WestForest'], size=n)
    veg = rng.choice(['Pine', 'Oak', 'Mixed', 'Scrub'], size=n, p=[0.35, 0.30, 0.25, 0.10])
    treatment = rng.choice(['Control', 'Intervention'], size=n)
    is_intervention = treatment == 'Intervention'

    # --- Static site covariates --------------------------------------------
    elevation = np.round(rng.normal(450, 120, size=n), 1)
    slope = np.round(rng.gamma(shape=2.0, scale=3.0, size=n), 2)
    canopy_cover = np.clip(np.round(rng.normal(60, 15, size=n), 1), 0, 100)

    # --- Biomass: post = pre - decline + noise -----------------------------
    pre_biomass = np.round(rng.normal(120, 25, size=n), 2)
    # Both candidate arrays are drawn in full; np.where then picks per row.
    decline = np.where(is_intervention,
                       rng.normal(12, 5, size=n),
                       rng.normal(4, 4, size=n))
    post_biomass = np.round(pre_biomass - decline + rng.normal(0, 3, size=n), 2)

    # --- PM2.5: improvement depends on treatment and region ----------------
    # NOTE(review): pre_pm25 is not floored while post_pm25 is floored at 2,
    # so a very low (or negative) pre value can pair with a higher post value.
    # Left as-is so the generated dataset stays unchanged.
    pre_pm25 = np.round(rng.normal(22, 8, size=n), 2)
    region_adjust = np.select(
        [regions == 'NorthForest', regions == 'SouthForest',
         regions == 'EastForest', regions == 'WestForest'],
        [1.0, -0.5, 0.3, -0.2], default=0.0
    )
    improvement = np.where(is_intervention,
                           rng.normal(5 + region_adjust, 2.0, size=n),
                           rng.normal(1.5 + 0.5 * region_adjust, 1.5, size=n))
    post_pm25 = np.round(
        np.clip(pre_pm25 - improvement + rng.normal(0, 1.2, size=n), 2, None), 2)

    # --- Soil moisture: fixed shift per treatment arm plus noise -----------
    pre_soil = np.round(np.clip(rng.normal(28, 7, size=n), 5, 60), 2)
    post_soil = np.round(np.clip(pre_soil + np.where(is_intervention, 3.0, 0.5)
                                 + rng.normal(0, 1.5, size=n), 5, 65), 2)

    # --- Temperature: small drift that does not depend on treatment --------
    pre_temp = np.round(rng.normal(18.5, 3.2, size=n), 2)
    post_temp = np.round(pre_temp + rng.normal(0.1, 0.8, size=n), 2)

    df = pd.DataFrame({
        'SiteID': np.arange(1, n + 1),
        'Region': regions,
        'VegType': veg,
        'Treatment': treatment,
        'Elevation_m': elevation,
        'Slope_deg': slope,
        'CanopyCover_pct': canopy_cover,
        'Pre_Biomass_Mg_ha': pre_biomass,
        'Post_Biomass_Mg_ha': post_biomass,
        'Pre_PM25_ug_m3': pre_pm25,
        'Post_PM25_ug_m3': post_pm25,
        'Pre_SoilMoisture_pct': pre_soil,
        'Post_SoilMoisture_pct': post_soil,
        'Pre_Temp_C': pre_temp,
        'Post_Temp_C': post_temp,
    })

    df.to_csv(output_path, index=False)
    print(f"✅ Saved file at: {output_path}")
    return df

# Run this line in your notebook: it builds the dataset with the default
# arguments (1000 rows, seed 1442), writes Forest_Study_raw.csv to the current
# working directory, and returns the DataFrame. As the last expression in the
# cell, the returned DataFrame is also displayed as the cell output.
make_before_after_csv()


✅ Saved file at: D:\SIDDHARTHA\MASTER'S TAMUCC\MATH 1442 Labs Modernization Project\Github\MATH-1442-Statistics-For-Life-Labs-Modernization-Project\Lab-09\Data_Dependent_Sample\Data_Generation\Forest_Study_raw.csv


Unnamed: 0,SiteID,Region,VegType,Treatment,Elevation_m,Slope_deg,CanopyCover_pct,Pre_Biomass_Mg_ha,Post_Biomass_Mg_ha,Pre_PM25_ug_m3,Post_PM25_ug_m3,Pre_SoilMoisture_pct,Post_SoilMoisture_pct,Pre_Temp_C,Post_Temp_C
0,1,SouthForest,Pine,Intervention,348.3,11.97,46.4,123.19,106.03,27.88,21.81,31.24,35.08,21.58,21.60
1,2,EastForest,Mixed,Control,514.8,3.95,41.8,125.28,115.39,0.13,2.00,28.52,31.01,17.82,18.10
2,3,WestForest,Mixed,Intervention,406.1,6.06,74.0,141.82,130.69,14.35,9.84,24.62,26.12,14.63,14.96
3,4,WestForest,Scrub,Control,393.2,16.18,57.9,96.71,96.01,22.76,20.02,29.12,29.09,19.20,18.80
4,5,SouthForest,Mixed,Control,567.1,5.63,38.0,90.31,72.56,22.25,22.76,17.42,18.30,17.22,16.85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,EastForest,Pine,Control,408.4,2.06,26.8,118.22,115.71,13.30,10.15,19.88,20.57,20.45,20.46
996,997,SouthForest,Oak,Control,542.5,3.95,52.2,151.99,142.99,26.63,26.02,21.20,21.67,16.81,17.08
997,998,NorthForest,Oak,Control,513.3,2.34,47.8,148.27,146.29,18.54,16.82,28.55,31.61,21.47,22.63
998,999,EastForest,Oak,Control,416.7,3.79,56.8,136.07,130.17,10.56,8.53,22.62,20.55,20.18,20.64
