In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import LogLocator, ScalarFormatter
from matplotlib.patches import Rectangle

In [None]:
# Set up directory paths. `root` is the parent of the current working
# directory; data and figure folders are resolved relative to it.
from pathlib import Path

root = Path.cwd().parent

DATA_DIR = root / 'Dataset'
OUTPUT = root / 'Results' / 'Figures'
# Create the figure folder up front so the plt.savefig() calls later in the
# notebook do not fail when the folder does not exist yet.
OUTPUT.mkdir(parents=True, exist_ok=True)

# 2024 SWPA sample dataset (first row holds the column names)
df = pd.read_csv(DATA_DIR / "2024_swpa_dataset.csv", header=0)

# USGS produced-water table (first row holds the column names)
df_produced = pd.read_excel(DATA_DIR / "USGS_produced_water_data_2023.xlsx", header=0)

In [None]:
# Inspect which distinct 'Sample Group' codes occur in the dataset
df['Sample Group'].unique()

In [None]:
# Extract produced water (PW) data for SWPA.
# State, basin and county conditions are combined into ONE mask so that every
# condition narrows the selection; county names can repeat across states, so
# the county list alone is not sufficient.
# NOTE(review): assumes the 'STATE' column spells the state out as
# 'Pennsylvania' -- confirm against the USGS table.
swpa_counties = ['Greene', 'Washington', 'Westmoreland', 'Beaver', 'Allegheny',
                 'Armstrong', 'Butler', 'Fayette', 'Indiana', 'Lawrence']
is_swpa_pw = (
    (df_produced['STATE'] == 'Pennsylvania')
    & (df_produced['BASIN'] == 'Appalachian')
    & df_produced['COUNTY'].isin(swpa_counties)
)
# .copy() gives an independent frame, so adding the label below never writes
# into a view of df_produced.
df_produced_pa = df_produced.loc[is_swpa_pw].copy()
df_produced_pa.loc[:, 'Group'] = 'PW'

# Exclude stream and AMD or rainwater affected water samples from 2024 swpa dataset.
# na=False treats a missing 'Source' as "not a stream", keeping the mask
# strictly boolean so that `~` cannot fail on missing values.
is_stream = df['Source'].str.contains('stream', na=False)
df_test = df[~is_stream]
df_test = df_test.loc[~df_test['Site ID'].isin(['Sample_022', 'Sample_029', 'Sample_091']), :]
# df_test = df_test.dropna(subset=['Cl','Br'])

# The combined codes 'HS/I' and 'HS/S' are selected by two groups each, so
# those samples appear in both of the corresponding frames below.

# control
control = df_test[df_test['Sample Group'].isin(['C'])].copy()
control.loc[:, 'Group'] = 'Control'

# hotspot
hotspot = df_test[df_test['Sample Group'].isin(['HS', 'HS/I', 'HS/S'])].copy()
hotspot.loc[:, 'Group'] = 'Hotspot'

# impoundment
impoundment = df_test[df_test['Sample Group'].isin(['I', 'HS/I'])].copy()
impoundment.loc[:, 'Group'] = 'Impoundment'

# spill
spill = df_test[df_test['Sample Group'].isin(['S', 'HS/S'])].copy()
spill.loc[:, 'Group'] = 'Spill'

In [None]:
# Check the (rows, columns) size of the filtered produced-water table
df_produced_pa.shape

In [None]:
# Combine all data: stack the Cl, Br and Group columns of every sample group
# and of the produced-water table into one frame with a fresh 0..n-1 index.
df_combined = pd.concat(
    [
        frame[['Cl', 'Br', 'Group']]
        for frame in (control, hotspot, impoundment, spill, df_produced_pa)
    ],
    ignore_index=True,
)

# Row-wise chloride-to-bromide ratio
df_combined['Cl/Br'] = df_combined['Cl'].div(df_combined['Br'])



# Figure 2D
****

In [None]:
# Color palette keyed by group name (used for the boxplot fill colors)
group_color = {
    'PW':  'gray',   #"#4d4d4d",
    'Control': "#91bfdb",
    'Impoundment': "#fee090",
    'Spill': "#fc8d59",
    'Hotspot': "#d73027",
    'Extra': 'blue'
}

# Make boxplots: one box of Cl/Br per group
fig, ax = plt.subplots(figsize=(8, 6))
bp = sns.boxplot(x='Group', y='Cl/Br', data=df_combined, hue='Group', palette=group_color, legend=False,
            flierprops=dict(marker='o', markersize=2, linestyle='none'),  # Small dots for outliers
            capprops=dict(linewidth=0), ax=ax)  # Remove whisker caps
ax.set_xlabel('Group')
ax.set_ylabel('Cl/Br Mass Ratio')
ax.set_ylim(-200, 5200)  # Extended range with padding
# Remove grid lines
ax.grid(False)

# Add black border around the plot
for spine in ax.spines.values():
    spine.set_edgecolor('black')
    spine.set_linewidth(1)

# Remove ticks
ax.tick_params(left=False, bottom=False)

# Add sample counts (n=) next to the top of each box.
# count() only counts rows with a non-missing Cl/Br ratio, so the label
# reflects the values that can actually be drawn in the box rather than
# every row of the group.
ratio_by_group = df_combined.groupby('Group')['Cl/Br']
group_counts = ratio_by_group.count()
group_q75 = ratio_by_group.quantile(0.75)  # 75th percentile = top of each box
group_order = [tick.get_text() for tick in ax.get_xticklabels()]

for i, group in enumerate(group_order):
    if group not in group_counts.index:
        continue
    n = group_counts[group]
    if n == 0:
        # No usable ratio in this group: there is no box to annotate
        continue
    # Place text to the right of the box, slightly above the top
    ax.text(i + 0.1, group_q75[group] + 100, f'n={n}',
            ha='left', va='center', fontsize=12)

# Clean background (white)
ax.set_facecolor('white')
fig.tight_layout()
fig.savefig(OUTPUT/"Figure 2D.pdf", dpi=600, bbox_inches="tight", pad_inches=0.2)
plt.show()

# Figure 2C
****

In [None]:
# Mixing lines: tabulated (Cl, Cl/Br) points for the two reference curves.
# Each table has a 'Cl/Br' column followed by a 'Cl' column, 13 rows each.

# Halite mixing curve
halite_mixing = pd.DataFrame(
    {
        'Cl/Br': [
            18, 68, 93, 143, 267, 392, 516,
            1986, 2469, 2947, 4825, 35726, 100000,
        ],
        'Cl': [
            0.18, 0.679982, 0.929973, 1.429955, 2.67991, 3.929865, 5.17982,
            20.17928, 25.1791, 30.17892, 50.1782, 500.162, 5000,
        ],
    }
)

# Appalachian brine mixing curve
app_brine_mixing = pd.DataFrame(
    {
        'Cl/Br': [
            18, 92, 94, 97, 98, 99, 99,
            100, 100, 100, 100, 100, 100,
        ],
        'Cl': [
            0.18, 9.179982, 13.679973, 22.679955, 45.17991, 67.679865, 90.17982,
            360.17928, 450.1791, 540.17892, 900.1782, 9000.162, 90000,
        ],
    }
)



In [None]:
# Label every row of each mixing table with the name of its curve
halite_mixing.loc[:, 'Group'] = 'Halite Mixing'
app_brine_mixing.loc[:, 'Group'] = 'Appalachian Brine Mixing'

# Matplotlib marker code per group name
group_marker = dict(
    PW='s',
    Control='P',
    Impoundment='D',
    Spill='D',
    Hotspot='D',
    Extra='o',
)

In [None]:
# Regroup to avoid overlap in plots: here the combined codes 'HS/I' and 'HS/S'
# are selected by the hotspot group only, so each sample lands in one group.

# control
control_2c = df_test[df_test['Sample Group'].isin(['C'])].assign(Group='Control')

# hotspot
hotspot_2c = df_test[df_test['Sample Group'].isin(['HS', 'HS/I', 'HS/S'])].assign(Group='Hotspot')

# impoundment
impoundment_2c = df_test[df_test['Sample Group'].isin(['I'])].assign(Group='Impoundment')

# spill
spill_2c = df_test[df_test['Sample Group'].isin(['S'])].assign(Group='Spill')

# Stack the plotting columns of the four groups with a fresh 0..n-1 index
df_combined_2c = pd.concat(
    [
        part[['Cl', 'Br', 'Group']]
        for part in (control_2c, hotspot_2c, impoundment_2c, spill_2c)
    ],
    ignore_index=True,
)

# Row-wise chloride-to-bromide ratio
df_combined_2c['Cl/Br'] = df_combined_2c['Cl'].div(df_combined_2c['Br'])

In [None]:
# Display the rows labelled 'Impoundment' for Figure 2C (Sample Group == 'I')
impoundment_2c

In [None]:
# Figure 2C: Cl/Br mass ratio against Cl on log-log axes, with the two
# reference mixing lines drawn behind the sample points.
fig, ax = plt.subplots(figsize=(8, 6))

# Mixing lines (thin dashed curves)
for curve, curve_label, curve_color in (
    (halite_mixing, 'Halite Mixing', 'pink'),
    (app_brine_mixing, 'Appalachian Brine Mixing', 'black'),
):
    ax.plot(curve['Cl'], curve['Cl/Br'],
            label=curve_label, color=curve_color, linewidth=1, linestyle='--')

# SWPA samples: one scatter series per group, in order of first appearance
for group, group_data in df_combined_2c.groupby('Group', sort=False):
    if group_data.empty or group == 'PW':
        continue

    # Control points are larger and drawn without an outline;
    # every other group gets a thin black outline.
    is_control = group == 'Control'
    ax.scatter(
        group_data['Cl'],
        group_data['Cl/Br'],
        label=group,
        color=group_color.get(group, 'black'),
        marker=group_marker.get(group, 'o'),
        edgecolor='none' if is_control else 'black',
        linewidth=0 if is_control else 0.5,
        s=80 if is_control else 50,
    )

# Log scale on both axes (set before the formatters, which a scale change resets)
ax.set_xscale('log')
ax.set_yscale('log')

# Both axes span 1 to 100000 and show plain numbers instead of powers of ten
ax.set_xlim(1, 100000)
ax.set_ylim(1, 100000)
for axis in (ax.xaxis, ax.yaxis):
    plain_formatter = ScalarFormatter()
    plain_formatter.set_scientific(False)
    axis.set_major_formatter(plain_formatter)

# Labels and legend
ax.set_xlabel('Cl (mg/L)', fontsize=12)
ax.set_ylabel('Cl/Br Mass Ratio', fontsize=12)
ax.legend()

# Black border around the plot
for spine in ax.spines.values():
    spine.set_edgecolor('black')
    spine.set_linewidth(1)

# Tick marks: major ticks cross the axis line (inside and outside),
# minor ticks point inside. The layout call sits between the two on purpose
# to keep the original call order.
ax.tick_params(axis='both', which='major', direction='inout', length=6)
fig.tight_layout()
ax.tick_params(axis='both', which='minor', direction='in', length=3)

fig.savefig(OUTPUT/"Figure 2C.pdf", dpi=600, bbox_inches="tight", pad_inches=0.2)
# plt.show() is left commented out; the cell ends with the save call.