# Dependencies

In [1]:
import pandas as pd
import scipy.stats as stats
import sqlite3
import os

# Query Table

In [2]:
def query_vw_groundwater(query):
    db_path = os.path.join('../Resources', 'Groundwater.db')
    conn = sqlite3.connect(db_path)
    df = pd.read_sql_query(query, conn)
    conn.close()
    return df

In [3]:
query = "SELECT * FROM measurements_gwe_avg_percent_change_table"
df = query_vw_groundwater(query)

# Create buckets

In [4]:
df['percent_change_gwe_bucket'] = pd.qcut(df['percent_change_gwe'], q=5, labels=['Low', 'Low-Medium', 'Medium', 'Medium-High', 'High'])

### Create a contingency table

In [5]:
df['percent_change_gwe_bucket'] = pd.qcut(df['percent_change_gwe'], q=5, labels=['Low', 'Low-Medium', 'Medium', 'Medium-High', 'High'])
contingency_table = pd.crosstab(df['percent_change_gwe_bucket'], df['well_use'])

chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

print(f"Chi-squared: {chi2}")
print(f"p-value: {p}")
print(f"Degrees of freedom: {dof}")
print(f"Expected frequencies: \n{expected}")

Chi-squared: 32929.70504279969
p-value: 0.0
Degrees of freedom: 28
Expected frequencies: 
[[ 1022.95409885 39476.4355019  47019.81510673  2419.94342376
   3382.23771494 13118.03608285   996.15092802  8288.42714294]
 [ 1187.28229955 45817.96306732 54573.11747127  2808.68515623
   3925.56320594 15225.32835395  1156.17344497  9619.88700077]
 [ 1216.67397611 46952.20616334 55924.0981255   2878.21534775
   4022.74218698 15602.23781072  1184.79500866  9858.03138094]
 [  677.0427944  26127.50292041 31120.09331388  1601.64103136
   2238.53609504  8682.18018438   659.30309943  5485.7005611 ]
 [  972.04683108 37511.89234702 44679.87598261  2299.51504091
   3213.9207971  12465.2175681    946.57751892  7875.95391426]]


### Create expected_table

In [6]:
columns = contingency_table.columns.tolist()
#removed = columns.pop(0)
columns
expected_table = pd.DataFrame(expected, columns=columns)

### expected_table

In [7]:
expected_table.head()

Unnamed: 0,Industrial,Irrigation,Observation,Other,Public Supply,Residential,Stockwatering,Unknown
0,1022.954099,39476.435502,47019.815107,2419.943424,3382.237715,13118.036083,996.150928,8288.427143
1,1187.2823,45817.963067,54573.117471,2808.685156,3925.563206,15225.328354,1156.173445,9619.887001
2,1216.673976,46952.206163,55924.098126,2878.215348,4022.742187,15602.237811,1184.795009,9858.031381
3,677.042794,26127.50292,31120.093314,1601.641031,2238.536095,8682.180184,659.303099,5485.700561
4,972.046831,37511.892347,44679.875983,2299.515041,3213.920797,12465.217568,946.577519,7875.953914


### contingency_table

In [8]:
contingency_table.head()

well_use,Industrial,Irrigation,Observation,Other,Public Supply,Residential,Stockwatering,Unknown
percent_change_gwe_bucket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Low,821,55639,37461,1858,2111,8125,187,9522
Low-Medium,1154,44674,50542,3352,4428,19347,891,9926
Medium,1602,29529,67599,3218,3540,19453,3392,9306
Medium-High,644,23947,30329,1960,3641,10302,354,5415
High,855,42097,47386,1620,3063,7866,119,6959


In [9]:
import pandas as pd

# Data for expected_table
expected_data = {
    'Industrial': [1022.954099, 1187.282300, 1216.673976, 677.042794, 972.046831],
    'Irrigation': [39476.435502, 45817.963067, 46952.206163, 26127.502920, 37511.892347],
    'Observation': [47019.815107, 54573.117471, 55924.098126, 31120.093314, 44679.875983],
    'Other': [2419.943424, 2808.685156, 2878.215348, 1601.641031, 2299.515041],
    'Public Supply': [3382.237715, 3925.563206, 4022.742187, 2238.536095, 3213.920797],
    'Residential': [13118.036083, 15225.328354, 15602.237811, 8682.180184, 12465.217568],
    'Stockwatering': [996.150928, 1156.173445, 1184.795009, 659.303099, 946.577519],
    'Unknown': [8288.427143, 9619.887001, 9858.031381, 5485.700561, 7875.953914]
}
expected_table = pd.DataFrame(expected_data)

# Data for contingency_table (observed values)
contingency_data = {
    'percent_change_gwe': ['Low', 'Low-Medium', 'Medium', 'Medium-High', 'High'],
    'Industrial': [821, 1154, 1602, 644, 855],
    'Irrigation': [55639, 44674, 29529, 23947, 42097],
    'Observation': [37461, 50542, 67599, 30329, 47386],
    'Other': [1858, 3352, 3218, 1960, 1620],
    'Public Supply': [2111, 4428, 3540, 3641, 3063],
    'Residential': [8125, 19347, 19453, 10302, 7866],
    'Stockwatering': [187, 891, 3392, 354, 119],
    'Unknown': [9522, 9926, 9306, 5415, 6959]
}
contingency_table = pd.DataFrame(contingency_data).set_index('percent_change_gwe')

# Rename columns
expected_table.columns = [f"{col}_e" for col in expected_table.columns]
contingency_table.columns = [f"{col}_c" for col in contingency_table.columns]

# Round and eliminate decimals in expected_table
expected_table = expected_table.round(0).astype(int)

# Reset index of expected_table to make concatenation easier
expected_table = expected_table.reset_index(drop=True)

# Create a combined dataframe by joining the two dataframes
combined_table = pd.concat([expected_table, contingency_table.reset_index()], axis=1)

# Reorder the columns to interleave them
interleaved_columns = [val for pair in zip(expected_table.columns, contingency_table.columns) for val in pair]
combined_table = combined_table[interleaved_columns]

# Format all numeric data with commas as thousand separators
combined_table = combined_table.applymap(lambda x: f"{x:,}" if isinstance(x, (int, float)) else x)

# Define the new index labels
new_index_labels = ['Low', 'Low-Medium', 'Medium', 'Medium-High', 'High']

# Rename the index using the new labels
combined_table.index = new_index_labels

display(combined_table)

#For chi-squared analysis, the expect table (tablename_e) has the values that should exist, while the contingency table (tablename_c) contains the actual values from the data.

Unnamed: 0,Industrial_e,Industrial_c,Irrigation_e,Irrigation_c,Observation_e,Observation_c,Other_e,Other_c,Public Supply_e,Public Supply_c,Residential_e,Residential_c,Stockwatering_e,Stockwatering_c,Unknown_e,Unknown_c
Low,1023,821,39476,55639,47020,37461,2420,1858,3382,2111,13118,8125,996,187,8288,9522
Low-Medium,1187,1154,45818,44674,54573,50542,2809,3352,3926,4428,15225,19347,1156,891,9620,9926
Medium,1217,1602,46952,29529,55924,67599,2878,3218,4023,3540,15602,19453,1185,3392,9858,9306
Medium-High,677,644,26128,23947,31120,30329,1602,1960,2239,3641,8682,10302,659,354,5486,5415
High,972,855,37512,42097,44680,47386,2300,1620,3214,3063,12465,7866,947,119,7876,6959


In [10]:
import pandas as pd
import numpy as np

# Your provided code up to combined_table creation...

# Remove formatting and convert back to numeric
combined_table_numeric = combined_table.replace({',': ''}, regex=True).apply(pd.to_numeric, errors='ignore')

# Compute the absolute differences between expected and contingency values
combined_table_diff = combined_table_numeric.copy()
for col in expected_table.columns:
    combined_table_diff[col.replace('_e', '_diff')] = combined_table_numeric[col.replace('_e', '_e')] - combined_table_numeric[col.replace('_e', '_c')]

# Rank the absolute differences from highest to lowest along the rows
df_abs_diff = combined_table_diff.filter(regex='_diff$').abs()
rank_columns = df_abs_diff.rank(method='dense', axis=1, ascending=False)

# Create a new column 'Rank' with the ranks for each row
combined_table_diff['Rank'] = rank_columns.min(axis=1)

# Define color mapping function
def color_diff(val):
    if val == combined_table_diff['Rank'].min():
        return 'background-color: lightgreen'
    elif val == combined_table_diff['Rank'].max():
        return 'background-color: tomato'
    elif val <= 3:
        color = np.interp(val, [1, 3], [255, 0])
        return f'background-color: rgb({255}, {int(color)}, 0)'
    else:
        color = np.interp(val, [3, combined_table_diff['Rank'].max()], [255, 0])
        return f'background-color: rgb({int(color)}, 165, 0)'

# Apply the color mapping function to the dataframe
styled_table = combined_table_diff.style.applymap(color_diff, subset=['Rank'])

styled_table

Unnamed: 0,Industrial_e,Industrial_c,Irrigation_e,Irrigation_c,Observation_e,Observation_c,Other_e,Other_c,Public Supply_e,Public Supply_c,Residential_e,Residential_c,Stockwatering_e,Stockwatering_c,Unknown_e,Unknown_c,Industrial_diff,Irrigation_diff,Observation_diff,Other_diff,Public Supply_diff,Residential_diff,Stockwatering_diff,Unknown_diff,Rank
Low,1023,821,39476,55639,47020,37461,2420,1858,3382,2111,13118,8125,996,187,8288,9522,202,-16163,9559,562,1271,4993,809,-1234,1.0
Low-Medium,1187,1154,45818,44674,54573,50542,2809,3352,3926,4428,15225,19347,1156,891,9620,9926,33,1144,4031,-543,-502,-4122,265,-306,1.0
Medium,1217,1602,46952,29529,55924,67599,2878,3218,4023,3540,15602,19453,1185,3392,9858,9306,-385,17423,-11675,-340,483,-3851,-2207,552,1.0
Medium-High,677,644,26128,23947,31120,30329,1602,1960,2239,3641,8682,10302,659,354,5486,5415,33,2181,791,-358,-1402,-1620,305,71,1.0
High,972,855,37512,42097,44680,47386,2300,1620,3214,3063,12465,7866,947,119,7876,6959,117,-4585,-2706,680,151,4599,828,917,1.0


# Chi-squared interpretation

1. Chi-squared Statistic

    Chi-squared: 336.11903653213204

    This is the chi-squared test statistic. It measures the discrepancy between the observed frequencies and the expected frequencies under the null hypothesis. A higher value indicates a greater discrepancy.

2. p-value

    p-value: 1.5281004230786005e-54

    The p-value represents the probability of observing a chi-squared statistic at least as extreme as the one calculated, under the null hypothesis. In this case, the p-value is extremely small (essentially zero for practical purposes), which suggests that the observed data is highly unlikely under the null hypothesis.

3. Degrees of Freedom

    Degrees of freedom: 28

    The degrees of freedom for a chi-squared test are typically calculated as (number of rows - 1) * (number of columns - 1) for a contingency table, or in other contexts, it could be related to the number of categories minus one. Here, 28 degrees of freedom indicate the complexity of the test given the number of categories or the structure of the data.

Interpretation

Given these results:

    High Chi-squared value: 336.12 is very high for a chi-squared statistic, indicating a large deviation between observed and expected frequencies.
    Extremely low p-value: 1.53e-54 suggests that the probability of obtaining such a chi-squared value (or higher) under the null hypothesis is extremely low.
    Degrees of freedom: 28 provides context for the chi-squared distribution used to determine the p-value.

Conclusion

Since the p-value is far below common significance levels (e.g., 0.05, 0.01), we reject the null hypothesis. This means there is strong evidence that the observed frequencies are not consistent with the expected frequencies under the null hypothesis. In other words, the differences between the observed and expected data are statistically significant, indicating a potentially meaningful effect or association in the context of your data.
Additional Considerations

    Context: It's important to consider the context of the test. What are the observed and expected frequencies? What hypothesis were you testing?
    Effect Size: While the test indicates statistical significance, consider the practical significance and the effect size to understand the real-world implications of the findings.
    Assumptions: Ensure that the assumptions for the chi-squared test (such as expected frequency counts) are met to validate the results.

# Expected versus 

Analysis:
Chi-squared Statistic and p-value:

    Chi-squared: 336.12
    p-value: 1.53e-54

These values indicate a highly significant result, suggesting that the observed distribution of well use across the percent change GWE buckets is significantly different from the expected distribution.
Degrees of Freedom:

    Degrees of freedom: 28

This is calculated as (number of rows - 1) * (number of columns - 1). Here, 5 rows (percent change GWE buckets) and 8 columns (well use categories) give (5-1) * (8-1) = 4 * 7 = 28 degrees of freedom.
Interpretation:

    Significant Deviation: The extremely low p-value indicates that the observed frequencies are significantly different from the expected frequencies. There is a significant association between the percent change GWE buckets and the well use categories.
    Category Analysis:
        Irrigation: Observed values deviate greatly from expected values across all buckets. For instance, in bucket 1, 483 observed vs. 341.89 expected; in bucket 2, 424 observed vs. 410.33 expected.
        Observation: Large deviations are seen especially in buckets 1 (374 observed vs. 475.84 expected) and 3 (645 observed vs. 340.27 expected).
        Residential: Bucket 2 has a significant deviation (193 observed vs. 146.32 expected).
        Unknown: Bucket 1 has a notable deviation (84 observed vs. 55.75 expected).

Practical Implications:

    Policy and Planning: The significant associations suggest that different well use categories respond differently to changes in groundwater levels (GWE buckets). Policies and resource allocation might need to be tailored based on well use categories.
    Further Investigation: It may be worthwhile to explore why certain categories, such as Irrigation and Observation, show large deviations in specific buckets. This could involve investigating specific regional practices or environmental factors affecting these categories.

Conclusion:

The chi-squared test reveals a significant relationship between percent change in groundwater levels and well use categories. This indicates that groundwater level changes do not uniformly affect all types of well uses. Consequently, water resource management strategies should consider these differences to effectively address the needs and impacts on various well use categories.

# Chi-squared (χ2χ2)
- The Chi-squared (χ2χ2) value is a statistic used to measure how much the observed counts in a contingency table deviate from the expected counts under the null hypothesis. It is commonly used in the Chi-squared test for independence and the Chi-squared goodness of fit test. Here’s what the 
- Purpose: To test whether there is a significant association between two categorical variables.
# p-value
- The p-value is a key concept in statistical hypothesis testing, including the Chi-squared test. It represents the probability of obtaining test results at least as extreme as the observed data, assuming that the null hypothesis is true.

# Chi-squared Value of 0.0

    Chi-squared (χ2χ2) value: This value measures the difference between observed and expected frequencies in each category. A χ2χ2 value of 0.0 means that the observed frequencies are exactly equal to the expected frequencies under the null hypothesis. In other words, there is no deviation at all between the observed and expected counts.

# p-value of 1.0

    p-value: The p-value represents the probability of obtaining a test statistic at least as extreme as the one observed, under the assumption that the null hypothesis is true. A p-value of 1.0 indicates that there is a 100% probability of observing a Chi-squared value of 0.0 (or more extreme) if the null hypothesis is true.
        Interpretation: This means that the observed data is perfectly consistent with the null hypothesis. In other words, there is no evidence to suggest any association between the variables. The observed counts match the expected counts exactly as they would if the variables were independent.