# Data Analysis

In [1]:
import os
import pandas as pd
import statsmodels.formula.api as smf

## Load the master dataset

In [2]:
custom_data_directory = '../../data/custom_data'
master_df = pd.read_csv(os.path.join(custom_data_directory, 'master.csv'))

In [15]:
master_df.columns.tolist()

['Unnamed: 0',
 'year',
 'commname',
 'region',
 'subreg',
 'adj_households_in_community',
 'relying_on_substinence',
 'IncorporationType',
 'BoroughCensusArea',
 'NativeRegionalHealthCarePro',
 'SchoolDistrict',
 'ClimateRegion',
 'EnergyRegion',
 'EconomicRegion',
 'sqMiLand',
 'sqMiWater',
 'resource',
 'using',
 'trying',
 'harvesting',
 'giving',
 'receving',
 'units_harvested',
 'pounds_harvested',
 'harv_mes_units',
 'units_harvested_by_sample',
 'lbs_harvested_percapita',
 'lbs_harvested_by_sample',
 'lbs_used_percapita',
 'perc_contribution_to_harvest',
 'median_lbs_wildfood_use_percapita',
 'top_lbs_wildfood_use_percapita',
 'labor_force',
 'employed',
 'unemployed',
 'unemployment_rate',
 'median_age',
 'n_pop_under_14',
 'n_pop_under_18',
 'n_pop_18-65',
 'n_pop_65-85',
 'n_pop_over_85',
 'n_families',
 'median_family_income',
 'n_households',
 'n_houses',
 'n_occupied_houses',
 'house_vacancy_rate',
 'n_people_below_poverty',
 'n_households_below_poverty',
 'n_people_under

# Selecting columns for analysis

In [108]:
an_df = master_df[['year',
                    'commname',
                    'adj_households_in_community',
                    'relying_on_substinence',
                    'BoroughCensusArea',
                    'sqMiLand',
                    'sqMiWater',
                    'resource',
                    'using',
                    'trying',
                    'harvesting',
                    'giving',
                    'receving',
                    'units_harvested',
                    'pounds_harvested',
                    'harv_mes_units',
                    'units_harvested_by_sample',
                    'lbs_harvested_percapita',
                    'lbs_harvested_by_sample',
                    'lbs_used_percapita',
                    'perc_contribution_to_harvest',
                    #'median_lbs_wildfood_use_percapita',
                    #'top_lbs_wildfood_use_percapita',
                    'labor_force',
                    'employed',
                    'unemployed',
                    'unemployment_rate',
                    'median_age',
                    'n_families',
                    'median_family_income',
                    'n_households',
                    'n_houses',
                    'n_occupied_houses',
                    'house_vacancy_rate',
                    'n_total_population(ACS)',
                    'p_people_below_poverty',
                    'p_households_below_poverty',
                    'p_population_native',
                    'p_population_non_native',
                    'p_population_white',
                    'p_population_black',
                    'p_population_asian',
                    'ratio_male_over_female',
                    'local_tax_burden',
                    'federal_tax_burden',
                    'Gini',
                    'AGI',
                    'pop_density',
                    #'pop_water_density'
                    ]]

In [84]:
# Filter for years 2012-2022
an_df = an_df[(an_df["year"] >= 2011) & (an_df["year"] <= 2018)]
an_df.sort_values(by='year', inplace=True)
# an_df = an_df.dropna()

In [116]:
an_df.to_csv(os.path.join(custom_data_directory, 'analysis.csv'))

In [97]:
an_df = an_df[(an_df["year"] >= 2012) & (an_df["year"] <= 2018)]
continuous_communities = an_df.groupby("commname")["year"].nunique()
print(f"Communities with full data 2012-2018: {sum(continuous_communities == 7)}")


Communities with full data 2012-2018: 0


In [70]:
# Filter for years 2012-2022
filtered_df = an_df[(an_df["year"] >= 2012) & (an_df["year"] <= 2018)].dropna()

# Group by community and list unique years
community_years = filtered_df.groupby("commname")["year"].apply(lambda x: sorted(x.unique()))

# Find communities that have all years from 2012 to 2022
full_continuous_communities = community_years[community_years.apply(lambda years: years == list(range(2012, 2019)))]

# Count number of communities with full continuity
num_full_continuous_communities = len(full_continuous_communities)

# Print result
print("Number of communities with continuous data from 2012 to 2022:", num_full_continuous_communities)


Number of communities with continuous data from 2012 to 2022: 0


In [41]:
an_df = an_df.copy()  # Creates a new independent DataFrame

# Convert 'year' to numeric and avoid the warning
an_df.loc[:, "year"] = pd.to_numeric(an_df["year"], errors="coerce")  


In [71]:
import pandas as pd
from itertools import groupby

# Ensure 'year' exists and is properly formatted
if "year" not in an_df.columns:
    raise ValueError("Column 'year' is missing from the dataset.")

an_df["year"] = pd.to_numeric(an_df["year"], errors="coerce")  # Convert to numeric in case it's string
an_df = an_df.dropna(subset=["year"])  # Drop NaN years
an_df["year"] = an_df["year"].astype(int)  # Ensure it's integer

# Drop rows with NaNs but keep "year" (so we don't mistakenly drop it)
filtered_df = an_df.dropna()

# Get unique years per community
community_years = filtered_df.groupby("commname")["year"].unique().apply(sorted)

# Function to find the longest continuous span
def longest_continuous_span(years):
    spans = [sum(1 for _ in group) for _, group in groupby(enumerate(years), lambda x: x[0] - x[1])]
    return max(spans) if spans else 0

# Apply function to each community
community_spans = community_years.apply(longest_continuous_span)

# Find communities with at least 8 years of continuous data
valid_communities = community_spans[community_spans >= 8]

# Print results
print("\nNumber of communities with at least 8 years of continuous data:", len(valid_communities))
print("Max continuous span found:", community_spans.max())
print("Communities with at least 8 years:", valid_communities.index.tolist())



Number of communities with at least 8 years of continuous data: 0
Max continuous span found: 3
Communities with at least 8 years: []


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  an_df["year"] = pd.to_numeric(an_df["year"], errors="coerce")  # Convert to numeric in case it's string


In [99]:
to_run = an_df[(an_df["year"] >= 2012) & (an_df["year"] <= 2018)].dropna()


In [109]:
df_to_run = an_df[["trying", 
                       "local_tax_burden", 
                       "federal_tax_burden",
                       "unemployment_rate",
                       "AGI", 
                       # "Gini", 
                       "labor_force", 
                       "ratio_male_over_female",
                       "p_population_non_native",
                       "BoroughCensusArea", 
                       "year",
                      "resource",
                      "labor_force"]]
df_to_run = df_to_run.dropna()

# Run OLS with community (commname) and year fixed effects
# model = smf.ols(
#     "median_lbs_wildfood_use_percapita ~ local_tax_burden + federal_tax_burden + AGI_clean + n_people_below_poverty + unemployment_rate + ratio_male_over_female + C(commname) + C(year)",
#     data=df_to_run
# ).fit()

model = smf.ols(
    "trying ~ local_tax_burden + federal_tax_burden + AGI + labor_force + unemployment_rate + ratio_male_over_female + p_population_non_native + C(commname) + C(year) + C(resource)",
    data=df_to_run
).fit()

print(model.summary())


PatsyError: Error evaluating factor: NameError: name 'commname' is not defined
    trying ~ local_tax_burden + federal_tax_burden + AGI + labor_force + unemployment_rate + ratio_male_over_female + p_population_non_native + C(commname) + C(year) + C(resource)
                                                                                                                                                ^^^^^^^^^^^

In [113]:
import numpy as np

df_to_run["trying"] = np.log1p(df_to_run["trying"])

model = smf.ols(
    "trying ~ AGI + C(BoroughCensusArea) * C(year) + C(resource)",
    data=df_to_run
).fit()

print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                 trying   R-squared:                       0.743
Model:                            OLS   Adj. R-squared:                  0.701
Method:                 Least Squares   F-statistic:                     18.03
Date:                Fri, 28 Feb 2025   Prob (F-statistic):           1.32e-95
Time:                        11:45:37   Log-Likelihood:                 670.59
No. Observations:                 530   AIC:                            -1193.
Df Residuals:                     456   BIC:                            -877.0
Df Model:                          73                                         
Covariance Type:            nonrobust                                         
                                                                                coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------