# ARCGIS

In [1]:
import os
import pandas as pd

In [2]:
arcgis_path = '../../data/arcgis'

## Merging
I will directly merge the manipulated and cleaned files into the master

In [6]:
custom_data_directory = '../../data/custom_data'

In [7]:
master_arcgis = pd.DataFrame()

## Name Match
Since they aren't homogenous I will namematch them with fuzzy logic

In [8]:
from fuzzywuzzy import process, fuzz

def clean_location_name(name):
    '''
    Cleans location names by removing common suffixes that skew fuzzy matching.
    '''
    ignore_suffixes = ["census designated place", "city", "road", "street", "highway", "borough", "area", "island"]
    for suffix in ignore_suffixes:
        if name.lower().endswith(suffix):
            name = name.rsplit(suffix, 1)[0].strip()
    return name

def match_names(trusted_df, trusted_col, other_df, other_col):
    '''
    Matches names between two dataframe columns, automatically rejecting low-confidence matches.
    
    Returns:
    - Matched DataFrame with original and final names
    - Unmatched names for manual review
    '''

    # Clean and strip names
    trusted_unique = set(trusted_df[trusted_col].astype(str).str.strip().str.lower().apply(clean_location_name).unique())
    other_unique = set(other_df[other_col].astype(str).str.strip().str.lower().apply(clean_location_name).unique())

    matched_other_to_trusted = {}
    unmatched_names = []

    for name in other_unique:
        for trusted_name in trusted_unique:
            if trusted_name in name or name in trusted_name:
                matched_other_to_trusted[name] = trusted_name
                break
        else:
            match, score = process.extractOne(name, list(trusted_unique), scorer=fuzz.token_sort_ratio)
            
            if score >= 95:  # Almost perfect match
                matched_other_to_trusted[name] = match
            else:
                unmatched_names.append(name)  # Log unmatched names

    matched_df = pd.DataFrame(list(matched_other_to_trusted.items()), columns=["Original", "Matched"])
    unmatched_df = pd.DataFrame(unmatched_names, columns=["Unmatched"])

    return matched_df, unmatched_df



## Borough and census area
This file is peculiar because it serves as bridge between others. I will save it as its own because of the flexibility this brings in the master merge operations (being able to merge and manipulate data without the whole arcgis overhead.

In [9]:
comm_and_reg = pd.read_csv(os.path.join(arcgis_path, 'Community_Regions_Overview.csv'))
comm_and_reg.columns

Index(['X', 'Y', 'OBJECTID', 'CommunityID', 'CommunityName',
       'IncorporationType', 'AreaType', 'BoroughCensusArea',
       'ANCSARegionalCorporation', 'NativeRegionalHealthCarePro',
       'ANCSARegionalNonProfit', 'SchoolDistrict', 'DCRAAlaskaRegion',
       'ClimateRegion', 'EnergyRegion', 'EconomicRegion', 'ARDORRegion',
       'RecordersOffice', 'Section', 'Township', 'Range', 'Meridian',
       'sqMiLand', 'sqMiWater'],
      dtype='object')

In [10]:
comm_and_reg_columns_to_keep = ['CommunityName',
       'IncorporationType', 'BoroughCensusArea',
       'NativeRegionalHealthCarePro',
       'SchoolDistrict', 'ClimateRegion', 'EnergyRegion', 'EconomicRegion',
       'sqMiLand', 'sqMiWater']
comm_and_reg = comm_and_reg[comm_and_reg_columns_to_keep]

In [11]:
custom_data_directory = '../../data/custom_data'
comm_and_reg.to_csv(os.path.join(custom_data_directory, 'comm_and_reg_custom.csv'))

## ACS Files

Files being studied and merged:
- Household income and benefits: https://dcra-cdo-dcced.opendata.arcgis.com/datasets/5b535254c9f649f9b7ffe52a476052c4_3/explore
- Family Income and Benefits: https://dcra-cdo-dcced.opendata.arcgis.com/datasets/5b535254c9f649f9b7ffe52a476052c4_3/explore?location=50.133297%2C34.600000%2C3.89
- Income and Poverty: https://dcra-cdo-dcced.opendata.arcgis.com/datasets/f4b63e8b2c3042a98130505df1422730_9/explore?location=50.133297%2C34.600000%2C3.89
- Population Characteristics Housing: https://dcra-cdo-dcced.opendata.arcgis.com/datasets/14f0b44db2da4f55b2adf7bb5602bead_6/explore
- Population Characteristics Race: https://dcra-cdo-dcced.opendata.arcgis.com/datasets/30ab75bf594f48349c4ff536957869f8_1/explore?location=42.493141%2C34.600000%2C3.48
- Population Characteristics Sex: https://dcra-cdo-dcced.opendata.arcgis.com/datasets/fa3ad93528b24f7c9ba298a68c04920d_0/explore?location=45.540112%2C34.600000%2C3.63
- Population Characteristics Age: https://dcra-cdo-dcced.opendata.arcgis.com/datasets/63cb9ba34b914f7c89a2394b47d4319e_5/explore?location=46.265046%2C34.600000%2C3.67

### Timing convention
Since the data spans 5 year intervals, being the mean of these intervals, I just take the end year as representation.This isn't perfect, but the best way to not go assume linearity or growth distributions.

### Merging into master_arcgis

In [12]:
age_df = pd.read_csv(os.path.join(arcgis_path, 'ACS_Population_Characteristics__Age.csv'))
to_merge = pd.DataFrame()
age_df.columns

Index(['X', 'Y', 'OBJECTID', 'Place', 'Geo_Src', 'e_Pop_Age_Total',
       'm_Pop_Age_Total', 'e_Pop_Age_under_5', 'm_Pop_Age_under_5',
       'e_Pop_Age_5_9', 'm_Pop_Age_5_9', 'e_Pop_Age_10_14', 'm_Pop_Age_10_14',
       'e_Pop_Age_15_19', 'm_Pop_Age_15_19', 'e_Pop_Age_20_24',
       'm_Pop_Age_20_24', 'e_Pop_Age_25_34', 'm_Pop_Age_25_34',
       'e_Pop_Age_35_44', 'm_Pop_Age_35_44', 'e_Pop_Age_45_54',
       'm_Pop_Age_45_54', 'e_Pop_Age_55_59', 'm_Pop_Age_55_59',
       'e_Pop_Age_60_64', 'm_Pop_Age_60_64', 'e_Pop_Age_65_74',
       'm_Pop_Age_65_74', 'e_Pop_Age_75_84', 'm_Pop_Age_75_84',
       'e_Pop_Age_85_plus', 'm_Pop_Age_85_plus', 'e_Pop_Age_Median_age',
       'm_Pop_Age_Median_age', 'e_Pop_Age_under_18', 'm_Pop_Age_under_18',
       'e_Pop_Age_18_plus', 'm_Pop_Age_18_plus', 'e_Pop_Age_21_plus',
       'm_Pop_Age_21_plus', 'e_Pop_Age_62_plus', 'm_Pop_Age_62_plus',
       'e_Pop_Age_65_plus', 'm_Pop_Age_65_plus', 'Start_Year', 'End_Year',
       'PlaceID', 'IsMostRecent'],
   

In [13]:
age_df['e_Pop_Age_18_plus']

0       514606
1         2863
2         4663
3       212674
4        10786
         ...  
4989       126
4990        22
4991       785
4992      1673
4993       440
Name: e_Pop_Age_18_plus, Length: 4994, dtype: int64

In [14]:
to_merge['Place'] = age_df['Place']
to_merge['Year'] = age_df['End_Year']
to_merge['median_age'] = age_df['e_Pop_Age_Median_age']
to_merge['n_pop_under_14'] = age_df['e_Pop_Age_under_5'] + age_df['e_Pop_Age_5_9'] + age_df['e_Pop_Age_10_14']
# For some reason pop_under_18 has only NaN until 2017, but 18_plus and total don't.
# So I compute the population under 18 like this
to_merge['n_pop_under_18'] = age_df['e_Pop_Age_Total'] - age_df['e_Pop_Age_18_plus']
to_merge['n_pop_18-65'] = age_df['e_Pop_Age_18_plus'] - age_df['e_Pop_Age_65_plus']
to_merge['n_pop_65-85'] = age_df['e_Pop_Age_65_plus'] - age_df['e_Pop_Age_85_plus']
to_merge['n_pop_over_85'] = age_df['e_Pop_Age_85_plus']

In [15]:
master_arcgis = to_merge.copy()

In [16]:
# I might drop this in favor of the IRS data, but I feel like creating a complete arcgis file
fam_inc_df = pd.read_csv(os.path.join(arcgis_path, 'ACS_Population_Characteristics__Family_Income_and_Benefits_-_Borough.csv'))
to_merge = pd.DataFrame()
fam_inc_df.columns

Index(['OBJECTID', 'Place', 'Geo_Src', 'e_Families_Total', 'm_Families_Total',
       'e_Family_Income_under_10000', 'm_Family_Income_under_10000',
       'e_Family_Income_10000_14999', 'm_Family_Income_10000_14999',
       'e_Family_Income_15000_24999', 'm_Family_Income_15000_24999',
       'e_Family_Income_25000_34999', 'e_Family_Income_35000_49999',
       'm_Family_Income_35000_49999', 'e_Family_Income_50000_74999',
       'm_Family_Income_50000_74999', 'e_Family_Income_75000_99999',
       'm_Family_Income_75000_99999', 'e_Family_Income_100000_149999',
       'm_Family_Income_100000_149999', 'e_Family_Income_150000_199999',
       'm_Family_Income_150000_199999', 'e_Family_Income_200000_Plus',
       'm_Family_Income_200000_Plus', 'e_Family_Median_Income',
       'm_Family_Median_Income', 'e_Family_Mean_Income',
       'm_Family_Mean_Income', 'Start_Year', 'End_Year', 'PlaceID', 'IsActive',
       'ShapeSTArea', 'ShapeSTLength'],
      dtype='object')

In [17]:
to_merge['Place'] = fam_inc_df['Place']
to_merge['Year'] = fam_inc_df['End_Year']
to_merge['n_families'] = fam_inc_df['e_Families_Total']
to_merge['median_family_income'] = fam_inc_df['m_Family_Median_Income']

In [18]:
master_arcgis = master_arcgis.merge(to_merge, left_on=['Place', 'Year'], right_on=['Place', 'Year'], how='left')

In [19]:
hous_inc_df = pd.read_csv(os.path.join(arcgis_path, 'ACS_Population_Characteristics__Household_Income_and_Benefits.csv'))
to_merge = pd.DataFrame()
hous_inc_df.columns

Index(['X', 'Y', 'OBJECTID', 'Place', 'Geo_Src', 'e_Households_Total',
       'm_Households_Total', 'e_Household_Inc_under_10000',
       'm_Household_Inc_under_10000', 'e_Household_Inc_10000_14999',
       'm_Household_Inc_10000_14999', 'e_Household_Inc_25000_34999',
       'm_Household_Inc_25000_34999', 'e_Household_Inc_35000_49999',
       'm_Household_Inc_35000_49999', 'e_Household_Inc_50000_74999',
       'm_Household_Inc_50000_74999', 'e_Household_Inc_75000_99999',
       'm_Household_Inc_75000_99999', 'e_Household_Inc_100000_149999',
       'm_Household_Inc_100000_149999', 'e_Household_Inc_15000_24999',
       'm_Household_Inc_15000_24999', 'e_Household_Inc_150000_199999',
       'm_Household_Inc_150000_199999', 'e_Household_Inc_200000_Plus',
       'm_Household_Inc_200000_Plus', 'e_Household_Median_Income',
       'm_Household_Median_Income', 'e_Household_Mean_Income',
       'm_Household_Mean_Income', 'e_Household_With_Earnings',
       'm_Household_With_Earnings', 'e_Househol

In [20]:
to_merge['Place'] = hous_inc_df['Place']
to_merge['Year'] = hous_inc_df['End_Year']
to_merge['n_households'] = hous_inc_df['e_Households_Total']

In [21]:
master_arcgis = master_arcgis.merge(to_merge, left_on=['Place', 'Year'], right_on=['Place', 'Year'], how='left')

In [22]:
hous_df = pd.read_csv(os.path.join(arcgis_path, 'ACS_Population_Characteristics__Housing.csv'))
to_merge = pd.DataFrame()
hous_df.columns

Index(['X', 'Y', 'OBJECTID', 'Place', 'Geo_Src', 'e_Total_Units',
       'm_Total_Units', 'e_Total_Occupied_Units', 'm_Total_Occupied_Units',
       'e_Vacant_Units', 'm_Vacant_Units', 'e_Owner_Vacancy_Rate',
       'm_Owner_Vacancy_Rate', 'e_Rental_Vacancy_Rate',
       'm_Rental_Vacancy_Rate', 'Start_Year', 'End_Year', 'PlaceID'],
      dtype='object')

In [23]:
to_merge['Place'] = hous_df['Place']
to_merge['Year'] = hous_df['End_Year']
to_merge['n_houses'] = hous_df['e_Total_Units']
to_merge['n_occupied_houses'] = hous_df['e_Total_Occupied_Units']
to_merge['house_vacancy_rate'] = (hous_df['e_Vacant_Units']) / (hous_df['e_Total_Units']) * 100

In [24]:
master_arcgis = master_arcgis.merge(to_merge, left_on=['Place', 'Year'], right_on=['Place', 'Year'], how='left')

In [25]:
inc_and_pov_df = pd.read_csv(os.path.join(arcgis_path, 'ACS_Population_Characteristics__Income_and_Poverty.csv'))
to_merge = pd.DataFrame()
inc_and_pov_df.columns

Index(['X', 'Y', 'OBJECTID', 'Place', 'Geo_Src', 'e_Per_Capita_Income',
       'm_Per_Capita_Income', 'e_median_Household_Income',
       'm_median_Household_Income', 'e_median_Family_Income',
       'm_median_Family_Income', 'e_Pop_of_Poverty_Det',
       'm_Pop_of_Poverty_Det', 'e_Pop_Below_125pct_Poverty',
       'm_Pop_Below_125pct_Poverty', 'e_Pop_Below_Poverty',
       'm_Pop_Below_Poverty', 'e_Pop_18_64_of_Poverty_Det',
       'm_Pop_18_64_of_Poverty_Det', 'e_Pop_18_64_Below_Poverty',
       'm_Pop_18_64_Below_Poverty', 'e_Pop_65_plus_of_Poverty_Det',
       'm_Pop_65_plus_of_Poverty_Det', 'e_Pop_65_plus_Below_Poverty',
       'm_Pop_65_plus_Below_Poverty', 'e_Pop_under_18_of_Poverty_Det',
       'm_Pop_under_18_of_Poverty_Det', 'e_Pop_under_18_Below_Poverty',
       'm_Pop_under_18_Below_Poverty', 'e_Households_Below_Poverty',
       'm_Households_Below_Poverty', 'Start_Year', 'End_Year', 'PlaceID'],
      dtype='object')

In [26]:
to_merge['Place'] = inc_and_pov_df['Place']
to_merge['Year'] = inc_and_pov_df['End_Year']
to_merge['n_people_below_poverty'] = inc_and_pov_df['e_Pop_Below_Poverty']
to_merge['n_households_below_poverty'] = inc_and_pov_df['e_Households_Below_Poverty']
to_merge['n_people_under_18_below_poverty'] = inc_and_pov_df['e_Pop_under_18_Below_Poverty']
to_merge['n_people_18-64_below_poverty'] = inc_and_pov_df['e_Pop_18_64_Below_Poverty']
to_merge['n_people_65_plus_below_poverty'] = inc_and_pov_df['e_Pop_65_plus_Below_Poverty']

In [27]:
master_arcgis = master_arcgis.merge(to_merge, left_on=['Place', 'Year'], right_on=['Place', 'Year'], how='left')

In [28]:
master_arcgis

Unnamed: 0,Place,Year,median_age,n_pop_under_14,n_pop_under_18,n_pop_18-65,n_pop_65-85,n_pop_over_85,n_families,median_family_income,n_households,n_houses,n_occupied_houses,house_vacancy_rate,n_people_below_poverty,n_households_below_poverty,n_people_under_18_below_poverty,n_people_18-64_below_poverty,n_people_65_plus_below_poverty
0,Alaska,2011,33.8,153256,186097,461925,48440,4241,,,252920,304373,252920,16.904587,,11032.0,,,
1,Aleutians East Borough,2011,39.3,176,221,2750,110,3,204.0,9122.0,336,535,336,37.196262,,21.0,,,
2,Aleutians West Census Area,2011,41.5,666,825,4380,279,4,733.0,11704.0,1255,2268,1255,44.664903,,34.0,,,
3,Anchorage,2011,33.0,61765,74716,191928,19034,1712,,,105123,112804,105123,6.809156,,3804.0,,,
4,Anchorage,2011,33.0,61765,74716,191928,19034,1712,,,105123,112804,105123,6.809156,,3804.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6033,Wrangell,2023,44.3,367,433,1170,481,22,,,912,1300,912,29.846154,,,,,
6034,Yakutat,2023,53.2,52,66,300,130,10,,,207,391,207,47.058824,,,,,
6035,Yakutat,2023,53.2,52,66,300,130,10,,,207,322,207,35.714286,,,,,
6036,Yakutat,2023,53.2,52,66,300,130,10,,,207,391,207,47.058824,,,,,


In [29]:
race_df = pd.read_csv(os.path.join(arcgis_path, 'ACS_Population_Characteristics__Race.csv'))
to_merge = pd.DataFrame()
race_df.columns

Index(['X', 'Y', 'OBJECTID', 'Place', 'Geo_Src', 'e_Total_Pop', 'm_Total_Pop',
       'e_One_Race', 'm_One_Race', 'e_AmInd_AKNat', 'm_AmInd_AKNat', 'e_Asian',
       'm_Asian', 'e_Black_AfricanAm', 'm_Black_AfricanAm',
       'e_NatHI_PacIslander', 'm_NatHI_PacIslander', 'e_Other_Race',
       'm_Other_Race', 'e_White', 'm_White', 'e_Two_or_More', 'm_Two_or_More',
       'e_Black_AfrAm_and_AmInd_AKNat', 'm_Black_AfrAm_and_AmInd_AKNat',
       'e_White_and_AmInd_AKNat', 'm_White_and_AmInd_AKNat',
       'e_White_and_Asian', 'm_White_and_Asian', 'e_White_and_Black_AfricanAm',
       'm_White_and_Black_AfricanAm', 'Start_Year', 'End_Year', 'IsMostRecent',
       'PlaceID'],
      dtype='object')

In [30]:
to_merge['Place'] = race_df['Place']
to_merge['Year'] = race_df['End_Year']
to_merge['n_population_native'] = race_df['e_AmInd_AKNat']
to_merge['n_population_non_native'] = race_df['e_Total_Pop'] - race_df['e_AmInd_AKNat']
to_merge['n_population_white'] = race_df['e_White']
to_merge['n_population_black'] = race_df['e_Black_AfricanAm']
to_merge['n_population_asian'] = race_df['e_Asian']

In [31]:
master_arcgis = master_arcgis.merge(to_merge, left_on=['Place', 'Year'], right_on=['Place', 'Year'], how='left')

In [32]:
sex_df = pd.read_csv(os.path.join(arcgis_path, 'ACS_Population_Characteristics__Sex.csv'))
to_merge = pd.DataFrame()
sex_df.columns

Index(['X', 'Y', 'OBJECTID', 'Place', 'Geo_Src', 'e_Pop_Total', 'm_Pop_Total',
       'e_Pop_Male', 'm_Pop_Male', 'e_Pop_Female', 'm_Pop_Female',
       'Start_Year', 'End_Year', 'PlaceID', 'IsMostRecent'],
      dtype='object')

In [33]:
to_merge['Place'] = sex_df['Place']
to_merge['Year'] = sex_df['End_Year']
to_merge['n_population_male'] = sex_df['e_Pop_Male']
to_merge['n_population_female'] = sex_df['e_Pop_Female']
to_merge['n_total_population(ACS)'] = race_df['e_Total_Pop']

In [34]:
master_arcgis = master_arcgis.merge(to_merge, left_on=['Place', 'Year'], right_on=['Place', 'Year'], how='left')

## Percentage conversion
I think it would be clearer to express most measures as percentages instead of whole numbers.

Even tho I will regress with total population as control, this streamlines the process and allows for less misinterpretations, and comparisons at a glance.

In [35]:
# Population-based percentages
master_arcgis['p_pop_under_14'] = master_arcgis['n_pop_under_14'] / master_arcgis['n_total_population(ACS)'] * 100
master_arcgis['p_pop_under_18'] = master_arcgis['n_pop_under_18'] / master_arcgis['n_total_population(ACS)'] * 100
master_arcgis['p_pop_18_65'] = master_arcgis['n_pop_18-65'] / master_arcgis['n_total_population(ACS)'] * 100
master_arcgis['p_pop_65_85'] = master_arcgis['n_pop_65-85'] / master_arcgis['n_total_population(ACS)'] * 100
master_arcgis['p_pop_over_85'] = master_arcgis['n_pop_over_85'] / master_arcgis['n_total_population(ACS)'] * 100

# Households and houses
master_arcgis['p_families'] = master_arcgis['n_families'] / master_arcgis['n_households'] * 100
master_arcgis['p_households'] = master_arcgis['n_households'] / master_arcgis['n_houses'] * 100
master_arcgis['p_occupied_houses'] = master_arcgis['n_occupied_houses'] / master_arcgis['n_houses'] * 100

# Poverty measures
master_arcgis['p_people_below_poverty'] = master_arcgis['n_people_below_poverty'] / master_arcgis['n_total_population(ACS)'] * 100
master_arcgis['p_households_below_poverty'] = master_arcgis['n_households_below_poverty'] / master_arcgis['n_households'] * 100
master_arcgis['p_people_under_18_below_poverty'] = master_arcgis['n_people_under_18_below_poverty'] / master_arcgis['n_total_population(ACS)'] * 100
master_arcgis['p_people_18_64_below_poverty'] = master_arcgis['n_people_18-64_below_poverty'] / master_arcgis['n_total_population(ACS)'] * 100
master_arcgis['p_people_65_plus_below_poverty'] = master_arcgis['n_people_65_plus_below_poverty'] / master_arcgis['n_total_population(ACS)'] * 100

# Ethnicity and Race
master_arcgis['p_population_native'] = master_arcgis['n_population_native'] / master_arcgis['n_total_population(ACS)'] * 100
master_arcgis['p_population_non_native'] = master_arcgis['n_population_non_native'] / master_arcgis['n_total_population(ACS)'] * 100
master_arcgis['p_population_white'] = master_arcgis['n_population_white'] / master_arcgis['n_total_population(ACS)'] * 100
master_arcgis['p_population_black'] = master_arcgis['n_population_black'] / master_arcgis['n_total_population(ACS)'] * 100
master_arcgis['p_population_asian'] = master_arcgis['n_population_asian'] / master_arcgis['n_total_population(ACS)'] * 100

# Sex Ratio
master_arcgis['ratio_male_over_female'] = master_arcgis['n_population_male'] / master_arcgis['n_population_female']

# house_vacancy_rate remains unchanged as it is already in percentage,
# but I duplicate it to have all percentage at the end
to_merge['house_vacancy_rate'] = (hous_df['e_Vacant_Units']) / (hous_df['e_Total_Units']) * 100

## Sustinence use communities

In [52]:
substinence_use_df = pd.read_csv(os.path.join(arcgis_path, 'Subsistence_Use_Communities.csv'))

In [41]:
substinence_use_df.columns

Index(['OBJECTID', 'CommunityID', 'CommunityName', 'SubsistenceAreaIndicator',
       'PeriodStartDate', 'PeriodEndDate', 'Year', 'AsOfDate', 'IsMostRecent',
       'created_user', 'created_date', 'last_edited_user', 'last_edited_date',
       'x', 'y', 'GlobalID'],
      dtype='object')

### Check if any community changes status

In [46]:
# Drop NaN and filter for relevant columns
clean_df = substinence_use_df[['CommunityName', 'Year', 'SubsistenceAreaIndicator']].dropna()

# Check if SubsistenceAreaIndicator changes for each community over time
change_check = clean_df.groupby('CommunityName')['SubsistenceAreaIndicator'].nunique().reset_index()

# Communities where the indicator changes over time
changing_communities = change_check[change_check['SubsistenceAreaIndicator'] > 1]
if changing_communities.empty:
    print("No communities change status with time")

No communities change status with time


## Since they don't change only need community and status

In [53]:
substinence_use_df = substinence_use_df[['CommunityName', 'SubsistenceAreaIndicator']]
substinence_use_df.rename(columns={'CommunityName': 'commname', 'SubsistenceAreaIndicator': 'relying_on_substinence'}, inplace=True)
substinence_use_df

Unnamed: 0,commname,relying_on_substinence
0,Fox,NO
1,Skagway,YES
2,Tazlina,YES
3,Lakes,YES
4,Shishmaref,YES
...,...,...
408,Bettles,YES
409,Saint Lawrence Island,YES
410,Eagle Village,YES
411,Seward,NO


## Need to save it on its own and add it in the master for community

In [54]:
substinence_use_df.to_csv(os.path.join(custom_data_directory, 'substinence_use_status_custom.csv'))

# Save the master_arcgis to csv

In [36]:
custom_data_directory = '../../data/custom_data'

# remove duplicates that come up because of carthesian products. Cannot identify which has duplicates,
# but they have the same values for all colmuns so this is the cleanest way I thought of removing them
master_arcgis = master_arcgis.drop_duplicates(subset=['Place', 'Year'])

master_arcgis.to_csv(os.path.join(custom_data_directory, 'arcgis_custom.csv'))