In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import re
import requests
import os

import numpy as np
import csv
from functools import reduce
import json

In [None]:

# ---------------------------------------------------------------------------
# Load every input table. County FIPS codes are read as integers by
# read_csv, so each county-level table gets a zero-padded, 5-character
# string key named `cntyfips` for the joins that follow.
# ---------------------------------------------------------------------------

# Solar sentiment scores, one file for cities and one for states
cities_solarsent = pd.read_csv("sent_allyears_allcities_v6.csv")
states_solarsent = pd.read_csv("sent_allyears_allstates_v6.csv")

# County <-> place crosswalk
crosswalk = pd.read_csv("cnty-place-crosswalk-pointbase.csv")

# County election results: build `cntyfips` from GEOID, then drop GEOID
election = pd.read_csv("election_cnty_2013-2022.csv")
election = election.assign(
    cntyfips=election["GEOID"].map("{:05d}".format)
).drop(columns=["GEOID"])

# County population: keep only the join key and the population column
cnty_pop = pd.read_csv("county_race.csv")
cnty_pop = cnty_pop.assign(
    cntyfips=cnty_pop["fips"].map("{:05d}".format)
)[["cntyfips", "population"]]

# Urban-rural classification: re-format the existing key in place
urbanrural = pd.read_csv("usda_urbanrural_cnty_avg_2013_2022.csv")
urbanrural["cntyfips"] = urbanrural["cntyfips"].map("{:05d}".format)

# Place-level census data (B table + S table + density)
placecensus = pd.read_csv("census_full_s_b_2023-2022-v3.csv")

# State-level energy generation
energygen_states = pd.read_csv("2013-2022-state-energyfromsolar.csv")

# National Risk Index, county table
nri = pd.read_csv("NRI_Table_Counties_CL.csv")

# Temperature, DNI and wind speed
nsrdb = pd.read_csv("df_nsrdb_2013_2022_fordatav7_mastercityover3.csv")

In [None]:
# Build the city-year table with county and place covariates attached.
# Every join below is an inner join, so a city-year row survives only if it
# has a match in every table that is merged in.

# Place identifier -> 7-character zero-padded string, then prefix it to form
# the GEO_ID key shared with the city sentiment table.
crosswalk["GEOID"] = crosswalk["GEOID"].apply(str).str.zfill(7)
crosswalk["GEO_ID"] = "1600000US" + crosswalk["GEOID"]

# Attach the county crosswalk to the city sentiment rows and keep only the
# columns needed downstream. The explicit .copy() makes the subset an
# independent frame, so the column assignment that follows is not a chained
# assignment (avoids SettingWithCopyWarning).
cities_solarsent_cntyfips = pd.merge(
    cities_solarsent, crosswalk, on=["GEO_ID"], how="inner"
)[
    ["GEO_ID", "year", "sent_score", "tweet_count", "user_count",
     "STATEFP", "PLACEFP", "cntyfips", "CountyName", "geometry"]
].copy()

# County FIPS as a zero-padded 5-character string, matching the county tables
cities_solarsent_cntyfips["cntyfips"] = (
    cities_solarsent_cntyfips["cntyfips"].map("{:05d}".format)
)

# County covariates: election results joined to county population
cnty_df = pd.merge(election, cnty_pop, on=["cntyfips"], how="inner")

# City sentiment + urban-rural classification + county covariates, matched on
# county and year; `population` here is the county population, so rename it.
solarsent_cntycov = (
    cities_solarsent_cntyfips
    .merge(urbanrural, on=["cntyfips", "year"], how="inner")
    .merge(cnty_df, on=["cntyfips", "year"], how="inner")
    .rename(columns={"population": "cntypop"})
)

# Add place-level census data and the NSRDB variables, matched on place and
# year, then rename columns so they line up with the state sentiment table.
solarsent_cntycov_censusb = (
    solarsent_cntycov
    .merge(placecensus, on=["GEO_ID", "year"], how="inner")
    .merge(nsrdb, on=["GEO_ID", "year"], how="inner")
    .rename(columns={"sent_score": "Sentiment Score", "STATEFP": "STATE_FIPS"})
)

In [None]:
# State-level sentiment, reduced to the join keys plus the score. rename()
# returns a new frame, so nothing is modified in place on a column subset of
# `states_solarsent` (the original inplace rename on that subset could raise
# SettingWithCopyWarning).
states_solarsent_sub = (
    states_solarsent[["STATE_FIPS", "year", "sent_score"]]
    .rename(columns={"sent_score": "State Sentiment Score"})
)

# Attach state sentiment and state energy generation to the city-level table,
# matching on state FIPS and year (inner joins: unmatched rows are dropped).
solarsent_master = (
    solarsent_cntycov_censusb
    .merge(states_solarsent_sub, on=["STATE_FIPS", "year"], how="inner")
    .merge(energygen_states, on=["STATE_FIPS", "year"], how="inner")
)

In [None]:
# ---------------------------------------------------------------------------
# Derived variables, added to `solarsent_master` in place.
# ---------------------------------------------------------------------------

# Energy generation relative to population, scaled by 100, plus its log.
# NOTE(review): `EnergyNetGen` arrives via the state-level energy table while
# `Population` appears to come from the place-level census table -- confirm
# that this ratio is the intended "per capita" measure.
solarsent_master["per_capita_energygen"] = (
    solarsent_master["EnergyNetGen"].div(solarsent_master["Population"]).mul(100)
)
solarsent_master["ln_per_cap_energygen"] = (
    solarsent_master["per_capita_energygen"].apply(np.log)
)

# Log of land area
solarsent_master["ln_landarea"] = solarsent_master["AREASQM"].apply(np.log)

# Lookup from the numeric `urbanrural` code (1-9) to a category label
cat_dict = {
    1: "Large Central Metro",
    2: "Large Fringe Metro",
    3: "Medium Metro",
    4: "Nonmetro",
    5: "Nonmetro",
    6: "Nonmetro",
    7: "Nonmetro",
    8: "Rural",
    9: "Rural",
}

# Label each row; codes that are not keys of cat_dict become NaN.
# NOTE(review): the source file name suggests averaged codes -- confirm the
# column holds whole numbers, otherwise rows silently lose their label.
solarsent_master["Urban-Rural Continuum"] = solarsent_master["urbanrural"].map(cat_dict)

# Log transforms of income, population and home value
solarsent_master["ln_income"] = solarsent_master["MHIncome"].apply(np.log)
solarsent_master["ln_pop"] = solarsent_master["Population"].apply(np.log)
solarsent_master["ln_homevalue"] = solarsent_master["MedianHomeValue"].apply(np.log)

# Squared terms of the logged variables
solarsent_master["ln_income_sqrd"] = solarsent_master["ln_income"].pow(2)
solarsent_master["ln_pop_sqrd"] = solarsent_master["ln_pop"].pow(2)
solarsent_master["ln_homevalue_sqrd"] = solarsent_master["ln_homevalue"].pow(2)

# Squared terms of the raw variables
solarsent_master["MedianHomeValue_sqrd"] = solarsent_master["MedianHomeValue"].pow(2)
solarsent_master["MHIncome_sqrd"] = solarsent_master["MHIncome"].pow(2)
solarsent_master["Population_sqrd"] = solarsent_master["Population"].pow(2)

# Two-way political label split at a Democratic vote share of 0.5; rows with
# a missing `demvotes` match neither condition and receive no label.
dem_share = solarsent_master["demvotes"]
solarsent_master.loc[dem_share > 0.5, "political ideology"] = "Cities: Majority Democratic Voters"
solarsent_master.loc[dem_share <= 0.5, "political ideology"] = "Cities: Majority Republican Voters"

# Movers from elsewhere in the state, from another state, or from abroad
# (moves within the same county are not part of this sum)
solarsent_master["% Moved In"] = (
    solarsent_master["Pr_MovedWithinState"]
    + solarsent_master["Pr_MovedOutofState"]
    + solarsent_master["Pr_MovedfromOtherCountry"]
)

# Combined share heating with gas, oil, coal or wood
solarsent_master["% Non-Electric Heating"] = (
    solarsent_master["Pr_Heating_Gas"]
    + solarsent_master["Pr_Heating_Oil"]
    + solarsent_master["Pr_Heating_Coal"]
    + solarsent_master["Pr_Heating_Wood"]
)

In [None]:
# State lookup table, with each state name mapped to its region.
# NOTE(review): `us_state_regions` is not defined in the cells visible here --
# confirm it is created before this cell when running on a fresh kernel.
state_abbr = pd.read_csv("stateabbr.csv")
state_abbr = state_abbr.assign(
    regions=state_abbr["statename"].map(us_state_regions)
)

# Rename the state key so it matches the `state` column used for the join
solarsent_master = solarsent_master.rename(columns={"STATE_FIPS": "state"})

In [None]:
# Attach the state lookup (including `regions`) to the city-level table.
# Inner join on `state`: rows without a matching state are dropped.
frames = [solarsent_master, state_abbr]

master = frames[0].merge(frames[1], on=["state"], how="inner")

In [None]:
# Distinct values of `regions` (a missing value, if present, is included)
unique_regions = master["regions"].unique()

# One 0/1 indicator column per distinct value, named "US Region: <value>"
for region in unique_regions:
    is_region = master["regions"].eq(region)
    master[f"US Region: {region}"] = is_region.astype(int)

In [None]:
# Remove the indicator column created for rows whose region is missing.
# errors="ignore" keeps this cell re-runnable: without it, running the cell a
# second time (or on data where every row has a region) raises KeyError
# because the column does not exist.
master = master.drop(columns=["US Region: nan"], errors="ignore")
master.columns

Index(['GEO_ID', 'year', 'Sentiment Score', 'tweet_count', 'user_count',
       'state', 'PLACEFP', 'cntyfips', 'CountyName', 'geometry', 'urbanrural',
       'demvotes', 'repvotes', 'cntypop', 'NAME', 'Pr_EngLessThanWell',
       'Pr_MovedWithinCounty', 'Pr_MovedWithinState', 'Pr_MovedOutofState',
       'Pr_MovedfromOtherCountry', 'Pr_UnemploymentRateOver16', 'Pr_Multiunit',
       'Pr_LivingAlone', 'Pr_MobileHomes',
       'Pr_AgricultureForestryFishingHuntingMining', 'Pr_Construction',
       'Pr_Manufacturing', 'Pr_Wholesale',
       'Pr_TransportationWarehousingUtilities', 'Pr_Information',
       'Pr_EducationHealthSocialAssistance', 'MHIncome', 'MedianAge',
       'Pr_Citizen', 'Population', 'Pr_White', 'Pr_Black', 'Pr_His', 'Pr_AAPI',
       'Pr_OtherRace', 'Pr_WorkFromHome', 'Pr_DriveAlone', 'Pr_BAorHigher',
       'Pr_Renters', 'Pr_Heating_Gas', 'Pr_Heating_Oil', 'Pr_Heating_Coal',
       'Pr_Heating_Wood', 'Pr_Heating_Solar', 'MedianYearStructureBuilt',
       'MedianNumRoo

In [None]:
# Mapping from working column names to presentation labels.
# DataFrame.rename ignores keys that match no column, so entries for columns
# that are absent from `master` have no effect.
newcolname = {
    # identifiers and outcome
    "GEO_ID": "GEO_ID",
    "year": "Year",
    "Sentiment Score": "Solar Sentiment",
    # county context
    "urbanrural": "Rurality",
    "demvotes": "% Dem. Votes",
    "repvotes": "% GOP Votes",
    # language, mobility, employment
    "Pr_EngLessThanWell": "% English Less Than Well",
    "Pr_MovedWithinCounty": "% Moved within County",
    "Pr_MovedWithinState": "% Moved within State",
    "Pr_MovedOutofState": "% Moved out of State",
    "Pr_MovedfromOtherCountry": "% Moved from Other Country",
    "Pr_UnemploymentRateOver16": "% Unemployment Rate",
    "Pr_Poverty": "% Poverty",
    # housing arrangement
    "Pr_OwnerOccupied": "% Owner Occupied",
    "Pr_Multiunit": "% Multiunit Residents",
    "Pr_LivingAlone": "% Living Alone",
    "Pr_MobileHomes": "% Mobile Homes",
    # industry shares
    "Pr_AgricultureForestryFishingHuntingMining": "% Industry: Natural Resources",
    "Pr_Construction": "% Industry: Construction",
    "Pr_Manufacturing": "% Industry: Manufacturing",
    "Pr_Wholesale": "% Industry: Wholesale",
    "Pr_Realestate": "% Industry: Real Estate",
    "Pr_TransportationWarehousingUtilities": "% Industry: Transp/Utilities",
    "Pr_Information": "% Industry: Information",
    "Pr_EducationHealthSocialAssistance": "% Industry: Social Services",
    # commuting
    "Pr_DriveAlone": "% Commute: Drive Alone",
    "Pr_Carpool": "% Commute: Carpool",
    "Pr_PublicTransit": "% Commute: Public Transit",
    # demographics
    "MedianAge": "Median Age",
    "Pr_BAorHigher": "% Bachelor or Higher",
    "Pr_Disability": "% Ppl. Disability",
    "Pr_White": "% White",
    "Pr_Black": "% African American",
    "Pr_His": "% Latinx/Hispanic",
    "Pr_AAPI": "% AAPI",
    "Pr_OtherRace": "% Other Race",
    "Pr_Renters": "% Renters",
    "Pr_Citizen": "% Citizen",
    # heating source
    "Pr_Heating_Gas": "% Heating: Gas",
    "Pr_Heating_Oil": "% Heating: Oil",
    "Pr_Heating_Coal": "% Heating: Coal",
    "Pr_Heating_Wood": "% Heating: Wood",
    "Pr_Heating_Solar": "% Heating: Solar",
    "% Non-Electric Heating": "% Heating: Non-Elec. Src.",
    # housing stock
    "MedianYearStructureBuilt": "Year Structure Built",
    "MedianNumRooms": "Median Number of Rooms",
    "MedianHomeValue": "Home Value",
    "pop_density": "Pop. Density",
    # state-level variables
    "State Sentiment Score": "State Sentiment Score",
    "pr_energy_fromsolar": "% Electricity from Solar",
    # log-transformed variables and their squares
    "ln_per_cap_energygen": "Per-Capita Energy Generation (ln)",
    "ln_income": "Median HH Income (ln)",
    "ln_pop": "Population (ln)",
    "ln_landarea": "Land Area (ln)",
    "ln_homevalue": "Home Value (ln)",
    "ln_income_sqrd": "Median HH Income (ln) Sqrd",
    "ln_pop_sqrd": "Population (ln) Sqrd",
    "ln_homevalue_sqrd": "Home Value (ln) Sqrd",
    # remaining derived and raw variables
    "per_capita_energygen": "Per-Capita Energy Generation",
    "Pr_WorkFromHome": "% Work from Home",
    "DNI": "Solar Radiation",
    "MedianHomeValue_sqrd": "Home Value Sqrd",
    "MHIncome_sqrd": "Median HH Income Sqrd",
    "Population_sqrd": "Population Sqrd",
}

# Apply the labels to `master` in place
master.rename(columns=newcolname, inplace=True)

In [None]:
# Display the column labels of `master` to check the result of the rename
master.columns

Index(['GEO_ID', 'Year', 'Solar Sentiment', 'tweet_count', 'user_count',
       'state', 'PLACEFP', 'cntyfips', 'CountyName', 'geometry', 'Rurality',
       '% Dem. Votes', '% GOP Votes', 'cntypop', 'NAME',
       '% English Less Than Well', '% Moved within County',
       '% Moved within State', '% Moved out of State',
       '% Moved from Other Country', '% Unemployment Rate',
       '% Multiunit Residents', '% Living Alone', '% Mobile Homes',
       '% Industry: Natural Resources', '% Industry: Construction',
       '% Industry: Manufacturing', '% Industry: Wholesale',
       '% Industry: Transp/Utilities', '% Industry: Information',
       '% Industry: Social Services', 'MHIncome', 'Median Age', '% Citizen',
       'Population', '% White', '% African American', '% Latinx/Hispanic',
       '% AAPI', '% Other Race', '% Work from Home', '% Commute: Drive Alone',
       '% Bachelor or Higher', '% Renters', '% Heating: Gas', '% Heating: Oil',
       '% Heating: Coal', '% Heating: Wood',