In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
import re

Cargamos la información sobre PBI (GDP en inglés):

https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_GDP

In [2]:
# URL de la página de Wikipedia con la tabla
url = 'https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_GDP'

# Realiza la solicitud HTTP
response = requests.get(url)

# Verifica que la solicitud fue exitosa
if response.status_code == 200:
    # Parsea el contenido HTML
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Encuentra la tabla por su clase CSS
    tabla = soup.find('table', {'class': 'wikitable'})
    
    # Utiliza StringIO para leer la tabla sin generar la advertencia
    df = pd.read_html(StringIO(str(tabla)))[0]

In [9]:
# Filtrado y modificaciones del DataFrame:
states_gdp = df["Nominal GDP at current prices 2022 (millions of U.S. dollars)[1]"].copy()
states_gdp.loc[:, "Nominal GDP per capita"] = df["Nominal GDP per capita 2022[1][3]"]["2022"].copy()
states_gdp.loc[:, "% of national GDP"] = df["% of national[1]"]["2022"].copy()
states_gdp.loc[:, "State"] = df["State or federal district"]["State or federal district"].copy()
states_gdp.loc[:, "Nominal GDP at current prices 2022 (millions of U.S. dollars)"] = states_gdp["2022"].fillna(0).astype(int)
states_gdp = states_gdp.drop(columns=["2023", "2022"]).drop(states_gdp.index[0]).reset_index(drop=True)
states_gdp.loc[:, "State"] = states_gdp["State"].apply(lambda x: re.sub(r'\s*\*', '', x)).str.strip()
states_gdp = states_gdp[["State", "Nominal GDP at current prices 2022 (millions of U.S. dollars)", "Nominal GDP per capita", "% of national GDP"]]

In [10]:
states_gdp.head()

Unnamed: 0,State,Nominal GDP at current prices 2022 (millions of U.S. dollars),Nominal GDP per capita,% of national GDP
0,California,3598103,"$92,190",14.69%
1,Texas,2355960,"$78,456",8.69%
2,New York,2053180,"$104,344",8.11%
3,Florida,1389070,"$62,446",5.37%
4,Illinois,1033310,"$82,126",4.11%


Cargamos la información sobre población:

https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population

In [12]:
# URL de la página de Wikipedia con la tabla
url = 'https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population'

# Realiza la solicitud HTTP
response = requests.get(url)

# Verifica que la solicitud fue exitosa
if response.status_code == 200:
    # Parsea el contenido HTML
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Encuentra la tabla por su clase CSS
    tabla = soup.find('table', {'class': 'wikitable'})
    
    # Utiliza StringIO para leer la tabla sin generar la advertencia
    population = pd.read_html(StringIO(str(tabla)))[0]

In [21]:
# Filtrado y modificaciones del DataFrame:
states_ppl = population["Census population[8][a]"]
states_ppl["State"] = population["State or territory"]
states_ppl["Population April 1, 2020"] = states_ppl["April 1, 2020"]
states_ppl = states_ppl.drop(columns=["July 1, 2023 (est.)", "April 1, 2020"]).drop(states_ppl.index[0]).reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  states_ppl["State"] = population["State or territory"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  states_ppl["Population April 1, 2020"] = states_ppl["April 1, 2020"]


In [17]:
states_ppl.head()

Unnamed: 0,State,"Population April 1, 2020"
0,California,39538223.0
1,Texas,29145505.0
2,Florida,21538187.0
3,New York,20201249.0
4,Pennsylvania,13002700.0


Unificamos ambas tablas:

In [18]:
socioeconomic_data = pd.merge(states_gdp, states_ppl, on="State", how="left")

In [19]:
socioeconomic_data.head()

Unnamed: 0,State,Nominal GDP at current prices 2022 (millions of U.S. dollars),Nominal GDP per capita,% of national GDP,"Population April 1, 2020"
0,California,3598103,"$92,190",14.69%,39538223.0
1,Texas,2355960,"$78,456",8.69%,29145505.0
2,New York,2053180,"$104,344",8.11%,20201249.0
3,Florida,1389070,"$62,446",5.37%,21538187.0
4,Illinois,1033310,"$82,126",4.11%,12812508.0


In [22]:
socioeconomic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 5 columns):
 #   Column                                                         Non-Null Count  Dtype  
---  ------                                                         --------------  -----  
 0   State                                                          52 non-null     object 
 1   Nominal GDP at current prices 2022 (millions of U.S. dollars)  52 non-null     int32  
 2   Nominal GDP per capita                                         52 non-null     object 
 3   % of national GDP                                              52 non-null     object 
 4   Population April 1, 2020                                       51 non-null     float64
dtypes: float64(1), int32(1), object(3)
memory usage: 2.0+ KB


In [33]:
socioeconomic_data = socioeconomic_data.rename(columns={'Nominal GDP at current prices 2022 (millions of U.S. dollars)': 'Nominal_GDP_2022_Millions'})

In [34]:
# Se limpia el dataset
socioeconomic_data["Nominal GDP per capita"] = socioeconomic_data["Nominal GDP per capita"].replace('[\\$,]', '', regex=True).astype(float)
socioeconomic_data["% of national GDP"] = socioeconomic_data["% of national GDP"].replace('%', '', regex=True).astype(float)
socioeconomic_data.head(1)

Unnamed: 0,State,Nominal_GDP_2022_Millions,Nominal GDP per capita,% of national GDP,"Population April 1, 2020"
0,California,3598103,92190.0,14.69,39538223.0


In [32]:
# Guardado como parquet:
socioeconomic_data.to_parquet('socioeconomic_data.parquet')

In [None]:
# Guardado como:
socioeconomic_data.to_csv('socioeconomic_data.csv')