In [1]:
import numpy as np
import pandas as pd

# Pulling HTML Dataframes and Scrubbing

In [2]:
#Use pd.read_html('url') to get your dataframe
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_current_United_States_governors')
df

#This returns many tables
#let's select a single one in particular using indexing that is relative to the webpage

[                                                    0
 0             This article is part of a series on the
 1              State governments of the United States
 2            State constitution Comparison Statehouse
 3                                           Executive
 4   State executives Governor (List) Other common ...
 5                                         Legislative
 6   State representatives (Alabama to Missouri, Mo...
 7                                           Judiciary
 8                     State supreme court State court
 9                Politics portal United States portal
 10  .mw-parser-output .navbar{display:inline;font-...,
    Republican (28)  Democratic (22)                                      \
                               State Image                Governor Party   
 0                           Alabama   NaN                Kay Ivey   NaN   
 1                            Alaska   NaN           Mike Dunleavy   NaN   
 2                           Arizon

In [3]:
#Notice the bracket at the end which refers to the oneth table in the HTML page
#You can play with the index to see which table it returns to get what you want
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_current_United_States_governors')[1]
df

Unnamed: 0_level_0,Republican (28) Democratic (22),Republican (28) Democratic (22),Republican (28) Democratic (22),Republican (28) Democratic (22),Republican (28) Democratic (22),Republican (28) Democratic (22),Republican (28) Democratic (22),Republican (28) Democratic (22),Republican (28) Democratic (22),Republican (28) Democratic (22),Republican (28) Democratic (22)
Unnamed: 0_level_1,State,Image,Governor,Party,Party.1,Born,Prior public experience,Inauguration,End of term,Past governors,Unnamed: 10_level_1
0,Alabama,,Kay Ivey,,Republican,(age 77),"Lieutenant Governor, State Treasurer","April 10, 2017",2023,List,
1,Alaska,,Mike Dunleavy,,Republican,(age 61),Alaska Senate,"December 3, 2018",2022,List,
2,Arizona,,Doug Ducey,,Republican,(age 58),State Treasurer,"January 5, 2015",2023 (term limits),List,
3,Arkansas,,Asa Hutchinson,,Republican,(age 71),"U.S. Under Secretary of Homeland Security, Adm...","January 13, 2015",2023 (term limits),List,
4,California,,Gavin Newsom,,Democratic,(age 54),"Lieutenant Governor, Mayor of San Francisco","January 7, 2019",2023,List,
5,Colorado,,Jared Polis,,Democratic,(age 47),"U.S. House, Colorado State Board of Education","January 8, 2019",2023,List,
6,Connecticut,,Ned Lamont,,Democratic,(age 68),Chair of the State Investment Advisory Council...,"January 9, 2019",2023,List,
7,Delaware,,John Carney,,Democratic,(age 66),"U.S. House, Lieutenant Governor, State Secreta...","January 17, 2017",2025 (term limits),List,
8,Florida,,Ron DeSantis,,Republican,(age 43),"U.S. House, Lieutenant Commander in the United...","January 8, 2019",2023,List,
9,Georgia,,Brian Kemp,,Republican,(age 58),"Secretary of State, Georgia Senate","January 14, 2019",2023,List,


# US Demographics

In [4]:
#So if you "match" when pulling HTML dataframes, you will ONLY return tables containing that column name
#You can further select for which one of those you want, in this case it is the zeroeth one we want
US = pd.read_html('https://en.wikipedia.org/wiki/Demographics_of_the_United_States',
                  match = "Average population")[0]
US

Unnamed: 0.1,Unnamed: 0,Average population[65][29][30],Live births[66],Deaths,Natural change,"Crude birth rate (per 1,000)","Crude death rate (per 1,000)[67]","Natural change (per 1,000)",Total fertility rate[fn 1][68]
0,1935,127250000,2377000,1392752,984248,18.7,10.9,7.7,2.190
1,1936,128053000,2355000,1479228,875772,18.4,11.5,6.8,2.150
2,1937,128825000,2413000,1450427,962573,18.7,11.2,7.5,2.170
3,1938,129825000,2496000,1381391,1114609,19.2,10.6,8.6,2.220
4,1939,130880000,2466000,1387897,1078103,18.8,10.6,8.2,2.170
...,...,...,...,...,...,...,...,...,...
82,2017[69][70],325719000,3855500,2813503,1041997,11.8,8.7,3.1,1.765
83,2018[71][72],326687000,3791712,2839205,952507,11.6,8.7,2.9,1.729
84,2019,328240000,3747540,2854858,892682,11.4,8.7,2.7,1.706
85,2020[73][74],331449281,3613647,3383729,229918,10.9,10.2,0.7,1.641


# Find and Replace in Column Names

In [5]:
#Let's replace spaces with underscores in the column names, normally useful if you want to write to csv

US.columns = [x.replace(' ', '_') for x in US.columns]
US

Unnamed: 0,Unnamed:_0,Average_population[65][29][30],Live_births[66],Deaths,Natural_change,"Crude_birth_rate_(per_1,000)","Crude_death_rate_(per_1,000)[67]","Natural_change_(per_1,000)",Total_fertility_rate[fn_1][68]
0,1935,127250000,2377000,1392752,984248,18.7,10.9,7.7,2.190
1,1936,128053000,2355000,1479228,875772,18.4,11.5,6.8,2.150
2,1937,128825000,2413000,1450427,962573,18.7,11.2,7.5,2.170
3,1938,129825000,2496000,1381391,1114609,19.2,10.6,8.6,2.220
4,1939,130880000,2466000,1387897,1078103,18.8,10.6,8.2,2.170
...,...,...,...,...,...,...,...,...,...
82,2017[69][70],325719000,3855500,2813503,1041997,11.8,8.7,3.1,1.765
83,2018[71][72],326687000,3791712,2839205,952507,11.6,8.7,2.9,1.729
84,2019,328240000,3747540,2854858,892682,11.4,8.7,2.7,1.706
85,2020[73][74],331449281,3613647,3383729,229918,10.9,10.2,0.7,1.641


# Remove Undesirable Characters in Column Names

In [6]:
#Let's remove all the superfluous parentheses, and brackets since they only hyperlink to references
#This requires careful use of escape sequences 
#It basically says remove the open parenthesis, anything inside, and the closed parentheses 
#& the replace with nothing 
US.columns = US.columns.str.replace(r"\(.*\)", "")
US

  US.columns = US.columns.str.replace(r"\(.*\)", "")


Unnamed: 0,Unnamed:_0,Average_population[65][29][30],Live_births[66],Deaths,Natural_change,Crude_birth_rate_,Crude_death_rate_[67],Natural_change_,Total_fertility_rate[fn_1][68]
0,1935,127250000,2377000,1392752,984248,18.7,10.9,7.7,2.190
1,1936,128053000,2355000,1479228,875772,18.4,11.5,6.8,2.150
2,1937,128825000,2413000,1450427,962573,18.7,11.2,7.5,2.170
3,1938,129825000,2496000,1381391,1114609,19.2,10.6,8.6,2.220
4,1939,130880000,2466000,1387897,1078103,18.8,10.6,8.2,2.170
...,...,...,...,...,...,...,...,...,...
82,2017[69][70],325719000,3855500,2813503,1041997,11.8,8.7,3.1,1.765
83,2018[71][72],326687000,3791712,2839205,952507,11.6,8.7,2.9,1.729
84,2019,328240000,3747540,2854858,892682,11.4,8.7,2.7,1.706
85,2020[73][74],331449281,3613647,3383729,229918,10.9,10.2,0.7,1.641


In [7]:
#Let's do the same thing with the brackets
US.columns = US.columns.str.replace(r"\[.*\]", "")
US

  US.columns = US.columns.str.replace(r"\[.*\]", "")


Unnamed: 0,Unnamed:_0,Average_population,Live_births,Deaths,Natural_change,Crude_birth_rate_,Crude_death_rate_,Natural_change_,Total_fertility_rate
0,1935,127250000,2377000,1392752,984248,18.7,10.9,7.7,2.190
1,1936,128053000,2355000,1479228,875772,18.4,11.5,6.8,2.150
2,1937,128825000,2413000,1450427,962573,18.7,11.2,7.5,2.170
3,1938,129825000,2496000,1381391,1114609,19.2,10.6,8.6,2.220
4,1939,130880000,2466000,1387897,1078103,18.8,10.6,8.2,2.170
...,...,...,...,...,...,...,...,...,...
82,2017[69][70],325719000,3855500,2813503,1041997,11.8,8.7,3.1,1.765
83,2018[71][72],326687000,3791712,2839205,952507,11.6,8.7,2.9,1.729
84,2019,328240000,3747540,2854858,892682,11.4,8.7,2.7,1.706
85,2020[73][74],331449281,3613647,3383729,229918,10.9,10.2,0.7,1.641


# Renaming Column Names

In [8]:
#'Unnamed:_0' should be "Year" so let's change that

US = US.rename(columns={'Unnamed:_0': 'Year'})
US

Unnamed: 0,Year,Average_population,Live_births,Deaths,Natural_change,Crude_birth_rate_,Crude_death_rate_,Natural_change_,Total_fertility_rate
0,1935,127250000,2377000,1392752,984248,18.7,10.9,7.7,2.190
1,1936,128053000,2355000,1479228,875772,18.4,11.5,6.8,2.150
2,1937,128825000,2413000,1450427,962573,18.7,11.2,7.5,2.170
3,1938,129825000,2496000,1381391,1114609,19.2,10.6,8.6,2.220
4,1939,130880000,2466000,1387897,1078103,18.8,10.6,8.2,2.170
...,...,...,...,...,...,...,...,...,...
82,2017[69][70],325719000,3855500,2813503,1041997,11.8,8.7,3.1,1.765
83,2018[71][72],326687000,3791712,2839205,952507,11.6,8.7,2.9,1.729
84,2019,328240000,3747540,2854858,892682,11.4,8.7,2.7,1.706
85,2020[73][74],331449281,3613647,3383729,229918,10.9,10.2,0.7,1.641


# Remove Characters In Columns

In [9]:
#You can reference your column by its name so instead of ".columns" we'll use '.Year'
US.Year = US.Year.str.replace(r"\[.*\]", "")
US

  US.Year = US.Year.str.replace(r"\[.*\]", "")


Unnamed: 0,Year,Average_population,Live_births,Deaths,Natural_change,Crude_birth_rate_,Crude_death_rate_,Natural_change_,Total_fertility_rate
0,1935,127250000,2377000,1392752,984248,18.7,10.9,7.7,2.190
1,1936,128053000,2355000,1479228,875772,18.4,11.5,6.8,2.150
2,1937,128825000,2413000,1450427,962573,18.7,11.2,7.5,2.170
3,1938,129825000,2496000,1381391,1114609,19.2,10.6,8.6,2.220
4,1939,130880000,2466000,1387897,1078103,18.8,10.6,8.2,2.170
...,...,...,...,...,...,...,...,...,...
82,2017,325719000,3855500,2813503,1041997,11.8,8.7,3.1,1.765
83,2018,326687000,3791712,2839205,952507,11.6,8.7,2.9,1.729
84,2019,328240000,3747540,2854858,892682,11.4,8.7,2.7,1.706
85,2020,331449281,3613647,3383729,229918,10.9,10.2,0.7,1.641


# Select Columns

In [10]:
#Literally call the column in the dataframe to display it
US.Live_births

0     2377000
1     2355000
2     2413000
3     2496000
4     2466000
       ...   
82    3855500
83    3791712
84    3747540
85    3613647
86    3659289
Name: Live_births, Length: 87, dtype: int64

In [11]:
#This is another way of calling it 
US['Deaths']

0     1392752
1     1479228
2     1450427
3     1381391
4     1387897
       ...   
82    2813503
83    2839205
84    2854858
85    3383729
86    3470428
Name: Deaths, Length: 87, dtype: int64

# Make a Column an Index

In [12]:
#Make the Year column the index

US.set_index('Year', inplace = True)
US

Unnamed: 0_level_0,Average_population,Live_births,Deaths,Natural_change,Crude_birth_rate_,Crude_death_rate_,Natural_change_,Total_fertility_rate
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1935,127250000,2377000,1392752,984248,18.7,10.9,7.7,2.190
1936,128053000,2355000,1479228,875772,18.4,11.5,6.8,2.150
1937,128825000,2413000,1450427,962573,18.7,11.2,7.5,2.170
1938,129825000,2496000,1381391,1114609,19.2,10.6,8.6,2.220
1939,130880000,2466000,1387897,1078103,18.8,10.6,8.2,2.170
...,...,...,...,...,...,...,...,...
2017,325719000,3855500,2813503,1041997,11.8,8.7,3.1,1.765
2018,326687000,3791712,2839205,952507,11.6,8.7,2.9,1.729
2019,328240000,3747540,2854858,892682,11.4,8.7,2.7,1.706
2020,331449281,3613647,3383729,229918,10.9,10.2,0.7,1.641


# Grab Data from Multiple Columns

In [13]:
US[['Live_births', 'Deaths']]

Unnamed: 0_level_0,Live_births,Deaths
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1935,2377000,1392752
1936,2355000,1479228
1937,2413000,1450427
1938,2496000,1381391
1939,2466000,1387897
...,...,...
2017,3855500,2813503
2018,3791712,2839205
2019,3747540,2854858
2020,3613647,3383729


# Grab a Row

In [14]:
# To get all the information in a row, specify which one using .loc['Indexofrow']
US.loc['2020']

Average_population      3.314493e+08
Live_births             3.613647e+06
Deaths                  3.383729e+06
Natural_change          2.299180e+05
Crude_birth_rate_       1.090000e+01
Crude_death_rate_       1.020000e+01
Natural_change_         7.000000e-01
Total_fertility_rate    1.641000e+00
Name: 2020, dtype: float64

In [15]:
#Lets get 2020 specifically using the row number, maybe you want to iterate or something:

US.iloc[85]

Average_population      3.314493e+08
Live_births             3.613647e+06
Deaths                  3.383729e+06
Natural_change          2.299180e+05
Crude_birth_rate_       1.090000e+01
Crude_death_rate_       1.020000e+01
Natural_change_         7.000000e-01
Total_fertility_rate    1.641000e+00
Name: 2020, dtype: float64

# Add a Column

In [16]:
US['Pop_Growth'] = US['Live_births'] - US['Deaths']
US

#Notice before we had 8 columns and now we have 9

Unnamed: 0_level_0,Average_population,Live_births,Deaths,Natural_change,Crude_birth_rate_,Crude_death_rate_,Natural_change_,Total_fertility_rate,Pop_Growth
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1935,127250000,2377000,1392752,984248,18.7,10.9,7.7,2.190,984248
1936,128053000,2355000,1479228,875772,18.4,11.5,6.8,2.150,875772
1937,128825000,2413000,1450427,962573,18.7,11.2,7.5,2.170,962573
1938,129825000,2496000,1381391,1114609,19.2,10.6,8.6,2.220,1114609
1939,130880000,2466000,1387897,1078103,18.8,10.6,8.2,2.170,1078103
...,...,...,...,...,...,...,...,...,...
2017,325719000,3855500,2813503,1041997,11.8,8.7,3.1,1.765,1041997
2018,326687000,3791712,2839205,952507,11.6,8.7,2.9,1.729,952507
2019,328240000,3747540,2854858,892682,11.4,8.7,2.7,1.706,892682
2020,331449281,3613647,3383729,229918,10.9,10.2,0.7,1.641,229918


# Delete a Column

In [17]:
US.drop('Pop_Growth', axis = 1, inplace = True)
US
#And now we have 8 columns again

Unnamed: 0_level_0,Average_population,Live_births,Deaths,Natural_change,Crude_birth_rate_,Crude_death_rate_,Natural_change_,Total_fertility_rate
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1935,127250000,2377000,1392752,984248,18.7,10.9,7.7,2.190
1936,128053000,2355000,1479228,875772,18.4,11.5,6.8,2.150
1937,128825000,2413000,1450427,962573,18.7,11.2,7.5,2.170
1938,129825000,2496000,1381391,1114609,19.2,10.6,8.6,2.220
1939,130880000,2466000,1387897,1078103,18.8,10.6,8.2,2.170
...,...,...,...,...,...,...,...,...
2017,325719000,3855500,2813503,1041997,11.8,8.7,3.1,1.765
2018,326687000,3791712,2839205,952507,11.6,8.7,2.9,1.729
2019,328240000,3747540,2854858,892682,11.4,8.7,2.7,1.706
2020,331449281,3613647,3383729,229918,10.9,10.2,0.7,1.641


# Delete a Row

In [18]:
US.drop('1935', axis = 0, inplace = True)

# Manipulating Data

In [19]:
#Let's get list of countries by GDP

GDP = pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)')[2]
GDP

Unnamed: 0_level_0,Country/Territory,UN Region,IMF[1][12],IMF[1][12],United Nations[13],United Nations[13],World Bank[14][15],World Bank[14][15]
Unnamed: 0_level_1,Country/Territory,UN Region,Estimate,Year,Estimate,Year,Estimate,Year
0,World,-,93863851.0,2021,87461674.0,2020,84705567.0,2020
1,United States,Americas,25346805.0,2022,20893746.0,2020,20936600.0,2020
2,China,Asia,19911593.0,[n 2]2022,14722801.0,[n 3]2020,14722731.0,2020
3,Japan,Asia,4912147.0,2022,5057759.0,2020,4975415.0,2020
4,Germany,Europe,4256540.0,2022,3846414.0,2020,3806060.0,2020
...,...,...,...,...,...,...,...,...
212,Palau,Oceania,244.0,2022,264.0,2020,268.0,2019
213,Kiribati,Oceania,216.0,2022,181.0,2020,200.0,2020
214,Nauru,Oceania,134.0,2022,135.0,2020,118.0,2019
215,Montserrat,Americas,,,68.0,2020,,


In [20]:
#List the column names::

for col in GDP.columns:
    print(col)

('Country/Territory', 'Country/Territory')
('UN Region', 'UN Region')
('IMF[1][12]', 'Estimate')
('IMF[1][12]', 'Year')
('United Nations[13]', 'Estimate')
('United Nations[13]', 'Year')
('World Bank[14][15]', 'Estimate')
('World Bank[14][15]', 'Year')


![GDP.jpg](attachment:GDP.jpg)

In [21]:
#To get rid of that column heading just use droplevel()
GDP.columns = GDP.columns.droplevel()
GDP

#You lose some information, but suppose you don't want it in the first place

Unnamed: 0,Country/Territory,UN Region,Estimate,Year,Estimate.1,Year.1,Estimate.2,Year.2
0,World,-,93863851.0,2021,87461674.0,2020,84705567.0,2020
1,United States,Americas,25346805.0,2022,20893746.0,2020,20936600.0,2020
2,China,Asia,19911593.0,[n 2]2022,14722801.0,[n 3]2020,14722731.0,2020
3,Japan,Asia,4912147.0,2022,5057759.0,2020,4975415.0,2020
4,Germany,Europe,4256540.0,2022,3846414.0,2020,3806060.0,2020
...,...,...,...,...,...,...,...,...
212,Palau,Oceania,244.0,2022,264.0,2020,268.0,2019
213,Kiribati,Oceania,216.0,2022,181.0,2020,200.0,2020
214,Nauru,Oceania,134.0,2022,135.0,2020,118.0,2019
215,Montserrat,Americas,,,68.0,2020,,


In [22]:
#Let's get rid of those duplicate columns

GDP = GDP.loc[:, ~GDP.columns.duplicated()]
GDP

Unnamed: 0,Country/Territory,UN Region,Estimate,Year
0,World,-,93863851.0,2021
1,United States,Americas,25346805.0,2022
2,China,Asia,19911593.0,[n 2]2022
3,Japan,Asia,4912147.0,2022
4,Germany,Europe,4256540.0,2022
...,...,...,...,...
212,Palau,Oceania,244.0,2022
213,Kiribati,Oceania,216.0,2022
214,Nauru,Oceania,134.0,2022
215,Montserrat,Americas,,


In [23]:
#Now lets get rid of any rows that have NAN values, particularly useful if you work with reference tables
#Let's do it for the EStimate volumn and the year column
GDP = GDP[GDP['Estimate'].notna()]
GDP


Unnamed: 0,Country/Territory,UN Region,Estimate,Year
0,World,-,93863851.0,2021
1,United States,Americas,25346805.0,2022
2,China,Asia,19911593.0,[n 2]2022
3,Japan,Asia,4912147.0,2022
4,Germany,Europe,4256540.0,2022
...,...,...,...,...
210,Marshall Islands,Oceania,267.0,2022
212,Palau,Oceania,244.0,2022
213,Kiribati,Oceania,216.0,2022
214,Nauru,Oceania,134.0,2022


In [33]:
#Let's change Country/Territory to just country
GDP.rename(columns={"Country/Territory": "Country", "Estimate":"GDP", "UN Region" : "Region"}, inplace = True)
GDP

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GDP.rename(columns={"Country/Territory": "Country", "Estimate":"GDP", "UN Region" : "Region"}, inplace = True)


Unnamed: 0,Country,Region,GDP,Year
0,World,-,93863851.0,2021
1,United States,Americas,25346805.0,2022
2,China,Asia,19911593.0,2022
3,Japan,Asia,4912147.0,2022
4,Germany,Europe,4256540.0,2022
...,...,...,...,...
210,Marshall Islands,Oceania,267.0,2022
212,Palau,Oceania,244.0,2022
213,Kiribati,Oceania,216.0,2022
214,Nauru,Oceania,134.0,2022


In [34]:
#Getting rid of the reference link brackets once again
GDP.Year = GDP.Year.str.replace(r"\[.*\]", "")

#Let's get rid of the asterisks as well 
GDP.Country = GDP.Country.str.replace("*", "")
GDP


  GDP.Year = GDP.Year.str.replace(r"\[.*\]", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GDP.Year = GDP.Year.str.replace(r"\[.*\]", "")
  GDP.Country = GDP.Country.str.replace("*", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GDP.Country = GDP.Country.str.replace("*", "")


Unnamed: 0,Country,Region,GDP,Year
0,World,-,93863851.0,2021
1,United States,Americas,25346805.0,2022
2,China,Asia,19911593.0,2022
3,Japan,Asia,4912147.0,2022
4,Germany,Europe,4256540.0,2022
...,...,...,...,...
210,Marshall Islands,Oceania,267.0,2022
212,Palau,Oceania,244.0,2022
213,Kiribati,Oceania,216.0,2022
214,Nauru,Oceania,134.0,2022


In [35]:
#Suppose you want to find the mean of one column along another column
#Like mean GDP by region:

GDP.groupby("Region").mean()

Unnamed: 0_level_0,GDP
Region,Unnamed: 1_level_1
-,93863850.0
Africa,55056.07
Americas,920759.3
Asia,840056.4
Europe,576225.4
Oceania,146144.1
