In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import lxml
from urllib.request import Request, urlopen

### URLs Used

In [2]:
WINE_REGION_URL2 = "https://www.tonymappedit.com/top-10-wine-producing-countries/"
WIKI_COUNTRIES = "https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3"
VINEYARD_URL = "https://www.bkwine.com/features/more/world-wine-production-reaches-record-level-2018-consumption-stable/"
EXPORT_URL = "https://www.nationmaster.com/nmx/ranking/export-of-fortified-wine-or-must"
IMPORT_URL = "https://www.nationmaster.com/nmx/ranking/import-of-fortified-wine-or-must"
CONSUMPTION_URL = "https://www.nationmaster.com/nmx/ranking/wine-consumption"

### Wine Production Data

In [3]:
'''scrape wine region data with coresponding wines'''
req = Request(WINE_REGION_URL2 , headers={'User-Agent': 'Mozilla/5.0'})

webpage = urlopen(req).read()

# html_text = requests.get(WINE_REGION_URL).text
soup = BeautifulSoup(webpage, "html.parser")

In [4]:
country1 = []
countries = soup.find_all("ol")[1]
items = countries.find_all("li")

for country in items:
    country1.append(country.text.split(": "))

In [5]:
country2 = []
countries2 = soup.find_all("ol")[2]
items = countries2.find_all("li")

for country in items:
    country2.append(country.text.split(": "))

In [6]:
country_production = country1 + country2
# country_production

In [7]:
country_production_df = pd.DataFrame(country_production).rename(columns = {0: "Country", 1: "WineProduction[HCL]"}).set_index("Country")
# country_production_df

### Country Code Data

In [8]:
'''scrape wine region data with coresponding wines'''
req = Request(WIKI_COUNTRIES , headers={'User-Agent': 'Mozilla/5.0'})

webpage = urlopen(req).read()

# html_text = requests.get(WINE_REGION_URL).text
soup = BeautifulSoup(webpage, "html.parser")

In [9]:
table = soup.find("div", class_="plainlist")
country_codes = table.find_all("span")
countries = table.find_all("a")

In [10]:
country_list = []
for country in countries:
    country_list.append(country.text)

In [11]:
code_list = []
for code in country_codes:
    code_list.append(code.text)

In [12]:
#Province of China was index 229. 
country_df = pd.DataFrame(country_list).drop([11, 229]).reset_index().drop(columns = ["index"])
country_df = country_df.rename(columns = {0: "Country"}).reset_index().set_index("index")

In [13]:
code_df = pd.DataFrame(code_list).drop([11]).reset_index().drop(columns = ["index"])
code_df = code_df.rename(columns = {0: "CODES"}).reset_index().set_index("index")

In [14]:
country_code_df = pd.merge(country_df, code_df, on="index", how="outer").set_index("Country").rename(index = {"United States of America" : "United States"})
country_code_df

Unnamed: 0_level_0,CODES
Country,Unnamed: 1_level_1
Aruba,ABW
Afghanistan,AFG
Angola,AGO
Anguilla,AIA
Åland Islands,ALA
...,...
Samoa,WSM
Yemen,YEM
South Africa,ZAF
Zambia,ZMB


In [15]:
wine_production_HCL = pd.merge(country_production_df,country_code_df,  how="right", on="Country").rename(columns = {'Alpha-3 code':'CODE'}).fillna(0)
# wine_production_HCL

In [16]:
# wine_production_HCL.to_csv("wine_production.csv")

### Vineyard Data

In [17]:
'''scrape wine region data with coresponding wines'''
req = Request(VINEYARD_URL , headers={'User-Agent': 'Mozilla/5.0'})

webpage = urlopen(req).read()

# html_text = requests.get(WINE_REGION_URL).text
soup = BeautifulSoup(webpage, "html.parser")

In [18]:
vineyard_table = soup.find_all("table", id="tablepress-762")[0]
vine_country = vineyard_table.find_all("td", class_="column-2")
vine_acreage = vineyard_table.find_all("td", class_="column-3")

In [19]:
vine_countries = []
for country in vine_country:
    vine_countries.append(country.text)

In [20]:
plot_sizes = []
for plot_size in vine_acreage:
    plot_sizes.append(plot_size.text)

In [21]:
country_vy = pd.DataFrame(vine_countries).drop([0,27,28,29,30]).reset_index().replace('\*','',regex=True).rename(columns = {0: "Country"})
size_vy = pd.DataFrame(plot_sizes).drop([0,27,28]).reset_index().rename(columns = {0: "Largest Vineyards"})

In [22]:
vineyard_data = pd.merge(country_vy, size_vy, on="index").drop(columns = ["index"]).set_index("Country").rename(index = {"USA" : "United States"})
vineyard_data

Unnamed: 0_level_0,Largest Vineyards
Country,Unnamed: 1_level_1
Spain,969
China,875
France,789
Italy,702
Turkey,448
United States,430
Argentina,219
Chile,212
Portugal,192
Romania,191


In [23]:
world_wine_vy = pd.merge(wine_production_HCL, vineyard_data, on="Country", how="left").fillna(0)
world_wine_vy

Unnamed: 0_level_0,WineProduction[HCL],CODES,Largest Vineyards
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Italy,54.8,ITA,702
France,49.1,FRA,789
Spain,44.4,ESP,969
United States,23.9,USA,430
Argentina,14.5,ARG,219
...,...,...,...
Wallis and Futuna,0,WLF,0
Samoa,0,WSM,0
Yemen,0,YEM,0
Zambia,0,ZMB,0


### Export Data

In [44]:
export_table = pd.read_html(EXPORT_URL, header = 0)[0]
export_df = pd.DataFrame(export_table)

In [45]:
export_df = export_df.replace('#...','',regex=True)\
    .rename(columns = {"130 Countries" : "Country", "US Dollars" : "Export"})\
    .set_index("Country")\
    .drop(columns = ["#", "Last", "YoY", "5âyears CAGR", "Unnamed: 6"])

In [46]:
export_df.Export.apply(int).to_frame()
for c in export_df:
    export_df[c] = export_df[c].apply(lambda x: f'$ {x:,}')

In [47]:
export_df

Unnamed: 0_level_0,Export
Country,Unnamed: 1_level_1
France,"$ 5,804,650,668.68"
Italy,"$ 4,771,361,172.94"
Spain,"$ 2,054,743,135.27"
Chile,"$ 1,646,745,073.75"
Australia,"$ 1,310,510,285.96"
...,...
Tanzania,"$ 1,273.0"
Ecuador,"$ 1,106.0"
Sao Tome and Principe,$ 179.0
Bahamas,$ 50.0


### Import Data

In [28]:
import_table = pd.read_html(IMPORT_URL, header = 0)[0]
import_df = pd.DataFrame(import_table)

In [29]:
import_df = import_df.replace('#...','',regex=True)\
    .rename(columns = {"156 Countries" : "Country", "US Dollars" : "Import"})\
    .set_index("Country")\
    .drop(columns = ["#", "Last", "YoY", "5âyears CAGR", "Unnamed: 6"])

In [30]:
import_df.Import.apply(int).to_frame()
for c in import_df:
    import_df[c] = import_df[c].apply(lambda x: f'$ {x:,}')

In [31]:
import_df

Unnamed: 0_level_0,Import
Country,Unnamed: 1_level_1
United States,"$ 4,646,536,989.52"
United Kingdom,"$ 2,599,546,724.13"
China,"$ 2,374,025,494.8"
Germany,"$ 1,809,853,200.33"
Canada,"$ 1,657,432,367.71"
...,...
Palestine,"$ 13,599.0"
Brunei,"$ 13,244.0"
Montserrat,"$ 7,328.0"
Gambia,"$ 2,045.0"


In [41]:
export_import = pd.merge(export_df, import_df, on="Country", how="right").fillna(0)
export_import.tail(75)

Unnamed: 0_level_0,Export,Import
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Canada,0,"$ 1,657,432,367.71"
Hong Kong,0,"$ 1,562,477,410.65"
Netherlands,0,"$ 1,194,931,736.16"
Japan,0,"$ 985,033,235.85"
Switzerland,0,"$ 864,079,471.18"
...,...,...
Palestine,0,"$ 13,599.0"
Brunei,0,"$ 13,244.0"
Montserrat,0,"$ 7,328.0"
Gambia,0,"$ 2,045.0"


### Consumption Data

In [34]:
consumption_table = pd.read_html(CONSUMPTION_URL, header = 0)[0]
consumption_df_unclean = pd.DataFrame(consumption_table)

In [35]:
consumption_df = consumption_df_unclean.replace('#..','',regex=True)\
    .rename(columns = {"19 Countries" : "Country", "Thousand Hectoliters" : "Hectoliters (Millions)"})\
    .set_index("Country")\
    .drop(columns = ["#", "Last", "YoY", "5âyears CAGR", "Unnamed: 6"])
consumption_df

Unnamed: 0_level_0,Hectoliters (Millions)
Country,Unnamed: 1_level_1
United States,32597.0
France,26196.0
Italy,21209.0
Germany,20356.0
China,18776.0
United Kingdom,13045.0
Spain,9445.0
Argentina,9351.0
Russia,9202.0
Australia,5511.0


In [64]:
consumption_df["Hectoliters (Millions)"] = consumption_df["Hectoliters (Millions)"]/1000

Hectoliters (Millions)    object
dtype: object

In [39]:
world_wine_vy.to_csv("world_wine_data.csv")