In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import lxml
from urllib.request import Request, urlopen
import numpy as np

### URLs Used

In [2]:
# Article whose ordered lists are scraped for per-country wine production figures.
WINE_REGION_URL2 = "https://www.tonymappedit.com/top-10-wine-producing-countries/"
# Wikipedia page scraped for country names and their ISO 3166-1 alpha-3 codes.
WIKI_COUNTRIES = "https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3"
# Article whose "tablepress-762" table is scraped for the vineyard figures.
VINEYARD_URL = "https://www.bkwine.com/features/more/world-wine-production-reaches-record-level-2018-consumption-stable/"
# NationMaster ranking pages scraped for export, import and consumption values.
EXPORT_URL = "https://www.nationmaster.com/nmx/ranking/export-of-fortified-wine-or-must"
IMPORT_URL = "https://www.nationmaster.com/nmx/ranking/import-of-fortified-wine-or-must"
CONSUMPTION_URL = "https://www.nationmaster.com/nmx/ranking/wine-consumption"

### Wine Production Data

In [3]:
# Download the wine-production article, sending a browser-like User-Agent header.
req = Request(WINE_REGION_URL2, headers={'User-Agent': 'Mozilla/5.0'})

# Read the body inside a context manager so the HTTP response is always closed.
with urlopen(req) as response:
    webpage = response.read()

# Parse the raw bytes with the built-in HTML parser.
soup = BeautifulSoup(webpage, "html.parser")

In [4]:
# Take the second <ol> on the page and split each <li> text on ": "
# (presumably "Country: value" entries - verify against the page).
countries = soup.find_all("ol")[1]
items = countries.find_all("li")
country1 = [entry.text.split(": ") for entry in items]

In [5]:
# Same extraction as for the previous list, applied to the third <ol> on the page.
countries2 = soup.find_all("ol")[2]
items = countries2.find_all("li")
country2 = [entry.text.split(": ") for entry in items]

In [6]:
# Concatenate both scraped lists into one list of split entries.
country_production = [*country1, *country2]

In [7]:
# Build the production table indexed by country name and keep the numeric
# conversion: the scraped figures are strings, and leaving them as strings
# yields a mixed str/int column once missing countries are filled with 0.
country_production_df = (
    pd.DataFrame(country_production)
    .rename(columns={0: "Country", 1: "Wine Production"})
    .set_index("Country")
    .astype("float64")
)
country_production_df.head()

Unnamed: 0_level_0,Wine Production
Country,Unnamed: 1_level_1
Italy,54.8
France,49.1
Spain,44.4
United States,23.9
Argentina,14.5


### Country Code Data

In [8]:
# Download the Wikipedia ISO 3166-1 alpha-3 page, sending a browser-like User-Agent header.
req = Request(WIKI_COUNTRIES, headers={'User-Agent': 'Mozilla/5.0'})

# Read the body inside a context manager so the HTTP response is always closed.
with urlopen(req) as response:
    webpage = response.read()

# Parse the raw bytes with the built-in HTML parser (rebinds `soup`).
soup = BeautifulSoup(webpage, "html.parser")

In [9]:
# First <div class="plainlist"> on the page; its <a> texts are used as country
# names and its <span> texts as codes by the following cells.
table = soup.find("div", class_="plainlist")
countries, country_codes = table.find_all("a"), table.find_all("span")

In [10]:
# Text of every anchor collected from the plainlist block.
country_list = [anchor.text for anchor in countries]

In [11]:
# Text of every <span> collected from the plainlist block.
code_list = [span.text for span in country_codes]

In [12]:
# "Province of China" was at position 229 when this was written; that row is
# removed by position. NOTE(review): brittle - re-check if the page changes.
country_df = (
    pd.DataFrame(country_list)
    .drop([229])
    .reset_index(drop=True)          # renumber rows 0..n-1 after the drop
    .rename(columns={0: "Country"})
    .rename_axis("index")            # name the row index so it can be a merge key
)

In [13]:
# One row per scraped code, with the row index named "index" so it can be
# used as the merge key against the country-name table.
code_df = (
    pd.DataFrame(code_list)
    .reset_index(drop=True)
    .rename(columns={0: "CODES"})
    .rename_axis("index")
)

In [14]:
# Pair names and codes by row position (outer join on the shared "index"),
# index the result by country name, and shorten the United States label.
# NOTE(review): pairing is purely positional - assumes both lists line up.
country_code_df = (
    country_df.merge(code_df, on="index", how="outer")
    .set_index("Country")
    .rename(index={"United States of America": "United States"})
)
country_code_df.head()

Unnamed: 0_level_0,CODES
Country,Unnamed: 1_level_1
Aruba,ABW
Afghanistan,AFG
Angola,AGO
Anguilla,AIA
Åland Islands,ALA


In [15]:
# Right join keeps every country that has a code; countries without a scraped
# production figure get 0.
wine_production = (
    pd.merge(country_production_df, country_code_df, how="right", on="Country")
    .fillna(0)
)
wine_production.head()

Unnamed: 0_level_0,Wine Production,CODES
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Italy,54.8,ITA
France,49.1,FRA
Spain,44.4,ESP
United States,23.9,USA
Argentina,14.5,ARG


### Vineyard Data

In [16]:
# Download the vineyard article, sending a browser-like User-Agent header.
req = Request(VINEYARD_URL, headers={'User-Agent': 'Mozilla/5.0'})

# Read the body inside a context manager so the HTTP response is always closed.
with urlopen(req) as response:
    webpage = response.read()

# Parse the raw bytes with the built-in HTML parser (rebinds `soup`).
soup = BeautifulSoup(webpage, "html.parser")

In [17]:
# First table with id "tablepress-762"; its "column-2" cells are read as country
# names and its "column-3" cells as the vineyard figures by the following cells.
vineyard_table = soup.find_all("table", id="tablepress-762")[0]
vine_country, vine_acreage = (
    vineyard_table.find_all("td", class_=css_class)
    for css_class in ("column-2", "column-3")
)

In [18]:
# Text of every "column-2" cell.
vine_countries = [cell.text for cell in vine_country]

In [19]:
# Text of every "column-3" cell.
plot_sizes = [cell.text for cell in vine_acreage]

In [20]:
# Rows are dropped by position (presumably header/summary rows of the scraped
# table - verify against the page). reset_index() keeps the original row
# position in an "index" column, which the next cell uses as the merge key.
country_vy = (
    pd.DataFrame(vine_countries)
    .drop([0, 27, 28, 29, 30])
    .reset_index()
    # Raw string: '\*' in a normal string literal is an invalid escape sequence.
    .replace(r'\*', '', regex=True)
    .rename(columns={0: "Country"})
)
size_vy = (
    pd.DataFrame(plot_sizes)
    .drop([0, 27, 28])
    .reset_index()
    .rename(columns={0: "Largest Vineyards"})
)

In [21]:
# Join names and sizes on their original row position, then index by country
# and relabel "USA" as "United States".
vineyard_data = (
    country_vy.merge(size_vy, on="index")
    .drop(columns=["index"])
    .set_index("Country")
    .rename(index={"USA": "United States"})
)
vineyard_data.head()

Unnamed: 0_level_0,Largest Vineyards
Country,Unnamed: 1_level_1
Spain,969
China,875
France,789
Italy,702
Turkey,448


In [22]:
# Add the vineyard column to the production table; countries absent from the
# vineyard table get 0.
world_wine_vy = (
    wine_production
    .merge(vineyard_data, on="Country", how="left")
    .fillna(0)
)
world_wine_vy.head()

Unnamed: 0_level_0,Wine Production,CODES,Largest Vineyards
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Italy,54.8,ITA,702
France,49.1,FRA,789
Spain,44.4,ESP,969
United States,23.9,USA,430
Argentina,14.5,ARG,219


### Export Data

In [23]:
# Download the export ranking page, sending a browser-like User-Agent header.
req = Request(EXPORT_URL, headers={'User-Agent': 'Mozilla/5.0'})

# Read the body inside a context manager so the HTTP response is always closed.
with urlopen(req) as response:
    webpage = response.read()

# Parse the raw bytes with the built-in HTML parser (rebinds `soup`).
soup = BeautifulSoup(webpage, "html.parser")

In [24]:
# Every <div class="country-name"> in the page currently held by `soup`;
# the following cells read their text as the exporting country names.
export_tables = soup.find_all("div", class_="country-name")

In [25]:
# Raw (unstripped) text of each country-name element.
export_country = [node.text for node in export_tables]

In [26]:
# Despite the "_df" suffix this is a Series named "Country" whose row index is
# named "index", so it can be merged with the values on that key.
export_country_df = (
    pd.Series(export_country, name="Country")
    .str.strip()
    .rename_axis("index")
)
export_country_df.to_frame().head()

Unnamed: 0_level_0,Country
index,Unnamed: 1_level_1
0,France
1,Italy
2,Spain
3,Chile
4,Australia


In [27]:
# Every <td class="last-value"> in the page currently held by `soup`.
# NOTE(review): `export_values` is rebound two cells later to a list of cleaned
# strings - same name, different kind of object.
export_values = soup.find_all("td", class_="last-value")

In [28]:
# Raw text of each value cell.
export_string = [cell.text for cell in export_values]

In [29]:
# Strip surrounding whitespace and remove thousands separators so the strings
# can later be cast to float.
export_values = [raw.strip().replace(",", "") for raw in export_string]

In [114]:
# Single-column frame "Exports_Values" whose row index is named "index".
export_value_df = (
    pd.Series(export_values, name="Exports_Values")
    .str.strip()
    .rename_axis("index")
    .to_frame()
)

In [115]:
# Pair each country with its value by row position, cast to float, and add
# "Exports" as the natural log of the raw value.
export_df = (
    pd.merge(export_country_df, export_value_df, on="index")
    .set_index("Country")
    .astype(float)
    .assign(Exports=lambda frame: np.log(frame["Exports_Values"]))
)
export_df.head()

Unnamed: 0_level_0,Exports_Values,Exports
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
France,5804651000.0,22.481925
Italy,4771361000.0,22.285897
Spain,2054743000.0,21.443417
Chile,1646745000.0,21.222066
Australia,1310510000.0,20.993682


In [116]:
# Attach the export columns; countries missing from the export ranking keep NaN here.
world_wine_export = world_wine_vy.merge(export_df, on="Country", how="left")
world_wine_export.head()

Unnamed: 0_level_0,Wine Production,CODES,Largest Vineyards,Exports_Values,Exports
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Italy,54.8,ITA,702,4771361000.0,22.285897
France,49.1,FRA,789,5804651000.0,22.481925
Spain,44.4,ESP,969,2054743000.0,21.443417
United States,23.9,USA,430,1288961000.0,20.977102
Argentina,14.5,ARG,219,770027800.0,20.461937


### Import Data

In [33]:
# Download the import ranking page, sending a browser-like User-Agent header.
req = Request(IMPORT_URL, headers={'User-Agent': 'Mozilla/5.0'})

# Read the body inside a context manager so the HTTP response is always closed.
with urlopen(req) as response:
    webpage = response.read()

# Parse the raw bytes with the built-in HTML parser (rebinds `soup`).
soup = BeautifulSoup(webpage, "html.parser")

In [34]:
# Every <div class="country-name"> in the page currently held by `soup`;
# the following cells read their text as the importing country names.
import_tables = soup.find_all("div", class_="country-name")

In [35]:
# Raw (unstripped) text of each country-name element.
import_country = [node.text for node in import_tables]

In [36]:
# Despite the "_df" suffix this is a Series named "Country" whose row index is
# named "index", so it can be merged with the values on that key.
import_country_df = (
    pd.Series(import_country, name="Country")
    .str.strip()
    .rename_axis("index")
)
import_country_df.to_frame().head()

Unnamed: 0_level_0,Country
index,Unnamed: 1_level_1
0,United States
1,United Kingdom
2,China
3,Germany
4,Canada


In [37]:
# Every <td class="last-value"> in the page currently held by `soup`.
# NOTE(review): `import_values` is rebound two cells later to a list of cleaned
# strings - same name, different kind of object.
import_values = soup.find_all("td", class_="last-value")

In [38]:
# Raw text of each value cell.
import_string = [cell.text for cell in import_values]

In [57]:
# Strip surrounding whitespace and remove thousands separators so the strings
# can later be cast to float.
import_values = [raw.strip().replace(",", "") for raw in import_string]

In [117]:
# Series named "Imports_Values" (still strings) whose row index is named "index".
import_value_df = (
    pd.Series(import_values, name="Imports_Values")
    .str.strip()
    .rename_axis("index")
)
import_value_df.to_frame().head()

Unnamed: 0_level_0,Imports_Values
index,Unnamed: 1_level_1
0,4646536989.52
1,2599546724.13
2,2374025494.8
3,1809853200.33
4,1657432367.71


In [118]:
# Pair each country with its value by row position, cast to float, and add
# "Imports" as the natural log of the raw value.
import_df = (
    pd.merge(import_country_df, import_value_df, on="index")
    .set_index("Country")
    .astype(float)
    .assign(Imports=lambda frame: np.log(frame["Imports_Values"]))
)
import_df.head()

Unnamed: 0_level_0,Imports_Values,Imports
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
United States,4646537000.0,22.259388
United Kingdom,2599547000.0,21.678603
China,2374025000.0,21.587853
Germany,1809853000.0,21.316512
Canada,1657432000.0,21.228535


In [119]:
# Attach the import columns; countries missing from the import ranking keep NaN here.
world_wine_import = world_wine_export.merge(import_df, on="Country", how="left")
world_wine_import.head()

Unnamed: 0_level_0,Wine Production,CODES,Largest Vineyards,Exports_Values,Exports,Imports_Values,Imports
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Italy,54.8,ITA,702,4771361000.0,22.285897,68575680.0,18.043449
France,49.1,FRA,789,5804651000.0,22.481925,462240200.0,19.951595
Spain,44.4,ESP,969,2054743000.0,21.443417,84317180.0,18.250096
United States,23.9,USA,430,1288961000.0,20.977102,4646537000.0,22.259388
Argentina,14.5,ARG,219,770027800.0,20.461937,2563509.0,14.756888


### Consumption Data

In [120]:
# Download the wine-consumption ranking page, sending a browser-like User-Agent header.
req = Request(CONSUMPTION_URL, headers={'User-Agent': 'Mozilla/5.0'})

# Read the body inside a context manager so the HTTP response is always closed.
with urlopen(req) as response:
    webpage = response.read()

# Parse the raw bytes with the built-in HTML parser (rebinds `soup`).
soup = BeautifulSoup(webpage, "html.parser")

In [121]:
# Every <div class="country-name"> in the page currently held by `soup`;
# the following cells read their text as the consuming country names.
consumption_tables = soup.find_all("div", class_="country-name")

In [122]:
# Raw (unstripped) text of each country-name element.
consumption_country = [node.text for node in consumption_tables]

In [123]:
# Despite the "_df" suffix this is a Series named "Country" whose row index is
# named "index", so it can be merged with the values on that key.
consumption_country_df = (
    pd.Series(consumption_country, name="Country")
    .str.strip()
    .rename_axis("index")
)
consumption_country_df.to_frame().head()

Unnamed: 0_level_0,Country
index,Unnamed: 1_level_1
0,United States
1,France
2,Italy
3,Germany
4,China


In [124]:
# Every <td class="last-value"> in the page currently held by `soup`;
# the following cells extract and clean their text.
consumption_values = soup.find_all("td", class_="last-value")

In [125]:
# Raw text of each value cell.
consumption_string = [cell.text for cell in consumption_values]

In [126]:
# Strip surrounding whitespace and remove thousands separators so the strings
# can later be cast to float.
consumption_value = [raw.strip().replace(",", "") for raw in consumption_string]

In [127]:
# Series named "Consumption" (still strings) whose row index is named "index".
consumption_value_df = (
    pd.Series(consumption_value, name="Consumption")
    .str.strip()
    .rename_axis("index")
)
consumption_value_df.to_frame().head()

Unnamed: 0_level_0,Consumption
index,Unnamed: 1_level_1
0,32597.0
1,26196.0
2,21209.0
3,20356.0
4,18776.0


In [128]:
# Pair each country with its value by row position, cast to float, and scale
# the figures down by a factor of 1000.
consumption_df = (
    pd.merge(consumption_country_df, consumption_value_df, on="index")
    .set_index("Country")
    .astype(float)
    .div(1000)
)

In [133]:
# Attach consumption and replace every remaining NaN with 0.
# NOTE(review): this also zero-fills the log columns "Exports"/"Imports" for
# countries missing from those rankings.
world_wine_data = (
    world_wine_import
    .merge(consumption_df, on="Country", how="left")
    .fillna(0)
)
world_wine_data.head()

Unnamed: 0_level_0,Wine Production,CODES,Largest Vineyards,Exports_Values,Exports,Imports_Values,Imports,Consumption
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Italy,54.8,ITA,702,4771361000.0,22.285897,68575680.0,18.043449,21.209
France,49.1,FRA,789,5804651000.0,22.481925,462240200.0,19.951595,26.196
Spain,44.4,ESP,969,2054743000.0,21.443417,84317180.0,18.250096,9.445
United States,23.9,USA,430,1288961000.0,20.977102,4646537000.0,22.259388,32.597
Argentina,14.5,ARG,219,770027800.0,20.461937,2563509.0,14.756888,9.351


In [134]:
# Render the raw dollar amounts as "$1,234.56" strings. Only numeric columns are
# formatted, so re-running this cell after the columns already hold strings is a
# no-op instead of raising ValueError.
for money_col in ("Exports_Values", "Imports_Values"):
    if pd.api.types.is_numeric_dtype(world_wine_data[money_col]):
        world_wine_data[money_col] = (
            world_wine_data[money_col].astype(object).map('${:,.2f}'.format)
        )

In [135]:
# Display the assembled table (the two *_Values columns hold formatted strings
# once the preceding cell has run).
world_wine_data

Unnamed: 0_level_0,Wine Production,CODES,Largest Vineyards,Exports_Values,Exports,Imports_Values,Imports,Consumption
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Italy,54.8,ITA,702,"$4,771,361,172.94",22.285897,"$68,575,684.80",18.043449,21.209
France,49.1,FRA,789,"$5,804,650,668.68",22.481925,"$462,240,240.98",19.951595,26.196
Spain,44.4,ESP,969,"$2,054,743,135.27",21.443417,"$84,317,181.86",18.250096,9.445
United States,23.9,USA,430,"$1,288,960,997.30",20.977102,"$4,646,536,989.52",22.259388,32.597
Argentina,14.5,ARG,219,"$770,027,827.77",20.461937,"$2,563,509.00",14.756888,9.351
...,...,...,...,...,...,...,...,...
Wallis and Futuna,0,WLF,0,$0.00,0.000000,$0.00,0.000000,0.000
Samoa,0,WSM,0,"$8,868.00",9.090205,"$15,400.14",9.642132,0.000
Yemen,0,YEM,0,$0.00,0.000000,$0.00,0.000000,0.000
Zambia,0,ZMB,0,"$289,584.00",12.576201,"$2,519,971.00",14.739758,0.000


In [136]:
# Write the combined table to "world_wine_data.csv" (path is relative to the
# working directory).
world_wine_data.to_csv("world_wine_data.csv")