In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import lxml
from urllib.request import Request, urlopen

### URLs Used

In [2]:
WINE_REGION_URL2 = "https://www.tonymappedit.com/top-10-wine-producing-countries/"
WIKI_COUNTRIES = "https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3"
VINEYARD_URL = "https://www.bkwine.com/features/more/world-wine-production-reaches-record-level-2018-consumption-stable/"
EXPORT_URL = "https://www.nationmaster.com/nmx/ranking/export-of-fortified-wine-or-must"
IMPORT_URL = "https://www.nationmaster.com/nmx/ranking/import-of-fortified-wine-or-must"
CONSUMPTION_URL = "https://www.nationmaster.com/nmx/ranking/wine-consumption"

### Wine Production Data

In [3]:
'''scrape wine region data with coresponding wines'''
req = Request(WINE_REGION_URL2 , headers={'User-Agent': 'Mozilla/5.0'})

webpage = urlopen(req).read()

# html_text = requests.get(WINE_REGION_URL).text
soup = BeautifulSoup(webpage, "html.parser")

In [4]:
country1 = []
countries = soup.find_all("ol")[1]
items = countries.find_all("li")

for country in items:
    country1.append(country.text.split(": "))

In [5]:
country2 = []
countries2 = soup.find_all("ol")[2]
items = countries2.find_all("li")

for country in items:
    country2.append(country.text.split(": "))

In [6]:
country_production = country1 + country2
# country_production

In [7]:
country_production_df = pd.DataFrame(country_production).rename(columns = {0: "Country", 1: "Wine Production"}).set_index("Country")
country_production_df.astype("float64")

Unnamed: 0_level_0,Wine Production
Country,Unnamed: 1_level_1
Italy,54.8
France,49.1
Spain,44.4
United States,23.9
Argentina,14.5
Chile,12.9
Australia,12.9
Germany,9.8
South Africa,9.5
China,9.3


### Country Code Data

In [8]:
'''scrape wine region data with coresponding wines'''
req = Request(WIKI_COUNTRIES , headers={'User-Agent': 'Mozilla/5.0'})

webpage = urlopen(req).read()

# html_text = requests.get(WINE_REGION_URL).text
soup = BeautifulSoup(webpage, "html.parser")

In [9]:
table = soup.find("div", class_="plainlist")
country_codes = table.find_all("span")
countries = table.find_all("a")

In [10]:
country_list = []
for country in countries:
    country_list.append(country.text)

In [11]:
code_list = []
for code in country_codes:
    code_list.append(code.text)

In [12]:
#Province of China was index 229. 
country_df = pd.DataFrame(country_list).drop([11, 229]).reset_index().drop(columns = ["index"])
country_df = country_df.rename(columns = {0: "Country"}).reset_index().set_index("index")

In [13]:
code_df = pd.DataFrame(code_list).drop([11]).reset_index().drop(columns = ["index"])
code_df = code_df.rename(columns = {0: "CODES"}).reset_index().set_index("index")

In [14]:
country_code_df = pd.merge(country_df, code_df, on="index", how="outer").set_index("Country").rename(index = {"United States of America" : "United States"})
country_code_df

Unnamed: 0_level_0,CODES
Country,Unnamed: 1_level_1
Aruba,ABW
Afghanistan,AFG
Angola,AGO
Anguilla,AIA
Åland Islands,ALA
...,...
Samoa,WSM
Yemen,YEM
South Africa,ZAF
Zambia,ZMB


In [15]:
wine_production = pd.merge(country_production_df,country_code_df,  how="right", on="Country").rename(columns = {'Alpha-3 code':'CODE'}).fillna(0)
wine_production

Unnamed: 0_level_0,Wine Production,CODES
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Italy,54.8,ITA
France,49.1,FRA
Spain,44.4,ESP
United States,23.9,USA
Argentina,14.5,ARG
...,...,...
Wallis and Futuna,0,WLF
Samoa,0,WSM
Yemen,0,YEM
Zambia,0,ZMB


### Vineyard Data

In [16]:
'''scrape wine region data with coresponding wines'''
req = Request(VINEYARD_URL , headers={'User-Agent': 'Mozilla/5.0'})

webpage = urlopen(req).read()

soup = BeautifulSoup(webpage, "html.parser")

In [17]:
vineyard_table = soup.find_all("table", id="tablepress-762")[0]
vine_country = vineyard_table.find_all("td", class_="column-2")
vine_acreage = vineyard_table.find_all("td", class_="column-3")

In [18]:
vine_countries = []
for country in vine_country:
    vine_countries.append(country.text)

In [19]:
plot_sizes = []
for plot_size in vine_acreage:
    plot_sizes.append(plot_size.text)

In [20]:
country_vy = pd.DataFrame(vine_countries).drop([0,27,28,29,30]).reset_index().replace('\*','',regex=True).rename(columns = {0: "Country"})
size_vy = pd.DataFrame(plot_sizes).drop([0,27,28]).reset_index().rename(columns = {0: "Largest Vineyards"})

In [21]:
vineyard_data = pd.merge(country_vy, size_vy, on="index").drop(columns = ["index"]).set_index("Country").rename(index = {"USA" : "United States"})
vineyard_data

Unnamed: 0_level_0,Largest Vineyards
Country,Unnamed: 1_level_1
Spain,969
China,875
France,789
Italy,702
Turkey,448
United States,430
Argentina,219
Chile,212
Portugal,192
Romania,191


In [22]:
world_wine_vy = pd.merge(wine_production, vineyard_data, on="Country", how="left").fillna(0)
world_wine_vy

Unnamed: 0_level_0,Wine Production,CODES,Largest Vineyards
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Italy,54.8,ITA,702
France,49.1,FRA,789
Spain,44.4,ESP,969
United States,23.9,USA,430
Argentina,14.5,ARG,219
...,...,...,...
Wallis and Futuna,0,WLF,0
Samoa,0,WSM,0
Yemen,0,YEM,0
Zambia,0,ZMB,0


### Export Data

In [23]:
'''scrape wine region data with coresponding wines'''
req = Request(EXPORT_URL , headers={'User-Agent': 'Mozilla/5.0'})

webpage = urlopen(req).read()

soup = BeautifulSoup(webpage, "html.parser")

In [24]:
export_tables = soup.find_all("div", class_="country-name")

In [25]:
export_country = []
for export in export_tables:
    export_country.append(export.text)

In [26]:
export_country_df = pd.Series(export_country).str.strip().rename_axis("index").rename("Country")
export_country_df.to_frame()

Unnamed: 0_level_0,Country
index,Unnamed: 1_level_1
0,France
1,Italy
2,Spain
3,Chile
4,Australia
...,...
124,Tanzania
125,Ecuador
126,Sao Tome and Principe
127,Bahamas


In [27]:
export_values = soup.find_all("td", class_="last-value")

In [28]:
export_string = []
for export in export_values:
    export_string.append(export.text)

In [29]:
export_values = []
for e in export_string:
    export_values.append(e.strip().replace(",",""))

In [30]:
export_value_df = pd.Series(export_values).str.strip().rename_axis("index").rename("Exports")
export_value_df = export_value_df.to_frame()

In [31]:
export_df = pd.merge(export_country_df,export_value_df, on="index").set_index("Country")
export_df

Unnamed: 0_level_0,Exports
Country,Unnamed: 1_level_1
France,5804650668.68
Italy,4771361172.94
Spain,2054743135.27
Chile,1646745073.75
Australia,1310510285.96
...,...
Tanzania,1273.00
Ecuador,1106.00
Sao Tome and Principe,179.00
Bahamas,50.00


In [32]:
world_wine_export = pd.merge(world_wine_vy, export_df, on="Country", how="left")
world_wine_export

Unnamed: 0_level_0,Wine Production,CODES,Largest Vineyards,Exports
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Italy,54.8,ITA,702,4771361172.94
France,49.1,FRA,789,5804650668.68
Spain,44.4,ESP,969,2054743135.27
United States,23.9,USA,430,1288960997.30
Argentina,14.5,ARG,219,770027827.77
...,...,...,...,...
Wallis and Futuna,0,WLF,0,
Samoa,0,WSM,0,8868.00
Yemen,0,YEM,0,
Zambia,0,ZMB,0,289584.00


### Import Data

In [33]:
'''scrape wine region data with coresponding wines'''
req = Request(IMPORT_URL , headers={'User-Agent': 'Mozilla/5.0'})

webpage = urlopen(req).read()

soup = BeautifulSoup(webpage, "html.parser")

In [34]:
import_tables = soup.find_all("div", class_="country-name")

In [35]:
import_country = []
for unit in import_tables:
    import_country.append(unit.text)

In [36]:
import_country_df = pd.Series(import_country).str.strip().rename_axis("index").rename("Country")
import_country_df.to_frame()

Unnamed: 0_level_0,Country
index,Unnamed: 1_level_1
0,United States
1,United Kingdom
2,China
3,Germany
4,Canada
...,...
151,Palestine
152,Brunei
153,Montserrat
154,Gambia


In [37]:
import_values = soup.find_all("td", class_="last-value")

In [38]:
import_string = []
for unit in import_values:
    import_string.append(unit.text)

In [39]:
import_values = []
for unit in import_string:
    import_values.append(unit.strip().replace(",",""))

In [40]:
import_value_df = pd.Series(import_values).str.strip().rename_axis("index").rename("Imports")
import_value_df.to_frame()

Unnamed: 0_level_0,Imports
index,Unnamed: 1_level_1
0,4646536989.52
1,2599546724.13
2,2374025494.80
3,1809853200.33
4,1657432367.71
...,...
151,13599.00
152,13244.00
153,7328.00
154,2045.00


In [41]:
import_df = pd.merge(import_country_df,import_value_df, on="index").set_index("Country")
import_df

Unnamed: 0_level_0,Imports
Country,Unnamed: 1_level_1
United States,4646536989.52
United Kingdom,2599546724.13
China,2374025494.80
Germany,1809853200.33
Canada,1657432367.71
...,...
Palestine,13599.00
Brunei,13244.00
Montserrat,7328.00
Gambia,2045.00


In [42]:
world_wine_import = pd.merge(world_wine_export, import_df, on="Country", how="left")
world_wine_import

Unnamed: 0_level_0,Wine Production,CODES,Largest Vineyards,Exports,Imports
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Italy,54.8,ITA,702,4771361172.94,68575684.80
France,49.1,FRA,789,5804650668.68,462240240.98
Spain,44.4,ESP,969,2054743135.27,84317181.86
United States,23.9,USA,430,1288960997.30,4646536989.52
Argentina,14.5,ARG,219,770027827.77,2563509.00
...,...,...,...,...,...
Wallis and Futuna,0,WLF,0,,
Samoa,0,WSM,0,8868.00,15400.14
Yemen,0,YEM,0,,
Zambia,0,ZMB,0,289584.00,2519971.00


### Consumption Data

In [43]:
'''scrape wine region data with coresponding wines'''
req = Request(CONSUMPTION_URL , headers={'User-Agent': 'Mozilla/5.0'})

webpage = urlopen(req).read()

soup = BeautifulSoup(webpage, "html.parser")

In [44]:
consumption_tables = soup.find_all("div", class_="country-name")

In [45]:
consumption_country = []
for consumption in consumption_tables:
    consumption_country.append(consumption.text)

In [46]:
consumption_country_df = pd.Series(consumption_country).str.strip().rename_axis("index").rename("Country")
consumption_country_df.to_frame()

Unnamed: 0_level_0,Country
index,Unnamed: 1_level_1
0,United States
1,France
2,Italy
3,Germany
4,China
5,United Kingdom
6,Spain
7,Argentina
8,Russia
9,Australia


In [47]:
consumption_values = soup.find_all("td", class_="last-value")

In [48]:
consumption_string = []
for consumption in consumption_values:
    consumption_string.append(consumption.text)

In [49]:
consumption_value = []
for consumption in consumption_string:
    consumption_value.append(consumption.strip().replace(",",""))

In [50]:
consumption_value_df = pd.Series(consumption_value).str.strip().rename_axis("index").rename("Consumption")
consumption_value_df.to_frame()

Unnamed: 0_level_0,Consumption
index,Unnamed: 1_level_1
0,32597.0
1,26196.0
2,21209.0
3,20356.0
4,18776.0
5,13045.0
6,9445.0
7,9351.0
8,9202.0
9,5511.0


In [51]:
consumption_df = pd.merge(consumption_country_df,consumption_value_df, on="index").set_index("Country")
consumption_df = consumption_df.astype(float)/1000

In [52]:
world_wine_data = pd.merge(world_wine_import, consumption_df, on="Country", how="left").fillna(0)
world_wine_data

Unnamed: 0_level_0,Wine Production,CODES,Largest Vineyards,Exports,Imports,Consumption
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Italy,54.8,ITA,702,4771361172.94,68575684.80,21.209
France,49.1,FRA,789,5804650668.68,462240240.98,26.196
Spain,44.4,ESP,969,2054743135.27,84317181.86,9.445
United States,23.9,USA,430,1288960997.30,4646536989.52,32.597
Argentina,14.5,ARG,219,770027827.77,2563509.00,9.351
...,...,...,...,...,...,...
Wallis and Futuna,0,WLF,0,0,0,0.000
Samoa,0,WSM,0,8868.00,15400.14,0.000
Yemen,0,YEM,0,0,0,0.000
Zambia,0,ZMB,0,289584.00,2519971.00,0.000


In [53]:
world_wine_data["Exports"] = world_wine_data["Exports"].astype(float)
world_wine_data["Imports"].astype(float)

Country
Italy                6.857568e+07
France               4.622402e+08
Spain                8.431718e+07
United States        4.646537e+09
Argentina            2.563509e+06
                         ...     
Wallis and Futuna    0.000000e+00
Samoa                1.540014e+04
Yemen                0.000000e+00
Zambia               2.519971e+06
Zimbabwe             2.639371e+06
Name: Imports, Length: 248, dtype: float64

In [54]:
world_wine_data

Unnamed: 0_level_0,Wine Production,CODES,Largest Vineyards,Exports,Imports,Consumption
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Italy,54.8,ITA,702,4.771361e+09,68575684.80,21.209
France,49.1,FRA,789,5.804651e+09,462240240.98,26.196
Spain,44.4,ESP,969,2.054743e+09,84317181.86,9.445
United States,23.9,USA,430,1.288961e+09,4646536989.52,32.597
Argentina,14.5,ARG,219,7.700278e+08,2563509.00,9.351
...,...,...,...,...,...,...
Wallis and Futuna,0,WLF,0,0.000000e+00,0,0.000
Samoa,0,WSM,0,8.868000e+03,15400.14,0.000
Yemen,0,YEM,0,0.000000e+00,0,0.000
Zambia,0,ZMB,0,2.895840e+05,2519971.00,0.000


In [55]:
world_wine_data.to_csv("world_wine_data.csv")