In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import lxml
from urllib.request import Request, urlopen
import numpy as np

### URLs Used

In [2]:
# Page scraped in the "Wine Production Data" section.
WINE_REGION_URL2 = "https://www.tonymappedit.com/top-10-wine-producing-countries/"

# Page scraped in the "Country Code Data" section (ISO 3166-1 alpha-3 list).
WIKI_COUNTRIES = "https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3"

# Page scraped in the "Vineyard Data" section.
VINEYARD_URL = "https://www.bkwine.com/features/more/world-wine-production-reaches-record-level-2018-consumption-stable/"

# NationMaster ranking pages scraped in the consumption, export and import sections.
# NOTE(review): the export/import rankings are for "fortified wine or must" per the URL,
# not all wine - confirm this is the intended series.
CONSUMPTION_URL = "https://www.nationmaster.com/nmx/ranking/wine-consumption"
EXPORT_URL = "https://www.nationmaster.com/nmx/ranking/export-of-fortified-wine-or-must"
IMPORT_URL = "https://www.nationmaster.com/nmx/ranking/import-of-fortified-wine-or-must"

### Wine Production Data

In [None]:
# Download the wine-production page and parse it.
# A browser-like User-Agent header is sent with the request
# (presumably the site rejects urllib's default one - confirm).
req = Request(WINE_REGION_URL2, headers={'User-Agent': 'Mozilla/5.0'})

# Read inside `with` so the HTTP response is closed once the body is in memory.
with urlopen(req) as response:
    webpage = response.read()

soup = BeautifulSoup(webpage, "html.parser")

In [None]:
# The <ol> at position 1 on the page supplies the first batch of list items.
countries = soup.find_all("ol")[1]
items = countries.find_all("li")

# Each item's text is split on ": "; downstream the parts become the
# "Country" and "Wine Production" columns.
country1 = [entry.text.split(": ") for entry in items]

In [None]:
# The <ol> at position 2 on the page supplies the second batch of list items.
countries2 = soup.find_all("ol")[2]
items = countries2.find_all("li")

# Same ": " split as for the first batch.
country2 = [entry.text.split(": ") for entry in items]

In [None]:
# Join both batches into one list of rows (first batch first).
country_production = [*country1, *country2]

In [None]:
# Build the production table indexed by country.
# The float conversion is part of the assignment: previously its result was
# only displayed and then discarded, leaving the stored values as strings.
country_production_df = (
    pd.DataFrame(country_production)
    .rename(columns={0: "Country", 1: "Wine Production"})
    .set_index("Country")
    .astype("float64")
)
country_production_df

### Country Code Data

In [None]:
# Download the ISO 3166-1 alpha-3 country-code page and parse it.
# A browser-like User-Agent header is sent with the request.
req = Request(WIKI_COUNTRIES, headers={'User-Agent': 'Mozilla/5.0'})

# Read inside `with` so the HTTP response is closed once the body is in memory.
with urlopen(req) as response:
    webpage = response.read()

soup = BeautifulSoup(webpage, "html.parser")

In [None]:
# Take the first <div class="plainlist"> on the page, then collect every <span>
# (used downstream as the codes) and every <a> (used downstream as the country names).
# NOTE(review): soup.find returns None when nothing matches, which would make the
# next line raise AttributeError - confirm the page still has this div.
table = soup.find("div", class_="plainlist")
country_codes = table.find_all("span")
countries = table.find_all("a")

In [None]:
# Text of every <a> tag, in document order.
country_list = [link.text for link in countries]

In [None]:
# Text of every <span> tag, in document order.
code_list = [span.text for span in country_codes]

In [None]:
# Row 229 is removed; per the original note it held the "Province of China" entry.
# NOTE(review): the hard-coded position depends on the current page layout - confirm.
# After renumbering, the row position is exposed as an index named "index" so the
# names can be joined to the codes by position.
country_df = (
    pd.DataFrame(country_list)
    .drop(index=[229])
    .reset_index(drop=True)
    .rename(columns={0: "Country"})
    .reset_index()
    .set_index("index")
)

In [None]:
# One "CODES" column, with the row position exposed as an index named "index"
# (the join key used against country_df).
code_df = (
    pd.DataFrame(code_list)
    .reset_index(drop=True)
    .rename(columns={0: "CODES"})
    .reset_index()
    .set_index("index")
)

In [None]:
# Pair names and codes by row position, index the result by country name, and
# rename the US row to "United States".
# NOTE(review): assumes the name and code lists line up one-to-one once row 229
# is dropped from the names - confirm.
country_code_df = (
    country_df
    .merge(code_df, how="outer", on="index")
    .set_index("Country")
    .rename(index={"United States of America": "United States"})
)
country_code_df

In [None]:
# Right join: keep every country from the code table and attach production
# figures where available; missing values are filled with 0.
wine_production = (
    country_production_df
    .merge(country_code_df, how="right", on="Country")
    # NOTE(review): the code column is named "CODES" at this point, so this rename
    # appears to match nothing - confirm which column name is intended.
    .rename(columns={'Alpha-3 code': 'CODE'})
    .fillna(0)
)
wine_production

### Vineyard Data

In [None]:
# Download the vineyard page and parse it.
# A browser-like User-Agent header is sent with the request.
req = Request(VINEYARD_URL, headers={'User-Agent': 'Mozilla/5.0'})

# Read inside `with` so the HTTP response is closed once the body is in memory.
with urlopen(req) as response:
    webpage = response.read()

soup = BeautifulSoup(webpage, "html.parser")

In [None]:
# First <table id="tablepress-762"> on the page; raises IndexError if none is found.
vineyard_table = soup.find_all("table", id="tablepress-762")[0]
# Cells of column 2 (used downstream as country names) and column 3
# (used downstream as the "Largest Vineyards" values).
vine_country = vineyard_table.find_all("td", class_="column-2")
vine_acreage = vineyard_table.find_all("td", class_="column-3")

In [None]:
# Text of every column-2 cell, in table order.
vine_countries = [cell.text for cell in vine_country]

In [None]:
# Text of every column-3 cell, in table order.
plot_sizes = [cell.text for cell in vine_acreage]

In [None]:
# Country names: drop rows 0 and 27-30, keep the original row position in an
# "index" column (the join key), and strip literal asterisks from the text.
# NOTE(review): the dropped rows are presumably header/summary rows, and the two
# drop lists differ (27-30 here vs 27-28 below) - confirm against the page.
country_vy = (
    pd.DataFrame(vine_countries)
    .drop([0, 27, 28, 29, 30])
    .reset_index()
    # Raw string: '\*' in a normal literal is an invalid escape sequence.
    .replace(r'\*', '', regex=True)
    .rename(columns={0: "Country"})
)

# Vineyard sizes: drop rows 0, 27 and 28 and keep the original row position
# in an "index" column.
size_vy = (
    pd.DataFrame(plot_sizes)
    .drop([0, 27, 28])
    .reset_index()
    .rename(columns={0: "Largest Vineyards"})
)

In [None]:
# Join names and sizes on their original row position, discard the join key,
# index by country and rename "USA" to "United States".
# NOTE(review): "Largest Vineyards" is never converted from text in this notebook - confirm.
vineyard_data = (
    country_vy
    .merge(size_vy, on="index")
    .drop(columns=["index"])
    .set_index("Country")
    .rename(index={"USA": "United States"})
)
vineyard_data

In [None]:
# Left join: keep every row of wine_production, attach vineyard data where the
# country matches, and fill the gaps with 0.
world_wine_vy = (
    wine_production
    .merge(vineyard_data, how="left", on="Country")
    .fillna(0)
)
world_wine_vy

### Export Data

In [None]:
# Download the export ranking page and parse it.
# A browser-like User-Agent header is sent with the request.
req = Request(EXPORT_URL, headers={'User-Agent': 'Mozilla/5.0'})

# Read inside `with` so the HTTP response is closed once the body is in memory.
with urlopen(req) as response:
    webpage = response.read()

soup = BeautifulSoup(webpage, "html.parser")

In [None]:
# Every <div class="country-name"> on the export page (one per listed country name).
export_tables = soup.find_all("div", class_="country-name")

In [None]:
# Raw text of each country-name tag (whitespace is stripped in a later cell).
export_country = [tag.text for tag in export_tables]

In [None]:
# Strip whitespace from the names and store them as a one-column frame
# ("Country") whose index is named "index", the positional join key.
# The to_frame() result is kept: previously it was displayed and discarded,
# leaving a Series under a *_df name.
export_country_df = (
    pd.Series(export_country)
    .str.strip()
    .rename_axis("index")
    .rename("Country")
    .to_frame()
)
export_country_df

In [None]:
# Every <td class="last-value"> on the export page (the value cells).
export_values = soup.find_all("td", class_="last-value")

In [None]:
# Raw text of each value cell.
export_string = [cell.text for cell in export_values]

In [None]:
# Strip whitespace and remove thousands separators from each value.
# This rebinds export_values from the scraped tags to plain strings.
export_values = [raw.strip().replace(",", "") for raw in export_string]

In [None]:
# One-column frame ("Exports") whose index is named "index", the positional join key.
export_value_df = (
    pd.Series(export_values)
    .str.strip()
    .rename_axis("index")
    .rename("Exports")
    .to_frame()
)

In [None]:
# Join names and values by row position, index by country, convert to float,
# then add the natural log of the export values.
# NOTE(review): a value of 0 in "Exports" yields -inf in "Log_Exports" - confirm
# that is acceptable downstream.
export_df = (
    pd.merge(export_country_df, export_value_df, on="index")
    .set_index("Country")
    .astype(float)
    .assign(Log_Exports=lambda frame: np.log(frame["Exports"]))
)
export_df

In [None]:
# Left join: attach export figures to the running table by country.
world_wine_export = world_wine_vy.merge(export_df, how="left", on="Country")
world_wine_export

### Import Data

In [None]:
# Download the import ranking page and parse it.
# A browser-like User-Agent header is sent with the request.
req = Request(IMPORT_URL, headers={'User-Agent': 'Mozilla/5.0'})

# Read inside `with` so the HTTP response is closed once the body is in memory.
with urlopen(req) as response:
    webpage = response.read()

soup = BeautifulSoup(webpage, "html.parser")

In [None]:
# Every <div class="country-name"> on the import page (one per listed country name).
import_tables = soup.find_all("div", class_="country-name")

In [None]:
# Raw text of each country-name tag (whitespace is stripped in a later cell).
import_country = [tag.text for tag in import_tables]

In [None]:
# Strip whitespace from the names and store them as a one-column frame
# ("Country") whose index is named "index", the positional join key.
# The to_frame() result is kept: previously it was displayed and discarded,
# leaving a Series under a *_df name.
import_country_df = (
    pd.Series(import_country)
    .str.strip()
    .rename_axis("index")
    .rename("Country")
    .to_frame()
)
import_country_df

In [None]:
# Every <td class="last-value"> on the import page (the value cells).
import_values = soup.find_all("td", class_="last-value")

In [None]:
# Raw text of each value cell.
import_string = [cell.text for cell in import_values]

In [None]:
# Strip whitespace and remove thousands separators from each value.
# This rebinds import_values from the scraped tags to plain strings.
import_values = [raw.strip().replace(",", "") for raw in import_string]

In [None]:
# One-column frame ("Imports") whose index is named "index", the positional join key.
# The to_frame() result is kept: previously it was displayed and discarded,
# unlike the matching export cell.
import_value_df = (
    pd.Series(import_values)
    .str.strip()
    .rename_axis("index")
    .rename("Imports")
    .to_frame()
)
import_value_df

In [None]:
# Join names and values by row position, index by country, convert to float,
# then add the natural log of the import values.
# NOTE(review): a value of 0 in "Imports" yields -inf in "Log_Imports" - confirm
# that is acceptable downstream.
import_df = (
    pd.merge(import_country_df, import_value_df, on="index")
    .set_index("Country")
    .astype(float)
    .assign(Log_Imports=lambda frame: np.log(frame["Imports"]))
)
import_df

In [None]:
# Left join: attach import figures to the running table by country.
world_wine_import = world_wine_export.merge(import_df, how="left", on="Country")
world_wine_import

### Consumption Data

In [None]:
# Download the wine-consumption ranking page and parse it.
# A browser-like User-Agent header is sent with the request.
req = Request(CONSUMPTION_URL, headers={'User-Agent': 'Mozilla/5.0'})

# Read inside `with` so the HTTP response is closed once the body is in memory.
with urlopen(req) as response:
    webpage = response.read()

soup = BeautifulSoup(webpage, "html.parser")

In [None]:
# Every <div class="country-name"> on the consumption page (one per listed country name).
consumption_tables = soup.find_all("div", class_="country-name")

In [None]:
# Raw text of each country-name tag (whitespace is stripped in a later cell).
consumption_country = [tag.text for tag in consumption_tables]

In [None]:
# Strip whitespace from the names and store them as a one-column frame
# ("Country") whose index is named "index", the positional join key.
# The to_frame() result is kept: previously it was displayed and discarded,
# leaving a Series under a *_df name.
consumption_country_df = (
    pd.Series(consumption_country)
    .str.strip()
    .rename_axis("index")
    .rename("Country")
    .to_frame()
)
consumption_country_df

In [None]:
# Every <td class="last-value"> on the consumption page (the value cells).
consumption_values = soup.find_all("td", class_="last-value")

In [None]:
# Raw text of each value cell.
consumption_string = [cell.text for cell in consumption_values]

In [None]:
# Strip whitespace and remove thousands separators from each value.
consumption_value = [raw.strip().replace(",", "") for raw in consumption_string]

In [None]:
# One-column frame ("Consumption") whose index is named "index", the positional join key.
# The to_frame() result is kept: previously it was displayed and discarded,
# unlike the matching export cell.
consumption_value_df = (
    pd.Series(consumption_value)
    .str.strip()
    .rename_axis("index")
    .rename("Consumption")
    .to_frame()
)
consumption_value_df

In [None]:
# Join names and values by row position, index by country, convert to float
# and divide every value by 1000.
consumption_df = (
    pd.merge(consumption_country_df, consumption_value_df, on="index")
    .set_index("Country")
    .astype(float)
    .div(1000)
)

In [None]:
# Left join: attach consumption to the running table, then replace every
# remaining missing value with 0.
# NOTE(review): this also fills the log columns with 0 for countries that have
# no export/import data - confirm that is intended.
world_wine_data = (
    world_wine_import
    .merge(consumption_df, how="left", on="Country")
    .fillna(0)
)
world_wine_data

In [None]:
# Store both trade columns as float. The "Imports" conversion was previously
# computed and discarded instead of being written back like "Exports".
world_wine_data["Exports"] = world_wine_data["Exports"].astype(float)
world_wine_data["Imports"] = world_wine_data["Imports"].astype(float)

# Show the resulting dtypes as the cell output.
world_wine_data[["Exports", "Imports"]].dtypes

In [None]:
# Display the combined table before it is written to disk.
world_wine_data

In [None]:
# Write the combined table to "world_wine_data.csv", relative to the current
# working directory.
world_wine_data.to_csv("world_wine_data.csv")