### 1. Introduction
The objective is to create a spatial time series dataset of European Bank for Reconstruction and Development (EBRD) projects. 

First, the code scrapes the following information from 4000+ EBRD project HTML pages: https://www.ebrd.com/work-with-us/project-finance/project-summary-documents.html

        1. Publication Date
        2. ProjectID
        3. Country
        4. Title
        5. Sector
        6. Project Type
        7. Project Status
        8. Link
        
        and additionally :
        9. Environmental Category
        10. Approval Date
        11. PSD disclosed date
        12. Project Description
        13. Total Project Cost
        14. Location (currently, there is no location data on the web pages)


Secondly, the code uses the geographic location to identify latitude/longitude, using the WorldCities database on Box.

Thirdly, the code uses the latitude/longitude to assign PRIO GRID IDs.
 
Also, this code creates an indicator variable for whether the most specific location mentioned is at the country-level, as indicated in the toy dataset. 

Note: Do NOT add PRIO GRIDS unless there is a location more precise than country level!!
 

In [1]:
import requests
from bs4 import BeautifulSoup
import lxml
from lxml import html
import time
from lxml import etree
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC  
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import NoSuchElementException,TimeoutException
from lxml.html.soupparser import fromstring
import pandas as pd
import numpy as np
import random
import os
import glob
from geopy.geocoders import Nominatim

### 2. Initial Web Scraping & Data Wrangling

In [2]:
# load the downloaded project list and keep one row per project id
id_column = " ProjectID"  # the header in the CSV carries a leading space
raw_df = pd.read_csv("../input/downloadfile.csv")
print(raw_df.columns)
print(raw_df.shape)
raw_df = raw_df.drop_duplicates(subset=[id_column])
print(raw_df.shape)
raw_df = raw_df.astype({id_column: "int32"})

Index(['Publication Date', ' ProjectID', ' Country', ' Title', ' Sector',
       ' Project Type', ' Project Status', ' Link'],
      dtype='object')
(4401, 8)
(4332, 8)


In [3]:
link = []  # absolute URLs that will be scraped
wrong_html_problem_ls = []  # raw links that are set aside instead of scraped


def generate_link(raw_link):
    """Sort one raw link from the dataset into `link` or `wrong_html_problem_ls`.

    Links that already start with "https" are kept unchanged. Links that
    start with "/cs" or "/sites" are recorded in `wrong_html_problem_ls`.
    Every other link gets "https:" prepended before it is stored in `link`.
    Nothing is returned; the two module-level lists are mutated.
    """
    if raw_link.startswith("https"):
        link.append(raw_link)
    elif raw_link.startswith(("/cs", "/sites")):
        wrong_html_problem_ls.append(raw_link)
    else:
        link.append("https:" + raw_link)

In [4]:
# classify every raw link; generate_link fills the two module-level lists
for raw_link in raw_df[" Link"]:
    generate_link(raw_link)
print(len(link))
print(len(wrong_html_problem_ls))

4320
12


In [5]:
# the scraping loop appends the URL of every page it could not process here
problem_html = []

In [6]:
# Scrape every project page in `link` and write one CSV per chunk of pages,
# so the chunks already written survive an interrupted run.
problem_html = []  # pages that raised an error or had no project number
chucksize = 100  # number of pages per output CSV

# column order of every ../output/webscraping_<n>.csv file
output_columns = [
    "Location",
    "Project number",
    "Environmental category",
    "Approval date",
    "PSD disclosed",
    "Project description",
    "Total project cost",
    "Link",
]
# lowercase <legend> label prefix -> output column
legend_columns = {
    "location": "Location",
    "project number": "Project number",
    "environmental category": "Environmental category",
    "approval date": "Approval date",
    "psd disclosed": "PSD disclosed",
}
# lowercase <h2> heading prefix -> output column
heading_columns = {
    "project description": "Project description",
    "project cost": "Total project cost",
    "total project cost": "Total project cost",
}

# derive the number of chunks from the data (ceiling division) instead of
# hard-coding it, so no link is dropped and no empty CSV is written
n_chunks = (len(link) + chucksize - 1) // chucksize

for iteration in range(n_chunks):
    chunk_links = link[iteration * chucksize:(iteration + 1) * chucksize]
    rows = []

    for url in chunk_links:
        # one dict per page: every column gets exactly one value, so the
        # columns cannot drift out of step with each other or with "Link"
        row = dict.fromkeys(output_columns, np.nan)
        row["Link"] = url
        try:
            # a timeout keeps one stalled connection from hanging the run
            req = requests.get(url, timeout=30)
            soup = BeautifulSoup(req.content, "html.parser")

            # summary fields: each <legend> label is followed by its value
            for legend in soup.find_all("legend"):
                legend_title = legend.text.strip().lower()
                for prefix, column in legend_columns.items():
                    if legend_title.startswith(prefix):
                        row[column] = legend.find_next().text.strip()
                        break

            # free-text fields: the element after the matching <h2> heading
            for h2 in soup.find_all("article")[0].find_all("h2"):
                h2_title = h2.text.strip().lower()
                for prefix, column in heading_columns.items():
                    if h2_title.startswith(prefix):
                        row[column] = h2.find_next().text.strip()
                        break
        except Exception as err:
            # `except Exception` (not a bare except) lets Ctrl-C stop the run
            print(url, err)  # print the wrong html
            problem_html.append(url)
            # discard partial results; as before, the URL is stored in the
            # "Project number" column to mark the failed page
            row = dict.fromkeys(output_columns, np.nan)
            row["Project number"] = url
            row["Link"] = url
        else:
            # page parsed but carries no project number: keep the row, flag it
            if pd.isna(row["Project number"]):
                problem_html.append(url)
        rows.append(row)

    df = pd.DataFrame(rows, columns=output_columns)
    store_path = "../output/webscraping_" + str(iteration) + ".csv"
    df.to_csv(store_path)
    # random pause between chunks to avoid hammering the site
    time.sleep(random.randint(4, 16))

In [None]:
https://www.ebrd.com/work-with-us/projects/psd/burgas-water-company.html

In [None]:
# verification with single html: print every legend/value pair and the
# description / cost paragraphs of one known project page

project_id_ls, env_cagetory_ls, approval_date_ls = [], [], []
psd_disclosed_ls, project_des_ls, total_cost_ls = [], [], []
location_ls = []

req = requests.get("https://www.ebrd.com/work-with-us/projects/psd/tredas-financing.html")
soup = BeautifulSoup(req.content, "html.parser")
dom = etree.HTML(str(soup))
env_cagetory_count = approval_date_count = psd_disclosed_count = 0
project_des_count = total_cost_count = 0

legend_ls = soup.find_all("legend")
length_1 = len(legend_ls)

for legend in legend_ls:
    legend_title = legend.text.strip().lower()
    value = legend.find_next().text.strip()
    print(legend_title)
    print(value)

    # store the value in the list that matches the legend label
    if legend_title.startswith("project number"):
        project_id_ls.append(value)
        project_id_count = 1
    elif legend_title.startswith("environmental category"):
        env_cagetory_ls.append(value)
        env_cagetory_count = 1
    elif legend_title.startswith("approval date"):
        approval_date_ls.append(value)
        approval_date_count = 1
    elif legend_title.startswith("psd disclosed"):
        psd_disclosed_ls.append(value)
        psd_disclosed_count = 1


# on this page the section titles are <p> elements, not <h2>
h2_ls = soup.find_all("article")[0].find_all("p")
lenth_2 = len(h2_ls)
for paragraph in h2_ls:
    h2_title = paragraph.text.strip().lower()
    if h2_title.startswith("project description"):
        print(paragraph)
        des = paragraph.find_next_sibling('p').text.strip()
        print("des:", h2_title, ": ", des,"/n")
    elif h2_title.startswith(("project cost", "total project cost")):
        print(h2_title)
        cost = paragraph.find_next_sibling('p').text.strip()
        print("des:", h2_title, ": ", cost,"/n")
    
# d = {"Project number": project_id_ls, 
#      "Environmental category":env_cagetory_ls, 
#      "Approval date": approval_date_ls,
#      "PSD disclosed": psd_disclosed_ls,
#      "Project description": project_des_ls,
#      "Total cost": total_cost_ls
#      }
# df = pd.DataFrame(data = d)
# df.head()

location:
Turkey
project number:
48387
business sector:
Energy
notice type:
Private
environmental category:
B
approval date:
19 Oct 2016
status:
Repaying
psd disclosed:
08 Jun 2016
<p align="LEFT">
<b><font face="Arial" size="5"><font face="Arial" size="5">Project Description</font></font></b></p>
des: project description :  A senior loan to Trakya electricity distribution company (TREDAS), Trakya electricity retail company (TREPAS), and to their parent company, IC Ictas Elektrik (ICEL) (together the co-borrowers). The transaction is part of an approximately USD 685 million dual currency financing package to be used for (i) refinancing existing financing package (acquisition of TREDAS, working capital and existing CAPEX) and (ii) financing new CAPEX for the 2016-2020 tariff period. /n
total project cost
des: total project cost :  USD 684,250,000.00 /n


### 3. Refining Web Scraping results

In [None]:
# gather every per-chunk CSV written by the scraping step
path = r'../output/' # use your path
all_files = glob.glob(os.path.join(path , "*.csv"))
print(all_files)
ls = [pd.read_csv(csv_path, index_col=None, header=0) for csv_path in all_files]
# stack the chunks into one frame and drop the index column saved by to_csv
pre_output_file = pd.concat(ls, axis=0, ignore_index=True)
pre_output_file = pre_output_file.drop(columns=["Unnamed: 0"])
print(pre_output_file.shape)
pre_output_file.head()
pre_output_file.to_csv("../refine_output/pre_output_file.csv")

['../output/webscraping_10.csv', '../output/webscraping_38.csv', '../output/webscraping_39.csv', '../output/webscraping_11.csv', '../output/webscraping_13.csv', '../output/webscraping_12.csv', '../output/webscraping_16.csv', '../output/webscraping_17.csv', '../output/webscraping_29.csv', '../output/webscraping_15.csv', '../output/webscraping_14.csv', '../output/webscraping_28.csv', '../output/webscraping_2.csv', '../output/webscraping_3.csv', '../output/webscraping_1.csv', '../output/webscraping_49.csv', '../output/webscraping_4.csv', '../output/webscraping_5.csv', '../output/webscraping_48.csv', '../output/webscraping_7.csv', '../output/webscraping_6.csv', '../output/webscraping_52.csv', '../output/webscraping_46.csv', '../output/webscraping_47.csv', '../output/webscraping_53.csv', '../output/webscraping_45.csv', '../output/webscraping_51.csv', '../output/webscraping_8.csv', '../output/webscraping_9.csv', '../output/webscraping_50.csv', '../output/webscraping_44.csv', '../output/websc

In [None]:
# select the scraped rows whose project description is missing
missing_description = pre_output_file["Project description"].isna()
no_des_data = pre_output_file[missing_description]
no_des_data.to_csv("../refine_output/no_description.csv")
# their links are scraped again in the next step
no_des_link = no_des_data["Link"].to_list()
print(len(no_des_link))

0
0


[]

In [None]:
# scrape the result again for non-description results
# On these pages the section titles are <p> elements instead of <h2>, and the
# text is taken from the next sibling <p>.
problem_html = []  # pages that raised an error or had no project number

# column order of ../refine_output/refined_df_1.csv ("Location" is not part
# of this file, so it is not collected here)
refined_columns = [
    "Project number",
    "Environmental category",
    "Approval date",
    "PSD disclosed",
    "Project description",
    "Total project cost",
    "Link",
]
# lowercase <legend> label prefix -> output column
refined_legend_columns = {
    "project number": "Project number",
    "environmental category": "Environmental category",
    "approval date": "Approval date",
    "psd disclosed": "PSD disclosed",
}

refined_rows = []
for url in no_des_link:
    # one dict per page: every column gets exactly one value, so the columns
    # cannot drift out of step with each other or with "Link"
    row = dict.fromkeys(refined_columns, np.nan)
    row["Link"] = url
    try:
        # a timeout keeps one stalled connection from hanging the run
        req = requests.get(url, timeout=30)
        soup = BeautifulSoup(req.content, "html.parser")

        for legend in soup.find_all("legend"):
            legend_title = legend.text.strip().lower()
            for prefix, column in refined_legend_columns.items():
                if legend_title.startswith(prefix):
                    row[column] = legend.find_next().text.strip()
                    break

        # change where happens in webs
        for paragraph in soup.find_all("article")[0].find_all("p"):
            p_title = paragraph.text.strip().lower()
            if p_title.startswith("project description"):
                row["Project description"] = paragraph.find_next_sibling("p").text.strip()
            elif p_title.startswith(("project cost", "total project cost")):
                row["Total project cost"] = paragraph.find_next_sibling("p").text.strip()
    except Exception as err:
        # `except Exception` (not a bare except) lets Ctrl-C stop the run
        print(url, err)  # print the wrong html
        problem_html.append(url)
        # discard partial results; as before, the URL is stored in the
        # "Project number" column to mark the failed page
        row = dict.fromkeys(refined_columns, np.nan)
        row["Project number"] = url
        row["Link"] = url
    else:
        # page parsed but carries no project number: keep the row, flag it
        if pd.isna(row["Project number"]):
            problem_html.append(url)
    refined_rows.append(row)

df = pd.DataFrame(refined_rows, columns=refined_columns)
df.to_csv("../refine_output/refined_df_1.csv")
df.head()


In [None]:
# todo: the page layouts vary too much to scrape automatically, so the
# project description and cost in this file were filled in by hand
refine_Data = pd.read_csv("../refine_output/refine_df_2.csv")
refine_Data = refine_Data.drop(columns=["Unnamed: 0"])
refine_Data.head()

In [None]:
# keep only automatically scraped rows that have a description, then stack
# them under the hand-refined rows
pre_output_file = pre_output_file.dropna(subset=["Project description"])
web_scraping_result = pd.concat([refine_Data, pre_output_file])
web_scraping_result = web_scraping_result.astype({"Project number": "int32"})
print(web_scraping_result.shape)
web_scraping_result.columns

In [None]:
# reload the downloaded project list and keep one row per project id
id_column = " ProjectID"  # the header in the CSV carries a leading space
raw_df = pd.read_csv("../input/downloadfile.csv")
print(raw_df.columns)
print(raw_df.shape)
raw_df = raw_df.drop_duplicates(subset=[id_column])
raw_df = raw_df.astype({id_column: "int32"})
print(raw_df.shape)


In [None]:
# right-join the scraped fields onto the raw project list by project id,
# so every raw project is kept even when nothing was scraped for it
merge_df = pd.merge(
    web_scraping_result,
    raw_df,
    how="right",
    left_on="Project number",
    right_on=" ProjectID",
)
merge_df = merge_df.drop(columns=["Link", "Project number"])
merge_df = merge_df.astype({" ProjectID": "int32"})
print(merge_df.shape)
merge_df.head()

In [None]:
# write the merged table to disk
merge_df.to_csv("../refine_output/merge_df.csv")

In [None]:
# todo: the page layouts vary too much to scrape automatically, so the
# location column in this file was filled in by hand
merge_df = pd.read_csv("../refine_output/merge_df_2.csv")
merge_df = merge_df.drop(columns=["Unnamed: 0"])
print(merge_df.shape)

### 4. Check if the location is the country level

In [None]:
# flag rows whose location is exactly the country name (1) or not (0)
location_and_country = merge_df[["Location", " Country"]]
merge_df["country-level"] = location_and_country.apply(
    lambda row: int(row.Location == row[" Country"]), axis=1
)
# distinct location names, sorted, for geocoding
merge_df_ls = np.unique(merge_df["Location"])
merge_df_ls

### 5. Find the lat/long based on country name

In [None]:
# Look up (latitude, longitude) for every distinct location name.
geo_dict = dict()

# one geocoder client is enough for the whole loop
geolocator = Nominatim(user_agent="find")

for i in merge_df_ls:
    loc = geolocator.geocode(i)
    if loc is None:
        # geocode() returns None when nothing matches; print the name so its
        # coordinates can be added by hand afterwards
        print(i)
        lat = np.nan
        long = np.nan
    else:
        lat = loc.latitude
        long = loc.longitude
    geo_dict[i] = (lat, long)
    # the public Nominatim service allows at most one request per second
    time.sleep(1)

In [None]:
# add the geolocation by hand
# "Regional" is given no coordinates (NaN, NaN)
geo_dict["Regional"] = (np.nan, np.nan)
# hard-coded (latitude, longitude) for this name
# NOTE(review): presumably the geocoder could not resolve it — confirm
geo_dict["FYR Macedonia"] = (41.815338, 21.406864)

In [None]:
def find_lat(x):
    """Return the latitude stored in `geo_dict` for location name `x`."""
    latitude, _ = geo_dict[x]
    return latitude

def find_long(x):
    """Return the longitude stored in `geo_dict` for location name `x`."""
    _, longitude = geo_dict[x]
    return longitude

In [None]:
# look up the coordinates of each row's location in geo_dict
locations = merge_df["Location"]
merge_df["latitude"] = locations.apply(find_lat)
merge_df["longitude"] = locations.apply(find_long)
merge_df.head()

In [None]:
# countries of the rows that still have no latitude
merge_df.loc[merge_df["latitude"].isna(), " Country"].to_list()

In [None]:
# write the final table (including the country-level flag and coordinates) to disk
merge_df.to_csv("../refine_output/final_Result.csv")