In [135]:
import json

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import urllib.request
import requests

from urllib import parse

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 180)

In [141]:
all_life = pd.read_csv('input/LIFE-projects.csv')
all_life.shape

(4918, 12)

In [142]:
all_life.columns

Index(['Project_webSummary', ' Project Title', ' Project N�',
       ' Project Website', ' Year Of Finance', ' Lead Partner Country',
       ' Type Of Beneficiary', ' Country', ' Themes', ' Keywords', ' Habitats',
       ' Species'],
      dtype='object')

In [143]:
all_life.rename(columns={ 
    all_life.columns[0]: "project_websummary",
    all_life.columns[1]: "project_title",
    all_life.columns[2]: "project_no",
    all_life.columns[3]: "project_website",
    all_life.columns[4]: "year_of_finance",
    all_life.columns[5]: "lead_partner_country",
    all_life.columns[6]: "type_of_beneficiary",
    all_life.columns[7]: "country",
    all_life.columns[8]: "themes",
    all_life.columns[9]: "keywords",
    all_life.columns[10]: "habitats",
    all_life.columns[11]: "species"
}, inplace=True)

In [144]:
all_life['base_url'] = 'http://ec.europa.eu/environment/life/project/Projects/'

In [145]:
all_life['project_title'].nunique()

4874

In [146]:
all_life.head(2)

Unnamed: 0,project_websummary,project_title,project_no,project_website,year_of_finance,lead_partner_country,type_of_beneficiary,country,themes,keywords,habitats,species,base_url
0,index.cfm?fuseaction=search.dspPage&n_proj_id=6698,LIFE DICCA - Climate Change Adaption of the Ecosystem Danube Island,LIFE17 CCA/AT/000077,,2017,AT,Local authority,Österreich,,,,,http://ec.europa.eu/environment/life/project/Projects/
1,index.cfm?fuseaction=search.dspPage&n_proj_id=6699,LIFE UrbanStorm - Development of sustainable and climate resilient urban storm water management systems for Nordic municipalities,LIFE17 CCA/EE/000122,,2017,EE,Local authority,Estonia Eesti,,,,,http://ec.europa.eu/environment/life/project/Projects/


In [147]:
all_life.tail(2)

Unnamed: 0,project_websummary,project_title,project_no,project_website,year_of_finance,lead_partner_country,type_of_beneficiary,country,themes,keywords,habitats,species,base_url
4916,index.cfm?fuseaction=search.dspPage&n_proj_id=1271,XXX,LIFE91 ENV/D/NBL/5/5,,1991,DE,,Deutschland,,,,,http://ec.europa.eu/environment/life/project/Projects/
4917,index.cfm?fuseaction=search.dspPage&n_proj_id=994,Pre-feasibility study of the ODER/ODRA river basin,LIFE91 ENV/D/ODER,,1991,DE,,Deutschland,,,,,http://ec.europa.eu/environment/life/project/Projects/


In [148]:
all_life['project_url'] = all_life['base_url'] + all_life['project_websummary']

In [149]:
def get_param_from_url(url, param_name):
    return [i.split("=")[-1] for i in url.split("?", 1)[-1].split("&") if i.startswith(param_name + "=")][0]

In [150]:
all_life['project_id'] = all_life.apply(lambda row: get_param_from_url(row['project_url'], 'n_proj_id'), axis=1)

In [152]:
all_life.head(2)

Unnamed: 0,project_websummary,project_title,project_no,project_website,year_of_finance,lead_partner_country,type_of_beneficiary,country,themes,keywords,habitats,species,base_url,project_url,project_id
0,index.cfm?fuseaction=search.dspPage&n_proj_id=6698,LIFE DICCA - Climate Change Adaption of the Ecosystem Danube Island,LIFE17 CCA/AT/000077,,2017,AT,Local authority,Österreich,,,,,http://ec.europa.eu/environment/life/project/Projects/,http://ec.europa.eu/environment/life/project/Projects/index.cfm?fuseaction=search.dspPage&n_proj_id=6698,6698
1,index.cfm?fuseaction=search.dspPage&n_proj_id=6699,LIFE UrbanStorm - Development of sustainable and climate resilient urban storm water management systems for Nordic municipalities,LIFE17 CCA/EE/000122,,2017,EE,Local authority,Estonia Eesti,,,,,http://ec.europa.eu/environment/life/project/Projects/,http://ec.europa.eu/environment/life/project/Projects/index.cfm?fuseaction=search.dspPage&n_proj_id=6699,6699


Now need to examine web page to see if we attempt geolocation using Natura 2000 dataset. Project id 6699 has none, 6812 has a few

In [153]:
all_life['project_url'].values[1]

'http://ec.europa.eu/environment/life/project/Projects/index.cfm?fuseaction=search.dspPage&n_proj_id=6699'

In [154]:
check_url = all_life['project_url'].values[4]
check_url 

'http://ec.europa.eu/environment/life/project/Projects/index.cfm?fuseaction=search.dspPage&n_proj_id=6710'

In [156]:
raw_contents = urllib.request.urlopen('http://ec.europa.eu/environment/life/project/Projects/index.cfm?fuseaction=search.dspPage&n_proj_id=6812')

In [157]:
charset=raw_contents.info().get_content_charset()
contents=raw_contents.read().decode(charset)

In [158]:
soup = BeautifulSoup(contents, 'html5lib')

In [159]:
# soup.prettify()

In [160]:
natura_span=soup.find('span',string='Natura 2000 sites')
natura_span

<span class="txtheadergreen">Natura 2000 sites</span>

In [161]:
natura_table1=natura_span.findNext('table')
natura_table1

<table border="0" cellpadding="0" cellspacing="0" width="100%">
									
											
												<tbody><tr><td valign="top">SPA</td>
												<td valign="top">UK9010101</td>
												<td valign="top">Dorset Heathlands</td>
											</tr>
										
											
												<tr><td valign="top">SCI</td>
												<td valign="top">UK0012559</td>
												<td valign="top">Penhale Dunes</td>
											</tr>
										
											
												<tr><td valign="top">SCI</td>
												<td valign="top">UK0012570</td>
												<td valign="top">Braunton Burrows</td>
											</tr>
										
											
												<tr><td valign="top">SCI</td>
												<td valign="top">UK0013025</td>
												<td valign="top">Solway Firth</td>
											</tr>
										
											
												<tr><td valign="top">SCI</td>
												<td valign="top">UK0013027</td>
												<td valign="top">Morecambe Bay</td>
											</tr>
										
											
												<tr><td valign="top

If the first table has 3 elements then it appears to be single row col containing 'Not applicable'

In [162]:
table_rows = natura_table1.find_all('tr')

res = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td if tr.text.strip()]
    if row:
        res.append(row)

In [164]:
natura_project = pd.DataFrame(res, columns=["area_type", "area_code", "area_name"])
natura_project.head(6)

Unnamed: 0,area_type,area_code,area_name
0,SPA,UK9010101,Dorset Heathlands
1,SCI,UK0012559,Penhale Dunes
2,SCI,UK0012570,Braunton Burrows
3,SCI,UK0013025,Solway Firth
4,SCI,UK0013027,Morecambe Bay
5,SCI,UK0013031,Drigg Coast


TODO: We will need to split the budget across this many areas

Find the Long and Lat for this Natura area

In [230]:
def findLatAndLong(natura_area_code):
    # http://ec.europa.eu/environment/life/project/Projects/index.cfm
    all_natura_sites = pd.read_csv('input/NATURA2000SITES.csv')
    uk_natura_sites = all_natura_sites[all_natura_sites.COUNTRY_CODE=='uk'].copy()
    # Stip out spurious columns
    uk_natura_sites_reduced = uk_natura_sites[['SITECODE', 'SITENAME', 'LONGITUDE','LATITUDE']].copy()
    uk_natura_sites_reduced.rename(columns={ 
        uk_natura_sites_reduced.columns[0]: "site_code",
        uk_natura_sites_reduced.columns[1]: "site_name",
        uk_natura_sites_reduced.columns[2]: "longitude",
        uk_natura_sites_reduced.columns[3]: "latitude"
        }, inplace=True)
    area_code_row=uk_natura_sites_reduced[uk_natura_sites_reduced.site_code==natura_area_code].copy()
    longitude = area_code_row['longitude'].values[0]
    latitude = area_code_row['latitude'].values[0]
    return {latitude, longitude}

In [231]:
findLatAndLong('UK0013027')

{-2.961667, 54.119167000000004}

In [249]:
def findNearestPostcode(lat, long):
    url = 'http://api.postcodes.io/postcodes'
    data = '''{
        "geolocations": [{
        "longitude": ''' + str(long) + ''',
        "latitude": ''' + str(lat) + ''',
        "radius": 10000,
        "limit": 1
        }]
    }'''
    dump=json.dumps(data, indent=4, sort_keys=True)
    print(dump)
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
    response = requests.post(url, data=data, headers=headers)
    postcode = response.json()['result'][0]['result'][0]['postcode']
    return postcode

In [251]:
#findNearestPostcode(57.67,-3.116667)
findNearestPostcode(54.119167000000004,-2.961667)

"{\n        \"geolocations\": [{\n        \"longitude\": -2.961667,\n        \"latitude\": 54.119167000000004,\n        \"radius\": 10000,\n        \"limit\": 1\n        }]\n    }"


TypeError: 'NoneType' object is not subscriptable