In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import eurostat

url=f"https://appsso.eurostat.ec.europa.eu/nui/print.do"

In [2]:
df = eurostat.get_data_df('edat_lfse_04')
df.columns = df.columns.astype(str)
# Drop all years before 2012 and columns unit and age (same for all entries)
df = df.drop(df.loc[:, '2011': '2000'].columns, axis = 1)
df = df.drop(['unit', 'age'], axis = 1)

# Keep only total values and discard the gender column
df = df[df.sex == 'T']
df = df.drop('sex', axis = 1)



In [3]:
# Rename to avoid problems using \
df = df.rename(columns={'geo\\time': ' NUTS 2'})

In [4]:
# Merge on all entries which are also in the target variable cities to extract only the interesting cities
target_cities = pd.read_csv("Cities_with_codes.csv")
education_attainment = pd.merge(target_cities, df, on=[' NUTS 2'])
education_attainment

Unnamed: 0,City,City Code,NUTS 2,Country,isced11,2019,2018,2017,2016,2015,2014,2013,2012
0,Amsterdam,NL002C1,NL32,NL,ED0-2,17.9,18.1,18.4,19.8,20.2,21.2,21.0,23.4
1,Amsterdam,NL002C1,NL32,NL,ED3-8,82.1,81.9,81.6,80.2,79.8,78.8,79.0,76.6
2,Amsterdam,NL002C1,NL32,NL,ED3_4,33.7,36.6,37.2,37.3,38.2,38.0,39.8,38.0
3,Amsterdam,NL002C1,NL32,NL,ED5-8,48.5,45.4,44.4,42.9,41.6,40.8,39.2,38.5
4,Ankara,TR001C1,TR51,TR,ED0-2,46.2,47.9,49.4,49.6,49.9,50.4,51.4,52.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,Zagreb,HR001C1,HR04,HR,ED5-8,24.3,24.8,23.6,23.1,21.8,20.8,19.2,17.6
316,Zurich,CH001C1,CH04,CH,ED0-2,8.9,9.4,9.5,10.1,11.2,11.3,11.3,11.7
317,Zurich,CH001C1,CH04,CH,ED3-8,91.1,90.6,90.5,89.9,88.8,88.7,88.7,88.3
318,Zurich,CH001C1,CH04,CH,ED3_4,36.8,38.2,38.8,39.2,40.3,42.9,43.6,45.8


In [5]:
# Check for missing values
print('Missing values for columns:')
def NaN_percent(df, column_name):
    row_count = df[column_name].shape[0]
    empty_values = row_count - df[column_name].count()
    return (100.0*empty_values)/row_count
for i in list(education_attainment):
    print("%s: %.2f%%" % (i, NaN_percent(education_attainment,i)))


Missing values for columns:
City: 0.00%
 City Code: 0.00%
 NUTS 2: 0.00%
 Country: 0.00%
isced11: 0.00%
2019: 1.25%
2018: 1.25%
2017: 1.25%
2016: 1.25%
2015: 1.25%
2014: 1.25%
2013: 1.25%
2012: 7.50%


In [6]:
has_nan = education_attainment[education_attainment.isna().any(axis=1)]
has_nan

Unnamed: 0,City,City Code,NUTS 2,Country,isced11,2019,2018,2017,2016,2015,2014,2013,2012
28,Belgrade,-,RS11,RS,ED0-2,7.8,7.8,8.8,9.4,9.6,10.5,11.5,
29,Belgrade,-,RS11,RS,ED3-8,92.2,92.2,91.2,90.6,90.4,89.5,88.5,
30,Belgrade,-,RS11,RS,ED3_4,53.8,54.9,55.0,56.7,56.1,56.2,57.1,
31,Belgrade,-,RS11,RS,ED5-8,38.4,37.4,36.2,33.9,34.3,33.3,31.4,
72,Budapest,HU001C1,HU11,HU,ED0-2,6.9,7.1,7.8,8.4,7.9,8.3,8.0,
73,Budapest,HU001C1,HU11,HU,ED3-8,93.1,92.9,92.2,91.6,92.1,91.7,92.0,
74,Budapest,HU001C1,HU11,HU,ED3_4,45.6,46.4,50.4,49.8,47.4,49.1,51.7,
75,Budapest,HU001C1,HU11,HU,ED5-8,47.5,46.5,41.7,41.9,44.7,42.5,40.4,
112,Edinburgh,UK007C1,UKM7,UK,ED0-2,17.2,17.1,16.5,16.6,16.7,16.8,17.4,
113,Edinburgh,UK007C1,UKM7,UK,ED3-8,82.8,82.9,83.5,83.4,83.3,83.2,82.6,


In [7]:
Glasgow = education_attainment.loc[education_attainment['City'] == 'Glasgow']
# Whole region of Scotland is UKM
# Glasgow is 40% of the population it should be a reasonable approximation
temp  = df[df[' NUTS 2'] == 'UKM']
temp = temp.loc[:,'2019':'2013']
Glasgow.loc[:, '2019':'2013'] = temp.loc[:, '2019':'2013'].to_numpy()
education_attainment[education_attainment['City'] == 'Glasgow'] = Glasgow

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


In [8]:
# Impute the rest using padding
education_attainment = education_attainment.interpolate(method='pad')
print('Dataframe has NaN values: ', education_attainment.isnull().values.any())


Dataframe has NaN values:  False


In [9]:
# Split in to 3 separate tables depending on education level
edu_ED_0_2 = education_attainment[education_attainment['isced11'] == 'ED0-2']
edu_ED_3_4 = education_attainment[education_attainment['isced11'] == 'ED3_4']
edu_ED_5_8 = education_attainment[education_attainment['isced11'] == 'ED5-8']