Mapping GDP data with geographic data

In [1]:
import os
os.chdir("../")
%run config.ipynb
import os.path
from pandas import DataFrame
import pandas as pd

## Utilities for loading economic data

In [2]:
# load rawData
def _toYearLabel(label):
    """Return `label` as an int if it is a string starting with '1' or '2' that parses as an integer; otherwise return it unchanged."""
    if isinstance(label, str) and label.startswith(('1', '2')):
        try:
            return int(label)
        except ValueError:
            # Header only begins with a digit (it is not a plain year): keep it as text.
            return label
    # Non-string labels (e.g. headers already read as numbers) have no
    # .startswith and are passed through untouched.
    return label


def loadRawData(sheetName="Table 1", index='Region name'):
    """
    read one sheet of the workbook at `rgdpPath` into a DataFrame
    ----------
    :param sheetName: sheet to read (always passed to pandas as a string)
    :param index: column of the sheet to use as the row index
    :return: DataFrame whose index labels are a plain list (the index name is
        not kept), with "Yorkshire and The Humber" respelled as
        "Yorkshire and the Humber", and whose year-like string headers are ints

    NOTE(review): `rgdpPath` is not defined in this cell - presumably it comes
    from config.ipynb; confirm.
    """
    # The first row of the sheet is skipped; the next one is used as the header.
    rawData = pd.read_excel(rgdpPath, sheet_name=f"{sheetName}", skiprows=1, header=0, index_col=index)
    # Use a single spelling for this region name.
    rawData.index = [item if item != "Yorkshire and The Humber" else "Yorkshire and the Humber" for item in rawData.index]
    # Year headers become ints so they can be selected with integer ranges.
    rawData.columns = [_toYearLabel(item) for item in rawData.columns]
    return rawData

# realGDP = loadRawData('Table 12')
# len(realGDP.query("ITL == 'ITL3'"))

In [3]:
def getDataByITL(data:DataFrame, ITL="ITL3", startYear=1999, endYear=2022, isOriginalFormat=True):
    """
    select the rows of one ITL level and keep only the year columns in a range
    ----------
    :param data: raw data in wide format; needs an 'ITL' column (plus an
        'ITL code' column when isOriginalFormat is False). Columns labelled
        with integers are treated as year columns.
    :param ITL: value of the 'ITL' column to keep, or "all" to keep every row
    :param startYear: first year to keep (inclusive); None leaves the lower bound open
    :param endYear: last year to keep (inclusive); None leaves the upper bound open
    :param isOriginalFormat: return data format
        True: original (wide) data format
        False: transformed (long) data format with the columns
            'Region Name', 'ITL', 'ITL code', 'year', 'value'
    :return: a new DataFrame; `data` itself is not modified
    """
    import numbers  # local import so the function does not depend on another cell

    # filter ITL level (both branches copy, so the caller's frame stays untouched)
    if ITL == "all":
        subData = data.copy(deep=True)
    else:
        subData = data.query(f"ITL == '{ITL}'").copy()

    # filter year
    # The year columns are taken from the data itself rather than from a fixed
    # 1999-2022 window, so a missing year no longer raises a KeyError and years
    # outside that window are filtered like any other.
    yearColumns = [
        col for col in subData.columns
        if isinstance(col, numbers.Integral) and not isinstance(col, bool)
    ]
    keptYears = sorted(
        year for year in yearColumns
        if (startYear is None or year >= startYear)
        and (endYear is None or year <= endYear)
    )
    keptYearSet = set(keptYears)
    subData = subData.drop(columns=[year for year in yearColumns if year not in keptYearSet])

    # transform data format
    if not isOriginalFormat:
        # Naming the index first makes the region column come out as
        # 'Region Name' whether or not the index already had a name.
        subData = subData.rename_axis('Region Name').reset_index()

        subData = subData.melt(
            id_vars=['Region Name', 'ITL', 'ITL code'],
            value_vars=keptYears,
            var_name='year',
            value_name='value'
        )
    print("getDataByITL shape:", subData.shape)
    return subData

# gdp_df = loadRawData(sheetName='Table 12')
# gdp_ITL3 = getDataByITL(data=gdp_df, ITL='ITL3', startYear=1999, endYear=2022, isOriginalFormat=False)
# gdp_ITL3.head()

## Loading LAD (Local Authority District) codes

* loading LAD data

In [4]:
# Read the lookup file at ladPath, keeping only the four code/name columns used below.
lads_df = pd.read_csv(
    ladPath,
    usecols=['ITL321CD', 'ITL321NM', 'LAD23CD', 'LAD23NM'],
)
print(lads_df.shape)
lads_df.head()

(388, 4)


Unnamed: 0,LAD23CD,LAD23NM,ITL321CD,ITL321NM
0,E06000001,Hartlepool,TLC11,Hartlepool and Stockton-on-Tees
1,E06000004,Stockton-on-Tees,TLC11,Hartlepool and Stockton-on-Tees
2,E06000002,Middlesbrough,TLC12,South Teesside
3,E06000003,Redcar and Cleveland,TLC12,South Teesside
4,E06000005,Darlington,TLC13,Darlington


In [5]:
# Distinct LAD codes, then distinct ITL3 codes (missing values count as a value).
print(lads_df['LAD23CD'].nunique(dropna=False))
print(lads_df['ITL321CD'].nunique(dropna=False))

361
179


- check duplicated data

In [6]:
# Number of rows whose code occurs more than once (every occurrence is counted).
print(lads_df.duplicated(subset=['LAD23CD'], keep=False).sum())
print(lads_df.duplicated(subset=['ITL321CD'], keep=False).sum())

37
297


In [7]:
# Drop duplicated data: remove every row whose LAD code is repeated, then every
# remaining row whose ITL3 code is repeated.
# NOTE(review): the ITL321CD check only sees rows that survived the LAD23CD
# check, so an ITL3 area whose other LADs were just removed is treated as
# one-to-one here - confirm this is intended.
lads_df = (
    lads_df
    .drop_duplicates(subset='LAD23CD', keep=False)
    .drop_duplicates(subset='ITL321CD', keep=False)
)
print(lads_df.shape)
# Contains lads that both LAD and ITL can be unique matched
lads_df.to_csv(os.path.join(root, 'geographic_data_cleaned_unique.csv'), index=False)

(92, 4)


## Mapping GDP data with geographic data

In [8]:
# ITL3 rows of sheet 'Table 9', years 1999-2022, reshaped to one row per region and year.
gdp_ITL3 = getDataByITL(
    data=loadRawData(sheetName='Table 9'),
    ITL='ITL3',
    startYear=1999,
    endYear=2022,
    isOriginalFormat=False,
)

getDataByITL shape: (4296, 5)


In [9]:
# One row per ITL3 code that is left in the cleaned lookup table.
lads_list = lads_df.groupby('ITL321CD').count()
# Distinct ITL codes that occur in the economic data.
ITL3_lads = gdp_ITL3['ITL code'].unique()
print(f"number of ITL code in economic data: {len(ITL3_lads)}\nnumber of ITL code in LAD mapping data: {len(lads_list)}")
# Flag the economic-data codes that also appear in the lookup table.
ifExist = pd.Series(ITL3_lads).isin(lads_list.index)
existItems = ITL3_lads[ifExist]
print(f'number of ITL code in economic data that can be mapped to LAD: {ifExist.sum()}')

number of ITL code in economic data: 179
number of ITL code in LAD mapping data: 92
number of ITL code in economic data that can be mapped to LAD: 92


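So all 92 ITL3 areas that remain in the cleaned lookup table (out of the 179 in the economic data) are matched to exactly one LAD code, and every one of them appears in the economic data. We focus on those areas.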

In [10]:
# map LAD to economic data
# Inner join: only ITL codes present in both tables are kept.
# validate='many_to_one' makes pandas raise a MergeError if an ITL3 code occurs
# more than once in lads_df, instead of silently multiplying the GDP rows.
gdp_ITL3_LDA = pd.merge(
    gdp_ITL3,
    lads_df,
    left_on='ITL code',
    right_on='ITL321CD',
    how='inner',
    validate='many_to_one',
)
print(gdp_ITL3_LDA.shape)
gdp_ITL3_LDA.head()

(2208, 9)


Unnamed: 0,Region Name,ITL,ITL code,year,value,LAD23CD,LAD23NM,ITL321CD,ITL321NM
0,Darlington,ITL3,TLC13,1999,161.8,E06000005,Darlington,TLC13,Darlington
1,Darlington,ITL3,TLC13,2000,151.4,E06000005,Darlington,TLC13,Darlington
2,Darlington,ITL3,TLC13,2001,144.6,E06000005,Darlington,TLC13,Darlington
3,Darlington,ITL3,TLC13,2002,141.1,E06000005,Darlington,TLC13,Darlington
4,Darlington,ITL3,TLC13,2003,136.6,E06000005,Darlington,TLC13,Darlington


In [11]:
# Per column: does the merged table contain any missing value?
gdp_ITL3_LDA.isnull().any()

Region Name    False
ITL            False
ITL code       False
year           False
value          False
LAD23CD        False
LAD23NM        False
ITL321CD       False
ITL321NM       False
dtype: bool

In [12]:
# Rows that repeat an earlier (LAD23CD, year) pair; an empty result means none.
gdp_ITL3_LDA.loc[gdp_ITL3_LDA.duplicated(subset=['LAD23CD', 'year'])]

Unnamed: 0,Region Name,ITL,ITL code,year,value,LAD23CD,LAD23NM,ITL321CD,ITL321NM


In [13]:
# Write the merged ITL3 GDP data to CSV under root, without the row index.
gdp_ITL3_LDA.to_csv(
    os.path.join(root, 'GDP_ITL3_LDA(raw).csv'),
    index=False,
)