In [1]:
import pandas as pd  # Importing pandas for data manipulation and analysis
import numpy as np  # Importing NumPy for numerical operations and array handling
from datetime import datetime  # Importing datetime module for working with dates and times
import os  # Importing the os module to interact with the operating system
import math  # Importing the math module to access mathematical functions
import logging  # Importing logging module for event tracking and debugging
import warnings  # Importing warnings module to control warning messages

# To Suppress all warnings
# NOTE(review): with no category or module argument this hides every warning
# raised for the rest of the kernel session, including deprecation and dtype
# warnings from pandas readers - consider narrowing it to specific categories.
warnings.filterwarnings("ignore")


In [2]:

notebook_name = 'Data Extraction'

# Paths for the log directories
info_log_path = f'../Logs/info/{notebook_name}_info.log'

# Creating directories if they don't exist
os.makedirs(os.path.dirname(info_log_path), exist_ok=True)

# Clearing any previous handlers on the root logger if re-running this setup
logger = logging.getLogger()
while logger.handlers:
    logger.handlers.pop()

# Configuring logging
info_logger = logging.getLogger('info_logger')

# The file handler is attached to 'info_logger', not to the root logger, so the
# root clean-up above never removes it. Without this loop every re-run of the
# cell stacks one more FileHandler and each message is written several times.
for stale_handler in list(info_logger.handlers):
    info_logger.removeHandler(stale_handler)
    stale_handler.close()  # release the file handle opened by the previous run

info_handler = logging.FileHandler(info_log_path, mode='a')  # Append mode
info_handler.setLevel(logging.INFO)

# Timestamp - level - message layout for every record written to the file
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
info_handler.setFormatter(formatter)

# Adding the handler to the logger
info_logger.addHandler(info_handler)
info_logger.setLevel(logging.INFO)


In [3]:

# Setting the base directory for input files
BASE_DIR = '../Data/Input/'

def read_and_filter_file(file_path, sheet_name=None, skiprows=None, header='infer', usecols=None, nrows=None, filter_col=None, filter_value=None, exclude_row=None, rename_columns=None):
    """Load one input file from BASE_DIR into a DataFrame and apply optional clean-up.

    Steps, in order: read the file, add an 'extraction_timestamp' column, rename
    columns, drop one row by position, then keep only rows whose filter column
    contains ``filter_value``.

    Parameters
    ----------
    file_path : str
        File name relative to BASE_DIR. The extension selects the reader:
        .csv, .xlsx, .xls (xlrd engine) or .ods (odf engine), case-insensitively.
    sheet_name : str or int, optional
        Sheet to read for spreadsheet files; ignored for CSV. ``None`` reads the
        first sheet (passing ``None`` straight to ``read_excel`` would return a
        dict of every sheet instead of a DataFrame).
    skiprows, usecols, nrows :
        Forwarded unchanged to the pandas reader.
    header : int, list of int, None or 'infer'
        Forwarded to the pandas reader. 'infer' is only understood by
        ``read_csv``; for spreadsheets it is translated to ``0``.
    filter_col : str or int, optional
        Column label, or integer column position, to filter on.
    filter_value : str, optional
        Pattern passed to ``Series.str.contains``; the filter is applied only
        when both ``filter_col`` and ``filter_value`` are given.
    exclude_row : int, optional
        Row to drop; the dropped position in the loaded frame is
        ``exclude_row - skiprows - 1`` (``skiprows`` of ``None`` counts as 0).
    rename_columns : dict, optional
        Mapping passed to ``DataFrame.rename(columns=...)``.

    Returns
    -------
    pandas.DataFrame or None
        The processed frame, or ``None`` if anything failed. Failures are
        printed and written to the log, not raised.
    """
    try:
        # Combining base directory with file path
        file_path = os.path.join(BASE_DIR, file_path)

        # `is not None` so that sheet index 0 is reported as "0", not "N/A"
        sheet_label = sheet_name if sheet_name is not None else 'N/A'
        print(f"\nReading file: {file_path}, sheet: {sheet_label}")
        info_logger.info(f"\nReading file: {file_path}")

        # Spreadsheet extension -> pandas engine (None lets pandas pick its default)
        excel_engines = {'.xlsx': None, '.xls': 'xlrd', '.ods': 'odf'}
        extension = os.path.splitext(file_path)[1].lower()

        if extension == '.csv':
            df = pd.read_csv(file_path, skiprows=skiprows, header=header, usecols=usecols, nrows=nrows)
        elif extension in excel_engines:
            df = pd.read_excel(
                file_path,
                sheet_name=0 if sheet_name is None else sheet_name,
                skiprows=skiprows,
                header=0 if header == 'infer' else header,
                usecols=usecols,
                nrows=nrows,
                engine=excel_engines[extension],
            )
        else:
            raise ValueError("Unsupported file format")

        # Adding the extraction timestamp
        df['extraction_timestamp'] = datetime.now()

        # Renaming columns
        if rename_columns:
            df.rename(columns=rename_columns, inplace=True)

        # Dropping the specific row to exclude
        if exclude_row is not None:
            index_to_exclude = exclude_row - (skiprows or 0) - 1
            # A negative position would silently drop a row counted from the
            # end of the frame, so reject anything outside the loaded rows.
            if not 0 <= index_to_exclude < len(df):
                raise ValueError(
                    f"exclude_row={exclude_row} is outside the {len(df)} rows that were read"
                )
            df = df.drop(df.index[index_to_exclude])

        # Applying specified filter
        if filter_col is not None and filter_value is not None:
            # An integer filter_col is a column position, anything else a label
            filter_column_name = df.columns[filter_col] if isinstance(filter_col, int) else filter_col
            df = df[df[filter_column_name].astype(str).str.contains(filter_value, na=False)]

        return df
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        # Keep a trace of the failure in the log file as well as on screen
        info_logger.error(f"Error reading file {file_path}: {e}")
        return None

In [4]:
## Reading all input files from different formats
# Every entry carries the same keys; only values that differ from these
# defaults are spelled out per file below.
_CONFIG_DEFAULTS = {
    'sheet_name': None,
    'skiprows': 0,
    'header': 0,
    'usecols': None,
    'nrows': None,
    'filter_col': None,
    'filter_value': None,
    'exclude_row': None,
    'rename_columns': None,
}


def _file_config(file_path, **overrides):
    """Return one reader configuration: the shared defaults plus the overrides."""
    return {'file_path': file_path, **_CONFIG_DEFAULTS, **overrides}


_POPULATION_FILE = 'mye22tablesew2023geogsv2-UK_population.xlsx'
_AREA_PROFILE_FILE = 'datadownload.xlsx'
_EMPLOYMENT_FILE = 'regionaltable1s_employment_count_area.xlsx'

# One sheet per English region, all read with identical settings
_EMPLOYMENT_SHEETS = ['neast_p', 'nwest_p', 'ykhu_p', 'emids_p', 'wmids_p',
                      'east_p', 'lon_p', 'seast_p', 'swest_p']

# The order of this list matters: later cells pick frames by position.
file_configs = [
    # The price-paid CSV has no header row, so column names are supplied here
    _file_config(
        'pp-2023.csv',
        header=None,
        column_names=[
            'Transaction unique identifier', 'Price', 'Date of Transfer', 'Postcode',
            'Property Type', 'Old/New', 'Duration', 'PAON', 'SAON', 'Street',
            'Locality', 'Town/City', 'District', 'County', 'PPD Category Type',
            'Record Status', 'extraction_timestamp',
        ],
    ),
    _file_config('UK-HPI-full-file-2023-12.csv', filter_col='Date', filter_value='2023'),
    _file_config('datadownload_rent_region.xlsx', sheet_name=0, skiprows=7,
                 filter_col='Time period', filter_value='23'),
    _file_config('datadownload_rent_bedroom.xlsx', sheet_name=0, skiprows=7),
    _file_config(_POPULATION_FILE, sheet_name='MYE2 - Persons', skiprows=7),
    _file_config(_POPULATION_FILE, sheet_name='MYE2 - Females', skiprows=7,
                 usecols=['Code', 'Name', 'Geography', 'All ages']),
    _file_config(_POPULATION_FILE, sheet_name='MYE2 - Males', skiprows=7,
                 usecols=['Code', 'Name', 'Geography', 'All ages']),
    _file_config(_POPULATION_FILE, sheet_name='MYE5', skiprows=7,
                 usecols=['Code', 'Name', 'Geography', 'Area (sq km)']),
    _file_config('datadownload_percent.xlsx', sheet_name=0, skiprows=5,
                 usecols=['LA code', 'LA name', 'Percentage change'], nrows=332),
    _file_config('TS006-2021-4_population_density.xlsx', sheet_name='Dataset'),
    _file_config('datadownload_population_ethinicty.xlsx', sheet_name=0, skiprows=4,
                 usecols='A:U'),
    _file_config('datadownload_population_qualification_rowwise.xlsx', sheet_name=0,
                 skiprows=11, usecols='A,B,C,D,F,G,H,I,J,K,L'),
    _file_config('TS067-2021-3_population_qualification_colwise.xlsx', sheet_name='Dataset'),
    _file_config('TS068-2021-3_population_student_not.xlsx', sheet_name='Dataset'),
    # NOTE(review): 'filter_col' holds an Excel column-letter list while
    # 'filter_value' is None, so no filter is applied and every column is read.
    # This looks like it was meant to be 'usecols' - confirm before changing.
    _file_config(_AREA_PROFILE_FILE, sheet_name='Ethnicity', skiprows=5,
                 filter_col='A,B,C,D,E,G,I,K,M,O,Q', exclude_row=6),
    _file_config(_AREA_PROFILE_FILE, sheet_name='Unemployment & Employment', skiprows=4,
                 usecols='A,B,C,D,E,J,M'),
    _file_config(_AREA_PROFILE_FILE, sheet_name='Deprivation', skiprows=5),
    _file_config(_AREA_PROFILE_FILE, sheet_name='Households with children', skiprows=6,
                 usecols='A,B,C,D,E'),
    *[
        _file_config(
            _EMPLOYMENT_FILE,
            sheet_name=region_sheet,
            skiprows=7,
            usecols=[0, 3, 4],
            nrows=393,
            filter_col='Year',
            filter_value='2023',
            exclude_row=9,
            rename_columns={'Unnamed: 0': 'Year'},
        )
        for region_sheet in _EMPLOYMENT_SHEETS
    ],
    _file_config('datadownload_GDHI.xlsx', sheet_name=0, skiprows=2, usecols=[0, 1, 26]),
    _file_config('3a88ffc6_inflation_uk_1.csv', sheet_name=0, skiprows=3,
                 filter_col='Year', filter_value='2023',
                 rename_columns={'Unnamed: 0': 'Year', 'Unnamed: 1': 'Month'}),
    _file_config('data-school-pupils-and-their-characteristics.csv', sheet_name=0),
    _file_config('Sub-National_Road_Transport_Fuel_Consumption_Tables_2005-2022.xlsx',
                 sheet_name='2022', skiprows=3,
                 usecols='A,B,C,G,K,O,T,W,AA,AE,AI,AJ,AK,AL'),
    _file_config('region_traffic_by_vehicle_type_1993_2023.csv',
                 filter_col='year', filter_value='2023'),
    _file_config('local_authority_traffic_1993_2023.csv',
                 filter_col='year', filter_value='2023'),
    _file_config('bus01_travel_count_2023.ods', sheet_name='BUS01e', skiprows=8,
                 usecols=[0, 1, 2],
                 rename_columns={0: 'Local Authority (LA) Code', 1: 'LA/ Region', 2: '2023'}),
    _file_config(
        'rai0201_peak_hours_count.ods',
        sheet_name='RAI0201',
        skiprows=4,
        usecols=[
            'Year',
            'City or London by station [note 9]',
            'AM peak arrivals (07:00 to 09:59), number of services',
            'AM peak arrivals (07:00 to 09:59), total seats [note 2]',
            'AM peak arrivals (07:00 to 09:59), passengers [note 2]',
            'All day arrivals, number of services',
            'All day arrivals, total seats [note 2]',
            'All day arrivals, passengers [note 2]',
            'PM peak departures (16:00 to 18:59), number of services',
            'PM peak departures (16:00 to 18:59), total seats [note 2]',
            'PM peak departures (16:00 to 18:59) passengers [note 2]',
            'All day departures, number of services',
            'All day departures, total seats [note 2]',
            'All day departures, passengers [note 2]',
        ],
        filter_col='Year',
        filter_value='2022',
    ),
    _file_config('hospital-records_2023.xls', sheet_name='Hospital - Union Catalogue',
                 usecols='A:J'),
    _file_config('data-education-and-training-statistics-for-the-uk_2023.csv',
                 skiprows=None),
    _file_config('subnationalestimatesofdwellingstockbytenure.xls', sheet_name='Table 2a',
                 skiprows=5,
                 usecols=['Region code', 'Region name', 'Local authority code ',
                          'Local authority name'],
                 nrows=326),
]

# Reading all files into separate dataframes and storing them in a list.
# The list must stay position-aligned with file_configs because later cells
# look frames up by index, so exactly one entry is appended per configuration:
# the loaded frame, or None when loading failed.
dataframes = []
for config in file_configs:
    df = None
    try:
        df = read_and_filter_file(
            file_path=config['file_path'],
            sheet_name=config.get('sheet_name'),
            skiprows=config.get('skiprows'),
            header=config.get('header'),
            usecols=config.get('usecols'),
            nrows=config.get('nrows'),
            filter_col=config.get('filter_col'),
            filter_value=config.get('filter_value'),
            exclude_row=config.get('exclude_row'),
            rename_columns=config.get('rename_columns')
        )
        # Header-less files get their column names from the configuration
        if df is not None and 'column_names' in config:
            df.columns = config['column_names']
    except Exception as e:
        print(f"Error reading file {config['file_path']}: {e}")
        # Do not keep a half-processed frame (e.g. one whose columns could not
        # be named); record the failure as None in this file's slot instead.
        df = None
    dataframes.append(df)


Reading file: ../Data/Input/pp-2023.csv, sheet: N/A



Reading file: ../Data/Input/UK-HPI-full-file-2023-12.csv, sheet: N/A



Reading file: ../Data/Input/datadownload_rent_region.xlsx, sheet: N/A



Reading file: ../Data/Input/datadownload_rent_bedroom.xlsx, sheet: N/A

Reading file: ../Data/Input/mye22tablesew2023geogsv2-UK_population.xlsx, sheet: MYE2 - Persons



Reading file: ../Data/Input/mye22tablesew2023geogsv2-UK_population.xlsx, sheet: MYE2 - Females



Reading file: ../Data/Input/mye22tablesew2023geogsv2-UK_population.xlsx, sheet: MYE2 - Males



Reading file: ../Data/Input/mye22tablesew2023geogsv2-UK_population.xlsx, sheet: MYE5



Reading file: ../Data/Input/datadownload_percent.xlsx, sheet: N/A

Reading file: ../Data/Input/TS006-2021-4_population_density.xlsx, sheet: Dataset



Reading file: ../Data/Input/datadownload_population_ethinicty.xlsx, sheet: N/A



Reading file: ../Data/Input/datadownload_population_qualification_rowwise.xlsx, sheet: N/A

Reading file: ../Data/Input/TS067-2021-3_population_qualification_colwise.xlsx, sheet: Dataset



Reading file: ../Data/Input/TS068-2021-3_population_student_not.xlsx, sheet: Dataset

Reading file: ../Data/Input/datadownload.xlsx, sheet: Ethnicity



Reading file: ../Data/Input/datadownload.xlsx, sheet: Unemployment & Employment

Reading file: ../Data/Input/datadownload.xlsx, sheet: Deprivation



Reading file: ../Data/Input/datadownload.xlsx, sheet: Households with children

Reading file: ../Data/Input/regionaltable1s_employment_count_area.xlsx, sheet: neast_p



Reading file: ../Data/Input/regionaltable1s_employment_count_area.xlsx, sheet: nwest_p



Reading file: ../Data/Input/regionaltable1s_employment_count_area.xlsx, sheet: ykhu_p



Reading file: ../Data/Input/regionaltable1s_employment_count_area.xlsx, sheet: emids_p



Reading file: ../Data/Input/regionaltable1s_employment_count_area.xlsx, sheet: wmids_p



Reading file: ../Data/Input/regionaltable1s_employment_count_area.xlsx, sheet: east_p



Reading file: ../Data/Input/regionaltable1s_employment_count_area.xlsx, sheet: lon_p



Reading file: ../Data/Input/regionaltable1s_employment_count_area.xlsx, sheet: seast_p



Reading file: ../Data/Input/regionaltable1s_employment_count_area.xlsx, sheet: swest_p



Reading file: ../Data/Input/datadownload_GDHI.xlsx, sheet: N/A



Reading file: ../Data/Input/3a88ffc6_inflation_uk_1.csv, sheet: N/A

Reading file: ../Data/Input/data-school-pupils-and-their-characteristics.csv, sheet: N/A

Reading file: ../Data/Input/Sub-National_Road_Transport_Fuel_Consumption_Tables_2005-2022.xlsx, sheet: 2022



Reading file: ../Data/Input/region_traffic_by_vehicle_type_1993_2023.csv, sheet: N/A

Reading file: ../Data/Input/local_authority_traffic_1993_2023.csv, sheet: N/A

Reading file: ../Data/Input/bus01_travel_count_2023.ods, sheet: BUS01e



Reading file: ../Data/Input/rai0201_peak_hours_count.ods, sheet: RAI0201



Reading file: ../Data/Input/hospital-records_2023.xls, sheet: Hospital - Union Catalogue



Reading file: ../Data/Input/data-education-and-training-statistics-for-the-uk_2023.csv, sheet: N/A

Reading file: ../Data/Input/subnationalestimatesofdwellingstockbytenure.xls, sheet: Table 2a


In [5]:
# Preview each loaded frame (first rows and shape), numbered from 1, next to
# the file it was read from
for position, frame in enumerate(dataframes, start=1):
    source_file = file_configs[position - 1]['file_path']
    if frame is None:
        print(f"DataFrame {position} for file {source_file} could not be loaded.")
        continue
    print(f"DataFrame {position} for file {source_file}:")
    print(frame.head())  # first few rows of the frame
    print(frame.shape)

DataFrame 1 for file pp-2023.csv:
            Transaction unique identifier   Price  Date of Transfer  Postcode  \
0  {FFA361DA-C01A-8A03-E053-4804A8C01F61}  270000  2023-06-01 00:00   SS2 4AX   
1  {FFA361DA-C01B-8A03-E053-4804A8C01F61}  372500  2023-05-03 00:00  CM15 9DJ   
2  {FFA361DA-C01D-8A03-E053-4804A8C01F61}  460000  2023-05-19 00:00   CM3 5JZ   
3  {FFA361DA-C01E-8A03-E053-4804A8C01F61}  315000  2023-01-20 00:00   SS9 4BA   
4  {FFA361DA-C01F-8A03-E053-4804A8C01F61}  970000  2023-05-26 00:00   RM2 6NH   

  Property Type Old/New Duration PAON SAON           Street  \
0             S       N        F    6  NaN  WEYBOURNE CLOSE   
1             S       N        F  138  NaN       ONGAR ROAD   
2             D       N        F   21  NaN     RODING LEIGH   
3             S       N        F    8  NaN    TIPTREE CLOSE   
4             D       N        F   67  NaN     LINKS AVENUE   

                Locality        Town/City         District           County  \
0                    

#### Combining the above DataFrames for consistency and the same granularity for further processing

Appending additional region and local authority information to the lookup table (Table 1)

In [6]:

# Adding additional local authority information that is consolidated from other files.
#
# Each authority is written as one (code, name, region) record so its region
# cannot drift out of step with its name. The previous four parallel lists had
# the region names shifted by one position from Derbyshire to Greater
# Manchester, and repeated the authority code as the region code for every
# county and metropolitan county.
_REGION_CODES = {
    'North East': 'E12000001',
    'North West': 'E12000002',
    'Yorkshire and The Humber': 'E12000003',
    'East Midlands': 'E12000004',
    'West Midlands': 'E12000005',
    'East of England': 'E12000006',
    'South East': 'E12000008',
    'South West': 'E12000009',
}

_ADDITIONAL_AUTHORITIES = [
    # Unitary authorities and districts
    ('E06000058', 'Bournemouth, Christchurch and Poole', 'South West'),
    ('E06000059', 'Dorset', 'South West'),
    ('E06000060', 'Buckinghamshire', 'South East'),
    ('E06000061', 'North Northamptonshire', 'East Midlands'),
    ('E06000062', 'West Northamptonshire', 'East Midlands'),
    ('E06000063', 'Cumberland', 'North West'),
    ('E06000064', 'Westmorland and Furness', 'North West'),
    ('E06000065', 'North Yorkshire', 'Yorkshire and The Humber'),
    ('E06000066', 'Somerset', 'South West'),
    ('E07000244', 'East Suffolk', 'East of England'),
    ('E07000245', 'West Suffolk', 'East of England'),
    # Counties
    ('E10000003', 'Cambridgeshire', 'East of England'),
    ('E10000007', 'Derbyshire', 'East Midlands'),
    ('E10000008', 'Devon', 'South West'),
    ('E10000011', 'East Sussex', 'South East'),
    ('E10000012', 'Essex', 'East of England'),
    ('E10000013', 'Gloucestershire', 'South West'),
    ('E10000014', 'Hampshire', 'South East'),
    ('E10000015', 'Hertfordshire', 'East of England'),
    ('E10000016', 'Kent', 'South East'),
    ('E10000017', 'Lancashire', 'North West'),
    ('E10000018', 'Leicestershire', 'East Midlands'),
    ('E10000019', 'Lincolnshire', 'East Midlands'),
    ('E10000020', 'Norfolk', 'East of England'),
    ('E10000024', 'Nottinghamshire', 'East Midlands'),
    ('E10000025', 'Oxfordshire', 'South East'),
    ('E10000028', 'Staffordshire', 'West Midlands'),
    ('E10000029', 'Suffolk', 'East of England'),
    ('E10000030', 'Surrey', 'South East'),
    ('E10000031', 'Warwickshire', 'West Midlands'),
    ('E10000032', 'West Sussex', 'South East'),
    ('E10000034', 'Worcestershire', 'West Midlands'),
    # Metropolitan counties
    ('E11000001', 'Greater Manchester', 'North West'),
    ('E11000002', 'Merseyside', 'North West'),
    ('E11000003', 'South Yorkshire', 'Yorkshire and The Humber'),
    ('E11000005', 'West Midlands', 'West Midlands'),
    ('E11000006', 'West Yorkshire', 'Yorkshire and The Humber'),
    ('E11000007', 'Tyne and Wear', 'North East'),
]

# Same four columns (and column order) as before; note the trailing space in
# 'Local authority code ', which matches the column label used for the lookup sheet.
additional_info = {
    'Region code': [_REGION_CODES[region] for _, _, region in _ADDITIONAL_AUTHORITIES],
    'Region name': [region for _, _, region in _ADDITIONAL_AUTHORITIES],
    'Local authority code ': [code for code, _, _ in _ADDITIONAL_AUTHORITIES],
    'Local authority name': [name for _, name, _ in _ADDITIONAL_AUTHORITIES],
}

df_additional = pd.DataFrame(additional_info)

# Stack the extra authorities under the 38th loaded frame (list index 37)
lookup_frames = [dataframes[37], df_additional]
regioncodes = pd.concat(lookup_frames, ignore_index=True)
info_logger.info("Appended additional local authority codes into existing data")

regioncodes.head()

Unnamed: 0,Region code,Region name,Local authority code,Local authority name,extraction_timestamp
0,E12000001,North East,E06000047,County Durham,2024-09-12 12:57:54.473735
1,E12000001,North East,E06000005,Darlington,2024-09-12 12:57:54.473735
2,E12000001,North East,E06000001,Hartlepool,2024-09-12 12:57:54.473735
3,E12000001,North East,E06000002,Middlesbrough,2024-09-12 12:57:54.473735
4,E12000001,North East,E06000057,Northumberland,2024-09-12 12:57:54.473735


#### Performing basic data cleaning and processing in Table 2

In [7]:
pph_before = dataframes[0]

# Order the rows by 'Town/City' first, then by 'Date of Transfer' within each town
sort_keys = ['Town/City', 'Date of Transfer']
sorted_df = pph_before.sort_values(sort_keys)
sorted_df.head()

Unnamed: 0,Transaction unique identifier,Price,Date of Transfer,Postcode,Property Type,Old/New,Duration,PAON,SAON,Street,Locality,Town/City,District,County,PPD Category Type,Record Status,extraction_timestamp
552131,{F3B6C198-2800-6E40-E053-6C04A8C0B3B4},210000,2023-01-06 00:00,WD5 0NN,T,N,F,18,,MARGARET CLOSE,,ABBOTS LANGLEY,THREE RIVERS,HERTFORDSHIRE,A,A,2024-09-12 12:57:41.392405
624437,{F87E72F8-F1EE-176C-E053-6B04A8C0D2BE},438250,2023-01-16 00:00,WD5 0NN,S,N,F,3,,MARGARET CLOSE,,ABBOTS LANGLEY,THREE RIVERS,HERTFORDSHIRE,A,A,2024-09-12 12:57:41.392405
330130,{FAC30767-210C-5E20-E053-4704A8C004EE},674500,2023-01-19 00:00,WD5 0LP,D,N,F,42,,FOLLETT DRIVE,,ABBOTS LANGLEY,THREE RIVERS,HERTFORDSHIRE,A,A,2024-09-12 12:57:41.392405
559851,{F5E8B081-4295-3A13-E053-6C04A8C060B7},412500,2023-01-19 00:00,WD5 0NR,T,N,F,13,,COLLEGE ROAD,,ABBOTS LANGLEY,THREE RIVERS,HERTFORDSHIRE,A,A,2024-09-12 12:57:41.392405
629175,{F87E72F8-F3DA-176C-E053-6B04A8C0D2BE},1100000,2023-01-25 00:00,WD5 0AY,D,N,F,31,,ABBOTS ROAD,,ABBOTS LANGLEY,THREE RIVERS,HERTFORDSHIRE,A,A,2024-09-12 12:57:41.392405


In [8]:

df = dataframes[0]

# Identifier and address columns that are dropped before aggregating
address_level_columns = ['Transaction unique identifier', 'Postcode', 'Street', 'Locality', 'PAON', 'SAON']
df_cleaned = df.drop(columns=address_level_columns)

# Parse 'Date of Transfer' once, then derive a month-year label such as "Jan-2023"
transfer_dates = pd.to_datetime(df_cleaned['Date of Transfer'])
df_cleaned['Date of Transfer'] = transfer_dates
df_cleaned['Transfer Month-Year'] = transfer_dates.dt.strftime('%b-%Y')

# Function to calculate mode
def mode(series):
    """Return the first value reported by ``series.mode()``, or None if it is empty."""
    most_common = series.mode()
    if most_common.empty:
        return None
    return most_common.iloc[0]

# Performing aggregation: one row per town / district / county / month
group_keys = ['Town/City', 'District', 'County', 'Transfer Month-Year']
column_aggregations = {
    'Price': 'sum',  # total of all prices in the group
    'Property Type': mode,
    'Old/New': mode,
    'Duration': mode,
    'PPD Category Type': mode,
    'Record Status': mode,
    'extraction_timestamp': 'max',  # to fetch the max timestamp for each group
}
aggregated_df = df_cleaned.groupby(group_keys).agg(column_aggregations).reset_index()

# The aggregated columns keep their original names, so no renaming is needed.

# Saving the result under its working name (same object as aggregated_df)
pricepaidhouse = aggregated_df

# Displaying the new DataFrame
pricepaidhouse.head()

Unnamed: 0,Town/City,District,County,Transfer Month-Year,Price,Property Type,Old/New,Duration,PPD Category Type,Record Status,extraction_timestamp
0,ABBOTS LANGLEY,THREE RIVERS,HERTFORDSHIRE,Apr-2023,5772000,T,N,F,A,A,2024-09-12 12:57:41.392405
1,ABBOTS LANGLEY,THREE RIVERS,HERTFORDSHIRE,Aug-2023,5825000,D,N,F,A,A,2024-09-12 12:57:41.392405
2,ABBOTS LANGLEY,THREE RIVERS,HERTFORDSHIRE,Dec-2023,2333500,T,N,F,A,A,2024-09-12 12:57:41.392405
3,ABBOTS LANGLEY,THREE RIVERS,HERTFORDSHIRE,Feb-2023,2688250,F,N,L,A,A,2024-09-12 12:57:41.392405
4,ABBOTS LANGLEY,THREE RIVERS,HERTFORDSHIRE,Jan-2023,3245250,T,N,F,A,A,2024-09-12 12:57:41.392405


##### Processing and combining Table 1 and Table 2

In [9]:

# function to apply transformations
def preprocess_columns(df, columns):
    """Upper-case and then strip whitespace from the given string columns.

    The frame is modified in place and also returned, so callers may either
    ignore the return value or rebind it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are normalised.
    columns : iterable of str
        Names of the columns to normalise; each is accessed through `.str`.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in.
    """
    for name in columns:
        upper_cased = df[name].str.upper()
        df[name] = upper_cased.str.strip()
    return df

# Preprocessing columns in pricepaidhouse
pricepaidhouse = preprocess_columns(pricepaidhouse, ['District', 'Town/City', 'County'])

# Preprocessing columns in df38/regioncodes
regioncodes = preprocess_columns(regioncodes, ['Local authority name'])

# Mapping specific District names so they match 'Local authority name' in regioncodes.
# Series.replace with a dict applies each key to the ORIGINAL values only (it is not
# chained), so every key must map directly to its final target spelling.
mapping = {
    'AYLESBURY': 'AYLESBURY VALE',
    'CITY OF BRISTOL': 'BRISTOL, CITY OF',  # direct target; was 'BRISTOL', which never reached the final form
    'BRISTOL': 'BRISTOL, CITY OF',
    'LONDON': 'CITY OF LONDON',
    'CITY OF PLYMOUTH': 'PLYMOUTH',
    'CITY OF PETERBOROUGH': 'PETERBOROUGH',
    'CITY OF NOTTINGHAM': 'NOTTINGHAM',
    'CITY OF WESTMINSTER': 'WESTMINSTER',
    'CITY OF DERBY': 'DERBY',
    'HEREFORDSHIRE': 'HEREFORDSHIRE, COUNTY OF',
    'KINGSTON UPON HULL': 'KINGSTON UPON HULL, CITY OF',
    'CITY OF KINGSTON UPON HULL': 'KINGSTON UPON HULL, CITY OF',
    'ST HELENS': 'ST. HELENS',
    'TELFORD , WREKIN': 'TELFORD AND WREKIN',
    'WEYMOUTH , PORTLAND': 'WEYMOUTH AND PORTLAND'
}

# Replacing District values based on mapping (whole-value matches only)
pricepaidhouse['District'] = pricepaidhouse['District'].replace(mapping)

# Combines regioncodes with pricepaidhouse in three attempts (District, Town/City, County)
# and then collapses the two extraction timestamps into one column.

# Firstly, merging on 'District'
# Outer join: keeps regioncodes rows with no matching District and pricepaidhouse
# rows with no matching local authority name.
merged_df = pd.merge(regioncodes, pricepaidhouse, left_on='Local authority name', right_on='District', how='outer')

# Checking which rows did not get a match on 'District'
unmatched_df = merged_df[merged_df['District'].isnull()]

# Merging unmatched rows on 'Town/City'
# NOTE(review): merged_df_2 is not used by any later statement in this cell, so the
# Town/City fallback currently has no effect on merged_df — confirm whether it should.
merged_df_2 = pd.merge(unmatched_df.drop(columns=['Price', 'Property Type', 'Old/New', 'Duration', 'PPD Category Type', 'Record Status', 'Local authority code ', 'District']), 
                               pricepaidhouse, left_on='Local authority name', right_on='Town/City', how='outer')


# Checking which rows still did not get a match on 'District' and 'Town/City'
# NOTE(review): this filters merged_df (not merged_df_2), and merged_df has not changed
# since unmatched_df was built, so unmatched_df_2 holds the same rows as unmatched_df.
unmatched_df_2 = merged_df[merged_df['District'].isnull()]

# Merging unmatched rows on 'County'
merged_df_3 = pd.merge(unmatched_df_2.drop(columns=['Price', 'Property Type', 'Old/New', 'Duration', 'PPD Category Type', 'Record Status', 'Local authority code ', 'District']), 
                       pricepaidhouse, left_on='Local authority name', right_on='County', how='outer')

# Updating the merged_df with matches from 'County'
# NOTE(review): the right-hand Series carries merged_df_3's own row labels, and a .loc
# assignment aligns on index labels. Those labels come from a different merge result
# than merged_df's, so the values written here may not belong to the rows they land
# on — verify before relying on the filled 'District' values.
merged_df.loc[merged_df['District'].isnull(), 'District'] = merged_df_3['District']

# Identifying the maximum extraction timestamp between the two columns
# ('_x' comes from regioncodes, '_y' from pricepaidhouse, via the merge suffixes)
merged_df['extraction_timestamp'] = merged_df[['extraction_timestamp_x', 'extraction_timestamp_y']].max(axis=1)

# Dropping the duplicate extraction timestamp columns
merged_df.drop(columns=['extraction_timestamp_x', 'extraction_timestamp_y'], inplace=True)
info_logger.info("Combined table 1-pricepaidhouse and table 2-regioncodes")

# To display the final dataframe
print(merged_df.head())

  Region code Region name Local authority code  Local authority name  \
0   E12000008  South East             E07000223                 ADUR   
1   E12000008  South East             E07000223                 ADUR   
2   E12000008  South East             E07000223                 ADUR   
3   E12000008  South East             E07000223                 ADUR   
4   E12000008  South East             E07000223                 ADUR   

  Town/City District       County Transfer Month-Year      Price  \
0  BRIGHTON     ADUR  WEST SUSSEX            Apr-2023  1604250.0   
1  BRIGHTON     ADUR  WEST SUSSEX            Aug-2023  5921500.0   
2  BRIGHTON     ADUR  WEST SUSSEX            Dec-2023  3973000.0   
3  BRIGHTON     ADUR  WEST SUSSEX            Feb-2023  4325000.0   
4  BRIGHTON     ADUR  WEST SUSSEX            Jan-2023  4180500.0   

  Property Type Old/New Duration PPD Category Type Record Status  \
0             S       N        F                 A             A   
1             S       

##### Form granularity in the base combined table (1, 2) at the level of month and local authority

In [10]:

# Custom mode function
def mode_function(series):
    """Return the most frequent non-null value in `series`, or NaN if there is none.

    `Series.mode()` ignores nulls by default, so an empty or all-null series
    yields an empty result and therefore NaN here; no separate all-null check
    is needed. The mode is computed once. When several values tie, the first
    value returned by `Series.mode()` is used.

    Parameters
    ----------
    series : pandas.Series
        Values of one column within a group.

    Returns
    -------
    object or float
        First modal value, or `np.nan` when the series has no non-null values.
    """
    modes = series.mode()
    return np.nan if modes.empty else modes.iloc[0]

# Splitting the DataFrame into rows that carry a price and rows that do not
has_price = merged_df['Price'].notna()
df_with_price = merged_df[has_price]
df_without_price = merged_df[~has_price]

# Performing aggregation on the part with valid 'Price' values:
# one row per (District, month); Price is averaged, descriptive columns take their
# most frequent value, and the latest extraction timestamp is kept.
df_with_price_agg = df_with_price.groupby(['District', 'Transfer Month-Year']).agg({
    'Town/City': mode_function,
    'County': mode_function,
    'Price': 'mean',
    'Property Type': mode_function,
    'Old/New': mode_function,
    'Duration': mode_function,
    'PPD Category Type': mode_function,
    'Record Status': mode_function,
    'Region code': mode_function,
    'Region name': mode_function,
    'Local authority code ': mode_function,
    'Local authority name': mode_function,
    'extraction_timestamp': 'max'
}).reset_index()

# Generating month-year combinations for missing entries
date_range = pd.date_range(start='2023-01-01', end='2023-12-01', freq='MS').strftime('%b-%Y')

# Repeat every price-less row once per month (rows stay grouped, months in calendar
# order) and overwrite the month label. This is vectorised rather than copying each
# row in a Python loop, and it keeps the original column dtypes.
df_new = df_without_price.loc[df_without_price.index.repeat(len(date_range))].copy()
df_new['Transfer Month-Year'] = list(date_range) * len(df_without_price)

# Combining the aggregated DataFrame with the part that has NaN 'Price' values
merged_df_pph = pd.concat([df_with_price_agg, df_new], ignore_index=True)

# To Display the final DataFrame
print(merged_df_pph.head())


  District Transfer Month-Year Town/City       County         Price  \
0     ADUR            Apr-2023  BRIGHTON  WEST SUSSEX  6.539251e+06   
1     ADUR            Aug-2023  BRIGHTON  WEST SUSSEX  1.005653e+07   
2     ADUR            Dec-2023  BRIGHTON  WEST SUSSEX  6.319333e+06   
3     ADUR            Feb-2023  BRIGHTON  WEST SUSSEX  8.030573e+06   
4     ADUR            Jan-2023  BRIGHTON  WEST SUSSEX  8.335137e+06   

  Property Type Old/New Duration PPD Category Type Record Status Region code  \
0             F       N        F                 A             A   E12000008   
1             S       N        F                 A             A   E12000008   
2             S       N        F                 A             A   E12000008   
3             S       N        F                 A             A   E12000008   
4             D       N        F                 A             A   E12000008   

  Region name Local authority code  Local authority name  \
0  South East             E07000

##### Processing Table 3 and merge with Table 1, 2

In [11]:
df = dataframes[1]

# Defining columns to retain
columns_to_retain = [
    'Date', 'RegionName', 'AreaCode', 'AveragePrice', 'Index', '1m%Change', 'SalesVolume',
    'DetachedPrice', 'SemiDetachedPrice', 'TerracedPrice', 'FlatPrice',
    'CashPrice', 'MortgagePrice', 'MortgageIndex', 'FTBPrice', 'FOOPrice',
    'NewPrice', 'NewSalesVolume', 'OldPrice', 'OldSalesVolume', 'extraction_timestamp'
]

# Filtering the dataframe; .copy() makes this an independent frame so the 'Date'
# assignment below does not write to a slice of the loaded data
df_filtered = df[columns_to_retain].copy()

# Converting the 'Date' column from 'dd/mm/yyyy' to the 'MMM-YYYY' label (e.g. 'Jan-2023')
# used as the month key in the other tables
df_filtered['Date'] = pd.to_datetime(df_filtered['Date'], format='%d/%m/%Y').dt.strftime('%b-%Y')

# Renaming the filtered dataframe to HPI
HPI = df_filtered

# Joining merged_df_pph to HPI on local authority code and month.
# Outer join: rows present on only one side are kept.
combined_df = pd.merge(
    merged_df_pph,
    HPI,
    left_on=['Local authority code ', 'Transfer Month-Year'],
    right_on=['AreaCode', 'Date'],
    how='outer'
)
# creating new 'extraction_timestamp' column with the maximum timestamp from the two columns
combined_df['extraction_timestamp'] = combined_df[['extraction_timestamp_x', 'extraction_timestamp_y']].max(axis=1)

# Dropping the duplicate extraction timestamp columns
combined_df.drop(columns=['extraction_timestamp_x', 'extraction_timestamp_y'], inplace=True)

# Retaining specified columns, removing other columns as they don't add any value for future analysis or contain the same value
columns_to_retain = ['District', 'Transfer Month-Year', 'Town/City', 'County', 'Price',
       'Property Type', 'Old/New', 'Duration', 'PPD Category Type',
       'Record Status', 'Region code', 'Region name', 'Local authority code ',
       'Local authority name', 'Date', 'RegionName',
       'AreaCode', 'AveragePrice', 'Index', '1m%Change', 'SalesVolume',
       'DetachedPrice', 'SemiDetachedPrice', 'TerracedPrice', 'FlatPrice',
       'CashPrice', 'MortgagePrice', 'MortgageIndex', 'FTBPrice', 'FOOPrice',
       'NewPrice', 'NewSalesVolume', 'OldPrice', 'OldSalesVolume', 'extraction_timestamp']

combined_df = combined_df[columns_to_retain]

info_logger.info("Combined table 1 and 2 with table -3 HPI")

# To Display the combined dataframe
print(combined_df.head())

     District Transfer Month-Year   Town/City      County       Price  \
0  HARTLEPOOL            Apr-2023  BILLINGHAM  HARTLEPOOL   7714092.0   
1  HARTLEPOOL            Aug-2023  HARTLEPOOL  HARTLEPOOL  18222858.0   
2  HARTLEPOOL            Dec-2023  BILLINGHAM  HARTLEPOOL  10575089.0   
3  HARTLEPOOL            Feb-2023  BILLINGHAM  HARTLEPOOL  10700793.5   
4  HARTLEPOOL            Jan-2023  BILLINGHAM  HARTLEPOOL   9209592.0   

  Property Type Old/New Duration PPD Category Type Record Status  ...  \
0             D       N        F                 A             A  ...   
1             T       N        F                 A             A  ...   
2             D       N        F                 A             A  ...   
3             D       N        F                 A             A  ...   
4             D       N        F                 A             A  ...   

  CashPrice MortgagePrice MortgageIndex  FTBPrice  FOOPrice  NewPrice  \
0  105918.0      134626.0         126.3  104662.0

##### Process Table 4 and merge with Table -1,2,3

In [12]:
# Work on a copy of the rent table so the loaded frame is left untouched
rent = dataframes[2].copy()

# Re-express 'Time period' as a 'Mon-YYYY' label, overwriting the original column
rent_periods = pd.to_datetime(rent['Time period'])
rent['Time period'] = rent_periods.dt.strftime('%b-%Y')

# Left join: every combined_df row is kept; rent figures are attached where the
# local authority code and month both match
combined_df1 = combined_df.merge(
    rent,
    how='left',
    left_on=['Local authority code ', 'Transfer Month-Year'],
    right_on=['Area code', 'Time period'],
)

# Keep the later of the two extraction timestamps in a single column
rent_timestamp_cols = ['extraction_timestamp_x', 'extraction_timestamp_y']
combined_df1['extraction_timestamp'] = combined_df1[rent_timestamp_cols].max(axis=1)

# Remove the join keys / labels brought in from rent and the suffixed timestamps
combined_df1 = combined_df1.drop(
    columns=['Area code', 'Time period', 'Area name', 'Region or country name'] + rent_timestamp_cols
)
info_logger.info("Combined table 1, 2 and 3 with table -4 Rent")

# Show the first rows of the result
print(combined_df1.head())

     District Transfer Month-Year   Town/City      County       Price  \
0  HARTLEPOOL            Apr-2023  BILLINGHAM  HARTLEPOOL   7714092.0   
1  HARTLEPOOL            Aug-2023  HARTLEPOOL  HARTLEPOOL  18222858.0   
2  HARTLEPOOL            Dec-2023  BILLINGHAM  HARTLEPOOL  10575089.0   
3  HARTLEPOOL            Feb-2023  BILLINGHAM  HARTLEPOOL  10700793.5   
4  HARTLEPOOL            Jan-2023  BILLINGHAM  HARTLEPOOL   9209592.0   

  Property Type Old/New Duration PPD Category Type Record Status  ...  \
0             D       N        F                 A             A  ...   
1             T       N        F                 A             A  ...   
2             D       N        F                 A             A  ...   
3             D       N        F                 A             A  ...   
4             D       N        F                 A             A  ...   

  MortgageIndex  FTBPrice  FOOPrice  NewPrice NewSalesVolume  OldPrice  \
0         126.3  104662.0  147534.0  182570.0   

##### Process Table 5 and merge with Table -1,2,3,4

In [13]:
df = dataframes[3]

# Month labels ('Jan-2023' ... 'Dec-2023') covered by the currently processed file
months = [
    f'{month_abbr}-2023'
    for month_abbr in ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                       'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
]

# Repeat every source row once per month, then label the copies with the month cycle
repeated_labels = df.index.repeat(len(months))
BR_rent = df.loc[repeated_labels].reset_index(drop=True)
BR_rent['Month-Year'] = months * len(df)

# Left join of combined_df1 with BR_rent on local authority code and month
combined_df2 = combined_df1.merge(
    BR_rent,
    how='left',
    left_on=['Local authority code ', 'Transfer Month-Year'],
    right_on=['Area Code', 'Month-Year'],
)

# Keep the later of the two extraction timestamps in a single column
br_timestamp_cols = ['extraction_timestamp_x', 'extraction_timestamp_y']
combined_df2['extraction_timestamp'] = combined_df2[br_timestamp_cols].max(axis=1)

# Remove the join keys / labels brought in from BR_rent and the suffixed timestamps
combined_df2 = combined_df2.drop(columns=['Area Code', 'Area Name', 'Month-Year'] + br_timestamp_cols)

info_logger.info("Combined table 1, 2, 3 and 4 with table -5 Rent Bedrooms")
# Show the first rows of the result
print(combined_df2.head())

     District Transfer Month-Year   Town/City      County       Price  \
0  HARTLEPOOL            Apr-2023  BILLINGHAM  HARTLEPOOL   7714092.0   
1  HARTLEPOOL            Aug-2023  HARTLEPOOL  HARTLEPOOL  18222858.0   
2  HARTLEPOOL            Dec-2023  BILLINGHAM  HARTLEPOOL  10575089.0   
3  HARTLEPOOL            Feb-2023  BILLINGHAM  HARTLEPOOL  10700793.5   
4  HARTLEPOOL            Jan-2023  BILLINGHAM  HARTLEPOOL   9209592.0   

  Property Type Old/New Duration PPD Category Type Record Status  ...  \
0             D       N        F                 A             A  ...   
1             T       N        F                 A             A  ...   
2             D       N        F                 A             A  ...   
3             D       N        F                 A             A  ...   
4             D       N        F                 A             A  ...   

   OldPrice OldSalesVolume Annual change (%) Rental price (£) One Bedroom  \
0  116234.0           54.0               1.6 

##### Process Tables 6, 7, 8 and 9 to retain the required columns, and use Table 10 to derive an average monthly % change from the annual % change so the records can be split to the required granularity (local authority level, monthly)

In [14]:
# Percentage-change table, ordered by local authority name for inspection
Annual_data = dataframes[8].sort_values(by=['LA name'])
Annual_data.head()

Unnamed: 0,LA code,LA name,Percentage change,extraction_timestamp
220,E07000223,Adur,5.4,2024-09-12 12:57:45.932833
64,E07000026,Allerdale,-1.3,2024-09-12 12:57:45.932833
70,E07000032,Amber Valley,3.0,2024-09-12 12:57:45.932833
221,E07000224,Arun,10.2,2024-09-12 12:57:45.932833
178,E07000170,Ashfield,5.7,2024-09-12 12:57:45.932833


In [15]:
# Female table: expose the 'All ages' total as 'Female population' and order by area name
female_individual_age_data = (
    dataframes[5]
    .rename(columns={'All ages': 'Female population'})
    .sort_values(by=['Name'])
)

# Preview only the identifying columns and the renamed total
female_preview_columns = ['Code', 'Name', 'Female population']
female_individual_age_data[female_preview_columns].head()


Unnamed: 0,Code,Name,Female population
297,E07000223,Adur,33471
81,E07000032,Amber Valley,64433
298,E07000224,Arun,86400
106,E07000170,Ashfield,64951
266,E07000105,Ashford,69645


In [16]:
# Male table: expose the 'All ages' total as 'Male population' and order by area name
male_individual_age_data = (
    dataframes[6]
    .rename(columns={'All ages': 'Male population'})
    .sort_values(by=['Name'])
)

# Preview only the identifying columns and the renamed total
male_preview_columns = ['Code', 'Name', 'Male population']
male_individual_age_data[male_preview_columns].head()


Unnamed: 0,Code,Name,Male population
297,E07000223,Adur,31217
81,E07000032,Amber Valley,62511
298,E07000224,Arun,79966
106,E07000170,Ashfield,62228
266,E07000105,Ashford,65965


In [17]:
df4, df5, df6, df7 = (dataframes[position] for position in range(4, 8))

# Single-year age column names: '0' ... '89' followed by '90+'
age_columns = [str(i) for i in range(90)] + ['90+']

# Band totals are added to df4 itself. Slices are positional over age_columns, so the
# bands cover ages 0-20, 21-40, 41-60 and 61 upwards even though the labels overlap.
age_band_slices = {
    '0-20': slice(None, 21),
    '20-40': slice(21, 41),
    '40-60': slice(41, 61),
    '60+': slice(61, None),
}
for band_label, band_slice in age_band_slices.items():
    df4[band_label] = df4[age_columns[band_slice]].sum(axis=1)

# Selecting relevant columns
df4_grouped = df4[['Code', 'Name', 'Geography', 'All ages', '0-20', '20-40', '40-60', '60+', 'extraction_timestamp']]

# Expose the per-sex totals under explicit names before joining them on
df5 = df5.rename(columns={'All ages': 'Female population'})
df6 = df6.rename(columns={'All ages': 'Male population'})

# Left-join female total, male total and area onto the banded frame, keyed by 'Code'
population = (
    df4_grouped
    .merge(df5[['Code', 'Female population']], on='Code', how='left')
    .merge(df6[['Code', 'Male population']], on='Code', how='left')
    .merge(df7[['Code', 'Area (sq km)']], on='Code', how='left')
)

df8_copy = dataframes[8].copy()
pop_copy = population.copy()

# Average per-year change: the overall 'Percentage change' is spread over 10 periods
# via the 1/10 exponent. NOTE(review): presumably a ten-year change — confirm.
overall_growth = 1 + df8_copy['Percentage change'] / 100
df8_copy['Annual Percentage Change'] = (overall_growth ** (1 / 10) - 1) * 100

# Average per-month change: the yearly figure spread over 12 months
yearly_growth = 1 + df8_copy['Annual Percentage Change'] / 100
df8_copy['Monthly Percentage Change'] = (yearly_growth ** (1 / 12) - 1) * 100

# Remember the latest extraction timestamp before the column is removed;
# it is re-attached to the final frame later
max_extraction_timestamp = df8_copy['extraction_timestamp'].max()
df8_copy = df8_copy.drop(columns=['extraction_timestamp'])
pop_copy = pop_copy.drop(columns=['extraction_timestamp'])

# Month labels for the year 2023
month_starts = pd.date_range(start='2023-01-01', end='2023-12-01', freq='MS')
months = month_starts.strftime('%b-%Y').tolist()

# Repeat every population row once per month and label the copies with the month cycle
repeated_values = np.repeat(pop_copy.values, len(months), axis=0)
expanded_population = pd.DataFrame(repeated_values, columns=pop_copy.columns)
expanded_population['Month-Year'] = months * len(pop_copy)

# Applying monthly percentage change to population
def apply_monthly_change(row):
    """Scale one row's population columns by compounded monthly growth.

    Reads the module-level `df8_copy`, `months` and `pop_copy`. If the row's
    'Code' does not appear in `df8_copy['LA code']`, the row is returned
    unchanged. Otherwise the first matching 'Monthly Percentage Change' is
    compounded by the position of the row's 'Month-Year' in `months`
    (position 0 leaves values as they are), and every column in
    `pop_copy.columns[3:-1]` is multiplied by that factor and rounded.

    Parameters
    ----------
    row : pandas.Series
        One row of the month-expanded population frame.

    Returns
    -------
    pandas.Series
        The same row object, with adjusted population columns where applicable.
    """
    la_code = row['Code']

    # Areas without a percentage-change entry are passed through untouched
    if la_code not in df8_copy['LA code'].values:
        return row

    matching_changes = df8_copy.loc[df8_copy['LA code'] == la_code, 'Monthly Percentage Change']
    monthly_change = matching_changes.values[0] / 100
    growth_factor = (1 + monthly_change) ** months.index(row['Month-Year'])

    # Every column except the first three and the last one
    for population_col in pop_copy.columns[3:-1]:
        row[population_col] = round(row[population_col] * growth_factor)

    return row

# Adjust the population columns row by row
df_population = expanded_population.apply(apply_monthly_change, axis='columns')

# Every row receives the same timestamp: the latest one captured earlier
row_count = len(df_population)
df_population = df_population.assign(extraction_timestamp=[max_extraction_timestamp] * row_count)

# Displaying the final dataframe
print(df_population)


           Code               Name          Geography  All ages      0-20  \
0     K04000001  ENGLAND AND WALES            Country  60238038  14594949   
1     K04000001  ENGLAND AND WALES            Country  60238038  14594949   
2     K04000001  ENGLAND AND WALES            Country  60238038  14594949   
3     K04000001  ENGLAND AND WALES            Country  60238038  14594949   
4     K04000001  ENGLAND AND WALES            Country  60238038  14594949   
...         ...                ...                ...       ...       ...   
4279  W06000022            Newport  Unitary Authority    162363     41314   
4280  W06000022            Newport  Unitary Authority    162486     41345   
4281  W06000022            Newport  Unitary Authority    162609     41377   
4282  W06000022            Newport  Unitary Authority    162732     41408   
4283  W06000022            Newport  Unitary Authority    162855     41439   

         20-40     40-60       60+  Female population  Male population  \
0

In [18]:
# Order by area name and then month label, and preview the first twelve rows
new_age_data = df_population.sort_values(by=['Name', 'Month-Year'])
new_age_preview_columns = ['Code', 'Name', 'All ages', 'Month-Year']
new_age_data[new_age_preview_columns].head(12)

Unnamed: 0,Code,Name,All ages,Month-Year
3567,E07000223,Adur,64773,Apr-2023
3571,E07000223,Adur,64887,Aug-2023
3575,E07000223,Adur,65001,Dec-2023
3565,E07000223,Adur,64716,Feb-2023
3564,E07000223,Adur,64688,Jan-2023
3570,E07000223,Adur,64858,Jul-2023
3569,E07000223,Adur,64830,Jun-2023
3566,E07000223,Adur,64745,Mar-2023
3568,E07000223,Adur,64802,May-2023
3574,E07000223,Adur,64972,Nov-2023


##### Merge files - 6,7,8,9,10 with earlier combined files - 1,2,3,4,5

In [19]:
# Left join of combined_df2 with df_population on local authority code and month
combined_df3 = combined_df2.merge(
    df_population,
    how='left',
    left_on=['Local authority code ', 'Transfer Month-Year'],
    right_on=['Code', 'Month-Year'],
)

# Keep the later of the two extraction timestamps in a single column
population_timestamp_cols = ['extraction_timestamp_x', 'extraction_timestamp_y']
combined_df3['extraction_timestamp'] = combined_df3[population_timestamp_cols].max(axis=1)

# Remove the join keys / labels brought in from df_population and the suffixed timestamps
combined_df3 = combined_df3.drop(
    columns=['Code', 'Name', 'Geography', 'Month-Year'] + population_timestamp_cols
)

info_logger.info("Combined table 1, 2, 3, 4 and 5 with table 6,7,8,9,10 - age")
# Show the first rows of the result
print(combined_df3.head())

     District Transfer Month-Year   Town/City      County       Price  \
0  HARTLEPOOL            Apr-2023  BILLINGHAM  HARTLEPOOL   7714092.0   
1  HARTLEPOOL            Aug-2023  HARTLEPOOL  HARTLEPOOL  18222858.0   
2  HARTLEPOOL            Dec-2023  BILLINGHAM  HARTLEPOOL  10575089.0   
3  HARTLEPOOL            Feb-2023  BILLINGHAM  HARTLEPOOL  10700793.5   
4  HARTLEPOOL            Jan-2023  BILLINGHAM  HARTLEPOOL   9209592.0   

  Property Type Old/New Duration PPD Category Type Record Status  ...  \
0             D       N        F                 A             A  ...   
1             T       N        F                 A             A  ...   
2             D       N        F                 A             A  ...   
3             D       N        F                 A             A  ...   
4             D       N        F                 A             A  ...   

  All categories All ages     0-20    20-40    40-60      60+  \
0          534.0  93868.0  22983.0  22662.0  24331.0  238

##### Skipping Tables 11 and 12
##### Merge and process Table 13 with combined df from Table 1 to 10

In [20]:
# Month labels for the year 2023
month_starts = pd.date_range(start='2023-01-01', end='2023-12-01', freq='MS')
months = month_starts.strftime('%b-%Y').tolist()

# Repeat every qualification row once per month and label the copies with the month cycle
qualification_source = dataframes[11]
qualification_pop = pd.DataFrame(
    np.repeat(qualification_source.values, len(months), axis=0),
    columns=qualification_source.columns,
)
qualification_pop['Month-Year'] = months * len(qualification_source)

# Left join of combined_df3 with the expanded table on local authority code and month
combined_df4 = combined_df3.merge(
    qualification_pop,
    how='left',
    left_on=['Local authority code ', 'Transfer Month-Year'],
    right_on=['Area code', 'Month-Year'],
)

# Keep the later of the two extraction timestamps in a single column
qualification_timestamp_cols = ['extraction_timestamp_x', 'extraction_timestamp_y']
combined_df4['extraction_timestamp'] = combined_df4[qualification_timestamp_cols].max(axis=1)

# Remove the join keys / labels brought in from the right side and the suffixed timestamps
combined_df4 = combined_df4.drop(
    columns=['Area code', 'Area name', 'Month-Year'] + qualification_timestamp_cols
)

info_logger.info("Combined table 1 to 10 with table 13 - qualification")
# Show the first rows of the result
print(combined_df4.head())


     District Transfer Month-Year   Town/City      County       Price  \
0  HARTLEPOOL            Apr-2023  BILLINGHAM  HARTLEPOOL   7714092.0   
1  HARTLEPOOL            Aug-2023  HARTLEPOOL  HARTLEPOOL  18222858.0   
2  HARTLEPOOL            Dec-2023  BILLINGHAM  HARTLEPOOL  10575089.0   
3  HARTLEPOOL            Feb-2023  BILLINGHAM  HARTLEPOOL  10700793.5   
4  HARTLEPOOL            Jan-2023  BILLINGHAM  HARTLEPOOL   9209592.0   

  Property Type Old/New Duration PPD Category Type Record Status  ...  \
0             D       N        F                 A             A  ...   
1             T       N        F                 A             A  ...   
2             D       N        F                 A             A  ...   
3             D       N        F                 A             A  ...   
4             D       N        F                 A             A  ...   

  Qualification index score  \
0                       2.2   
1                       2.2   
2                       2.2  

##### Skipping Tables 14, 15, 16 and 17

##### Merge and process Table 19 with earlier tables 1 to 10,13

In [21]:
# Month labels for the year 2023
month_starts = pd.date_range(start='2023-01-01', end='2023-12-01', freq='MS')
months = month_starts.strftime('%b-%Y').tolist()

# Repeat every source row once per month and label the copies with the month cycle
house_child_source = dataframes[17]
house_child = pd.DataFrame(
    np.repeat(house_child_source.values, len(months), axis=0),
    columns=house_child_source.columns,
)
house_child['Month-Year'] = months * len(house_child_source)

# Left join of combined_df4 with the expanded table on local authority code and month
combined_df5 = combined_df4.merge(
    house_child,
    how='left',
    left_on=['Local authority code ', 'Transfer Month-Year'],
    right_on=['Area code', 'Month-Year'],
)

# Keep the later of the two extraction timestamps in a single column
house_child_timestamp_cols = ['extraction_timestamp_x', 'extraction_timestamp_y']
combined_df5['extraction_timestamp'] = combined_df5[house_child_timestamp_cols].max(axis=1)

# Remove the right-hand region columns ('_y'), the join keys / labels and the suffixed
# timestamps; the left-hand region columns keep their '_x' suffix after this step
combined_df5 = combined_df5.drop(
    columns=['Region code_y', 'Region name_y', 'Area code', 'Area name ', 'Month-Year']
    + house_child_timestamp_cols
)

info_logger.info("Combined table 1 to 10, 13 with table 19 - house with child ")
# Show the first rows of the result
print(combined_df5.head())


     District Transfer Month-Year   Town/City      County       Price  \
0  HARTLEPOOL            Apr-2023  BILLINGHAM  HARTLEPOOL   7714092.0   
1  HARTLEPOOL            Aug-2023  HARTLEPOOL  HARTLEPOOL  18222858.0   
2  HARTLEPOOL            Dec-2023  BILLINGHAM  HARTLEPOOL  10575089.0   
3  HARTLEPOOL            Feb-2023  BILLINGHAM  HARTLEPOOL  10700793.5   
4  HARTLEPOOL            Jan-2023  BILLINGHAM  HARTLEPOOL   9209592.0   

  Property Type Old/New Duration PPD Category Type Record Status  ...  \
0             D       N        F                 A             A  ...   
1             T       N        F                 A             A  ...   
2             D       N        F                 A             A  ...   
3             D       N        F                 A             A  ...   
4             D       N        F                 A             A  ...   

  Qualification index rank \n(lowest = 1, highest = 331)  \
0                                                278       
1 

##### Merge and process Table 18 with earlier tables 1 to 10,13,19

In [22]:
# Month labels for the year 2023
month_starts = pd.date_range(start='2023-01-01', end='2023-12-01', freq='MS')
months = month_starts.strftime('%b-%Y').tolist()

# Repeat every deprivation row once per month and label the copies with the month cycle
deprivation_source = dataframes[16]
deprivation = pd.DataFrame(
    np.repeat(deprivation_source.values, len(months), axis=0),
    columns=deprivation_source.columns,
)
deprivation['Month-Year'] = months * len(deprivation_source)

# Left join of combined_df5 with the expanded table on local authority code and month
combined_df6 = combined_df5.merge(
    deprivation,
    how='left',
    left_on=['Local authority code ', 'Transfer Month-Year'],
    right_on=['Area Code', 'Month-Year'],
)

# Keep the later of the two extraction timestamps in a single column
deprivation_timestamp_cols = ['extraction_timestamp_x', 'extraction_timestamp_y']
combined_df6['extraction_timestamp'] = combined_df6[deprivation_timestamp_cols].max(axis=1)

# Remove the region columns, join keys / labels and suffixed timestamps from the right
# side, then give the score column a self-explanatory name
combined_df6 = (
    combined_df6
    .drop(columns=['Region Code', 'Region Name', 'Area Code', 'Area Name ', 'Month-Year']
          + deprivation_timestamp_cols)
    .rename(columns={'Average Score ': 'Deprivation Average Score'})
)

info_logger.info("Combined table 1 to 10, 13, 19 with table 18- Deprivation")
# Show the first rows of the result
print(combined_df6.head())


     District Transfer Month-Year   Town/City      County       Price  \
0  HARTLEPOOL            Apr-2023  BILLINGHAM  HARTLEPOOL   7714092.0   
1  HARTLEPOOL            Aug-2023  HARTLEPOOL  HARTLEPOOL  18222858.0   
2  HARTLEPOOL            Dec-2023  BILLINGHAM  HARTLEPOOL  10575089.0   
3  HARTLEPOOL            Feb-2023  BILLINGHAM  HARTLEPOOL  10700793.5   
4  HARTLEPOOL            Jan-2023  BILLINGHAM  HARTLEPOOL   9209592.0   

  Property Type Old/New Duration PPD Category Type Record Status  ...  \
0             D       N        F                 A             A  ...   
1             T       N        F                 A             A  ...   
2             D       N        F                 A             A  ...   
3             D       N        F                 A             A  ...   
4             D       N        F                 A             A  ...   

  No qualifications (number) Level 1 and entry level qualifications (number)  \
0                      17286              

##### Merge and process Table 17, 20 to 28 with earlier tables 1 to 10,13,18,19

In [23]:

# Region names in code order; codes run from 'E12000001' to 'E12000009'
region_names = [
    'North East',
    'North West',
    'Yorkshire and The Humber',
    'East Midlands',
    'West Midlands',
    'East of England',
    'London',
    'South East',
    'South West',
]
regions = {
    f'E1200000{region_number}': region_name
    for region_number, region_name in enumerate(region_names, start=1)
}

# Regional employment frames: dataframes 18 to 26, taken in the same order as the
# regions above (the pairing between the two is purely positional)
employment_dfs = []
for frame_position in range(18, 27):
    employment_dfs.append(dataframes[frame_position])

# Function to calculate monthly percentage change
def calculate_monthly_percentage_change(df):
    """Add row-over-row fractional change columns for employment and unemployment.

    'Monthly Employment Change' is derived from 'Total in employment' and
    'Monthly Unemployment Change' from 'Unemployed', each via `pct_change()`
    with missing results (such as the first row) replaced by 0. The values
    are fractions (0.5 means +50%), not percentages. The frame is modified in
    place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'Total in employment' and 'Unemployed' columns.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the two change columns added.
    """
    change_sources = {
        'Monthly Employment Change': 'Total in employment',
        'Monthly Unemployment Change': 'Unemployed',
    }
    for change_col, level_col in change_sources.items():
        df[change_col] = df[level_col].pct_change().fillna(0)
    return df

# Add the change columns to one employment frame per region, by position
for region_position in range(len(regions)):
    employment_dfs[region_position] = calculate_monthly_percentage_change(employment_dfs[region_position])

# Function to expand the dataframe for each month
def expand_dataframe(df, months):
    """Repeat every row of `df` once per month and label the copies.

    Each source row appears `len(months)` times consecutively, and the new
    'Month-Year' column cycles through `months` for each source row. The
    result is rebuilt from `df.values`, so it has a fresh default index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to expand.
    months : list
        Month labels; must be a list because it is repeated with `*`.

    Returns
    -------
    pandas.DataFrame
        New frame with `len(df) * len(months)` rows and a 'Month-Year' column.
    """
    repeated_values = np.repeat(df.values, len(months), axis=0)
    month_cycle = months * len(df)
    expanded_df = pd.DataFrame(repeated_values, columns=df.columns)
    expanded_df['Month-Year'] = month_cycle
    return expanded_df

# Month labels for the year 2023
month_starts = pd.date_range(start='2023-01-01', end='2023-12-01', freq='MS')
months = month_starts.strftime('%b-%Y').tolist()

# One copy of every dataframes[15] row per month
employment_unemployment = expand_dataframe(dataframes[15], months)

# Function to apply monthly changes to local authorities
def apply_monthly_changes(row, employment_df, months):
    """Scale a row's employment counts by the regional change for its month.

    The row's 'Month-Year' label is located in ``months`` and the entry of
    ``employment_df`` at that same position supplies the two change rates.
    Employee and self-employed counts are multiplied by
    ``1 + 'Monthly Employment Change'``; the unemployed count is multiplied by
    ``1 + 'Monthly Unemployment Change'``. Each result is passed through
    ``round``.

    Args:
        row: Series holding 'Month-Year' and the three count columns.
            It is modified in place.
        employment_df: Regional frame carrying the two change columns; it is
            indexed by position, so it is presumably ordered like ``months``
            - verify against the caller.
        months: List of month labels; must contain row['Month-Year'].

    Returns:
        The same row with the three counts rescaled.
    """
    regional_row = employment_df.iloc[months.index(row['Month-Year'])]
    employment_factor = 1 + regional_row['Monthly Employment Change']
    unemployment_factor = 1 + regional_row['Monthly Unemployment Change']

    # (column to rescale, factor to apply)
    scaling_plan = (
        ('Number of those aged 16+ in employment who are employees', employment_factor),
        ('Number of those aged 16+ who are unemployed', unemployment_factor),
        ('Number of those aged 16+ in employment who are self-employed', employment_factor),
    )
    for column, factor in scaling_plan:
        row[column] = round(row[column] * factor)

    return row

# Applying the changes region by region: the i-th region code is paired with employment_dfs[i].
# Every row of that region is passed through apply_monthly_changes with its NaNs replaced by 0;
# note that row.fillna(0) touches every column of the row, not only the three that get rescaled.
for i, region_code in enumerate(regions.keys()):
    region_mask = (employment_unemployment['Region code'] == region_code)
    employment_unemployment.loc[region_mask] = employment_unemployment[region_mask].apply(lambda row: apply_monthly_changes(row.fillna(0), employment_dfs[i], months), axis=1)

# Merging with combined_df6: 'Local authority code ' / 'Transfer Month-Year' on the left are matched
# against 'Area code' / 'Month-Year' on the right. The left join keeps every row of combined_df6.
combined_df7 = pd.merge(
    combined_df6,
    employment_unemployment,
    left_on=['Local authority code ', 'Transfer Month-Year'],
    right_on=['Area code', 'Month-Year'],
    how='left'
)

# The merge leaves the two inputs' timestamps as extraction_timestamp_x / _y; keeping the later one per row
combined_df7['extraction_timestamp'] = combined_df7[['extraction_timestamp_x', 'extraction_timestamp_y']].max(axis=1)

# Dropping the region/area key columns, the right-hand 'Month-Year' and the suffixed timestamps
combined_df7.drop(columns=['Region code', 'Region name', 'Area code', 'Area name ', 'Month-Year', 'extraction_timestamp_x', 'extraction_timestamp_y'], inplace=True)

info_logger.info("Combined table 1 to 10, 13, 18, 19 with table 17, 20 to 28 - Employment/Unemployment")
# Displaying the first rows of the merged DataFrame
print( combined_df7.head())


     District Transfer Month-Year   Town/City      County       Price  \
0  HARTLEPOOL            Apr-2023  BILLINGHAM  HARTLEPOOL   7714092.0   
1  HARTLEPOOL            Aug-2023  HARTLEPOOL  HARTLEPOOL  18222858.0   
2  HARTLEPOOL            Dec-2023  BILLINGHAM  HARTLEPOOL  10575089.0   
3  HARTLEPOOL            Feb-2023  BILLINGHAM  HARTLEPOOL  10700793.5   
4  HARTLEPOOL            Jan-2023  BILLINGHAM  HARTLEPOOL   9209592.0   

  Property Type Old/New Duration PPD Category Type Record Status  ...  \
0             D       N        F                 A             A  ...   
1             T       N        F                 A             A  ...   
2             D       N        F                 A             A  ...   
3             D       N        F                 A             A  ...   
4             D       N        F                 A             A  ...   

  Apprenticeship (number) Level 3 qualifications (number)  \
0                    5205                           13776   


##### Merge and process Table 29 with earlier tables 1 to 10,13,17 to 28

In [24]:
# Repeating each GDHI row (dataframes[27]) once per month of 2023
months = [
    month_start.strftime('%b-%Y')
    for month_start in pd.date_range(start='2023-01-01', end='2023-12-01', freq='MS')
]
GDHI = expand_dataframe(dataframes[27], months)

# Left-joining onto combined_df7 by local authority and month.
# NOTE(review): the right-hand key is 'LA name' while the left-hand key is a local
# authority *code*; an 'LA code' column also exists (it is dropped below). Confirm
# which column of dataframes[27] actually holds the codes.
combined_df8 = pd.merge(
    combined_df7,
    GDHI,
    how='left',
    left_on=['Local authority code ', 'Transfer Month-Year'],
    right_on=['LA name', 'Month-Year'],
)

# Keeping the later of the two extraction timestamps on each row
timestamp_pair = ['extraction_timestamp_x', 'extraction_timestamp_y']
combined_df8['extraction_timestamp'] = combined_df8[timestamp_pair].max(axis=1)

# Removing the GDHI key columns and the suffixed timestamps, then naming the 2021 column 'GDHI'
combined_df8 = (
    combined_df8
    .drop(columns=['LA code', 'LA name', 'Month-Year'] + timestamp_pair)
    .rename(columns={2021: 'GDHI'})
)

info_logger.info("Combined table 1 to 10, 13, 17 to 28  with table 29 - GDHI")
# Showing the first rows of the merged DataFrame
print(combined_df8.head())


     District Transfer Month-Year   Town/City      County       Price  \
0  HARTLEPOOL            Apr-2023  BILLINGHAM  HARTLEPOOL   7714092.0   
1  HARTLEPOOL            Aug-2023  HARTLEPOOL  HARTLEPOOL  18222858.0   
2  HARTLEPOOL            Dec-2023  BILLINGHAM  HARTLEPOOL  10575089.0   
3  HARTLEPOOL            Feb-2023  BILLINGHAM  HARTLEPOOL  10700793.5   
4  HARTLEPOOL            Jan-2023  BILLINGHAM  HARTLEPOOL   9209592.0   

  Property Type Old/New Duration PPD Category Type Record Status  ...  \
0             D       N        F                 A             A  ...   
1             T       N        F                 A             A  ...   
2             D       N        F                 A             A  ...   
3             D       N        F                 A             A  ...   
4             D       N        F                 A             A  ...   

  Level 3 qualifications (number) Level 4 qualifications and above (number)  \
0                           13776          

##### Process and format Table 31 to month and local-authority granularity by calculating the yearly change and deriving a monthly change from it

In [25]:

# Columns of the school table (dataframes[29]) that are removed before processing
columns_to_drop = [
    'time_identifier',
    'old_la_code',
    'sex_of_school_description',
    'type_of_establishment',
    'denomination',
    'admissions_policy',
    'urban_rural',
    'academy_flag',
]

# Removing those columns and keeping only the rows whose phase_type_grouping is 'Total'
school = (
    dataframes[29]
    .drop(columns=columns_to_drop)
    .loc[lambda frame: frame['phase_type_grouping'] == 'Total']
)

# Latest extraction timestamp among the retained rows
max_extraction_timestamp_school = school['extraction_timestamp'].max()

# Extracting Year from `time_period`
def extract_year(time_period):
    """Return the year following the one encoded in the first four characters.

    ``time_period`` is converted to text, its first four characters are read
    as an integer year, and that year plus one is returned (e.g. 202324 -> 2024).

    Args:
        time_period: Value whose string form starts with a four-digit year.

    Returns:
        int: The leading year plus one.

    Raises:
        ValueError: If the first four characters are not an integer.
    """
    leading_digits = str(time_period)[:4]
    return 1 + int(leading_digits)

# Attaching the year derived from `time_period` to every school record
school['year'] = school['time_period'].apply(extract_year)

# Columns identifying a local authority; used as the pivot index and as the merge keys
pivot_keys = ['region_code', 'region_name', 'new_la_code', 'la_name']


def _yearly_change_pivot(frame, value_column, percent_column, monthly_column):
    """Pivot ``value_column`` by year and append yearly / monthly percent change.

    The percent change is (2024 - 2023) / 2023 * 100 when both year columns
    exist, otherwise NaN; the monthly figure is that percentage divided by 12.
    """
    pivot = frame.pivot_table(index=pivot_keys, columns='year', values=value_column).reset_index()
    has_both_years = 2024 in pivot.columns and 2023 in pivot.columns
    if has_both_years:
        pivot[percent_column] = ((pivot[2024] - pivot[2023]) / pivot[2023]) * 100
    else:
        pivot[percent_column] = float('nan')
    pivot[monthly_column] = pivot[percent_column] / 12
    return pivot


# One pivot per measure: pupil headcount and number of schools
headcount_pivot = _yearly_change_pivot(
    school, 'headcount_of_pupils', 'percent_change_headcount', 'monthly_difference_headcount'
)
school_pivot = _yearly_change_pivot(
    school, 'number_of_schools', 'percent_change_schools', 'monthly_difference_schools'
)

# Joining the two pivots on the local-authority keys; the year columns they share
# come out suffixed with _x (headcount) and _y (schools)
merged_pivot = pd.merge(headcount_pivot, school_pivot, on=pivot_keys)

# Function to expand the dataframe for each month
def expand_dataframe(df, months):
    """Return ``df`` with each row repeated once per entry in ``months``.

    Copies of a row are kept together, and the added 'Month-Year' column runs
    through ``months`` for every original row. The output is rebuilt from
    ``df.values`` and gets a fresh 0..n-1 index.

    NOTE: this repeats the definition of ``expand_dataframe`` from an earlier
    cell; the two bodies do the same thing.

    Args:
        df: DataFrame to expand.
        months: List of month labels (must be a list: it is tiled with ``*``).

    Returns:
        A new DataFrame with ``len(df) * len(months)`` rows.
    """
    copies_per_row = len(months)
    expanded_df = pd.DataFrame(
        np.repeat(df.values, copies_per_row, axis=0),
        columns=df.columns,
    )
    # Tiling the labels so each block of repeated rows receives months[0], months[1], ...
    expanded_df['Month-Year'] = months * len(df)
    return expanded_df

# Building the 'Mon-YYYY' labels for January to December 2023
months = [
    month_start.strftime('%b-%Y')
    for month_start in pd.date_range(start='2023-01-01', end='2023-12-01', freq='MS')
]

# Repeating every local-authority row of the merged pivot once per month label
expanded_merged_pivot = expand_dataframe(merged_pivot, months)

# Function to apply monthly changes
def apply_monthly_changes(row, months):
    """Project the 2023 pupil headcount and school count onto the row's month.

    Each base value is grown linearly, ``base * (1 + monthly_pct / 100 * k)``,
    where ``k`` is the zero-based position of the row's 'Month-Year' label in
    ``months``; the result is rounded up with ``math.ceil``.

    Missing (NaN) base values and monthly rates are treated as 0. A non-finite
    monthly rate is also treated as 0: it arises when the 2023 base is 0 and
    the percentage change divides by it, and would otherwise feed NaN into
    ``math.ceil`` and raise. Such rows simply keep their base value.

    Args:
        row: Series with 'Month-Year', '2023_x' and 'monthly_difference_headcount'
            (pupil headcount), and '2023_y' and 'monthly_difference_schools'
            (number of schools). It is modified in place.
        months: List of month labels; must contain row['Month-Year'].

    Returns:
        The same row with 'monthly_headcount_of_pupils' and
        'monthly_number_of_schools' set to the projected integers.

    Raises:
        ValueError: If row['Month-Year'] is not in ``months``.
    """
    month_index = months.index(row['Month-Year'])

    def _project(base_value, monthly_pct):
        # Shared arithmetic for both measures: sanitise the inputs, grow, round up
        base = base_value if pd.notna(base_value) else 0
        rate_is_usable = pd.notna(monthly_pct) and math.isfinite(monthly_pct)
        rate = monthly_pct if rate_is_usable else 0
        return math.ceil(base * (1 + (rate / 100) * month_index))

    row['monthly_headcount_of_pupils'] = _project(row['2023_x'], row['monthly_difference_headcount'])
    row['monthly_number_of_schools'] = _project(row['2023_y'], row['monthly_difference_schools'])

    return row

# Computing the projected monthly values one row at a time
expanded_merged_pivot = expanded_merged_pivot.apply(
    lambda record: apply_monthly_changes(record, months),
    axis=1,
)

# Keeping the identifying columns plus the two projected measures
output_columns = [
    'region_code',
    'region_name',
    'new_la_code',
    'la_name',
    'Month-Year',
    'monthly_number_of_schools',
    'monthly_headcount_of_pupils',
]
final_df = expanded_merged_pivot[output_columns]

# Giving the projected measures their presentation names
presentation_names = {
    'monthly_headcount_of_pupils': 'Headcount of Pupils',
    'monthly_number_of_schools': 'Number of Schools',
}
school_count = final_df.rename(columns=presentation_names)

# Stamping every row with the latest extraction timestamp of the school table
school_count['extraction_timestamp'] = [max_extraction_timestamp_school] * len(final_df)

# Displaying the final dataframe
school_count


year,region_code,region_name,new_la_code,la_name,Month-Year,Number of Schools,Headcount of Pupils,extraction_timestamp
0,E12000001,North East,E06000001,Hartlepool,Jan-2023,39,15156,2024-09-12 12:57:51.034777
1,E12000001,North East,E06000001,Hartlepool,Feb-2023,40,15160,2024-09-12 12:57:51.034777
2,E12000001,North East,E06000001,Hartlepool,Mar-2023,40,15164,2024-09-12 12:57:51.034777
3,E12000001,North East,E06000001,Hartlepool,Apr-2023,40,15167,2024-09-12 12:57:51.034777
4,E12000001,North East,E06000001,Hartlepool,May-2023,40,15171,2024-09-12 12:57:51.034777
...,...,...,...,...,...,...,...,...
1867,E12000009,South West,E10000027,Somerset,Aug-2023,299,81044,2024-09-12 12:57:51.034777
1868,E12000009,South West,E10000027,Somerset,Sep-2023,299,81044,2024-09-12 12:57:51.034777
1869,E12000009,South West,E10000027,Somerset,Oct-2023,299,81044,2024-09-12 12:57:51.034777
1870,E12000009,South West,E10000027,Somerset,Nov-2023,299,81044,2024-09-12 12:57:51.034777


##### Combining processed Table 31 with previous combined tables

In [26]:
# Left-joining the monthly school counts onto combined_df8 by local authority code and month
combined_df9 = pd.merge(
    combined_df8,
    school_count,
    how='left',
    left_on=['Local authority code ', 'Transfer Month-Year'],
    right_on=['new_la_code', 'Month-Year'],
)

# Keeping the later of the two extraction timestamps on each row
timestamp_pair = ['extraction_timestamp_x', 'extraction_timestamp_y']
combined_df9['extraction_timestamp'] = combined_df9[timestamp_pair].max(axis=1)

# Removing the school-table key columns and the suffixed timestamps
school_key_columns = ['region_code', 'region_name', 'new_la_code', 'la_name', 'Month-Year']
combined_df9 = combined_df9.drop(columns=school_key_columns + timestamp_pair)

info_logger.info("Combined table 1 to 10, 13, 17 to 29  with table 31 - School")
# Showing the first rows of the merged DataFrame
print(combined_df9.head())

     District Transfer Month-Year   Town/City      County       Price  \
0  HARTLEPOOL            Apr-2023  BILLINGHAM  HARTLEPOOL   7714092.0   
1  HARTLEPOOL            Aug-2023  HARTLEPOOL  HARTLEPOOL  18222858.0   
2  HARTLEPOOL            Dec-2023  BILLINGHAM  HARTLEPOOL  10575089.0   
3  HARTLEPOOL            Feb-2023  BILLINGHAM  HARTLEPOOL  10700793.5   
4  HARTLEPOOL            Jan-2023  BILLINGHAM  HARTLEPOOL   9209592.0   

  Property Type Old/New Duration PPD Category Type Record Status  ...  \
0             D       N        F                 A             A  ...   
1             T       N        F                 A             A  ...   
2             D       N        F                 A             A  ...   
3             D       N        F                 A             A  ...   
4             D       N        F                 A             A  ...   

  Other (number)  \
0           1854   
1           1854   
2           1854   
3           1854   
4           1854   

 

##### Finally, combining Table 32 with all of the previously combined tables

In [27]:
# Repeating each transport row (dataframes[30]) once per month of 2023
months = [
    month_start.strftime('%b-%Y')
    for month_start in pd.date_range(start='2023-01-01', end='2023-12-01', freq='MS')
]
Transport = expand_dataframe(dataframes[30], months)

# Left-joining onto combined_df9 by local authority code and month.
# NOTE(review): the final frame shows consecutive rows with the same District, month and
# Price but different transport figures, which suggests a one-to-many match somewhere in
# the merge chain - consider checking that the right-hand keys are unique.
combined_df10 = pd.merge(
    combined_df9,
    Transport,
    how='left',
    left_on=['Local authority code ', 'Transfer Month-Year'],
    right_on=['Local Authority Code', 'Month-Year'],
)

# Keeping the later of the two extraction timestamps on each row
timestamp_pair = ['extraction_timestamp_x', 'extraction_timestamp_y']
combined_df10['extraction_timestamp'] = combined_df10[timestamp_pair].max(axis=1)

# Removing the transport-table key columns and the suffixed timestamps
transport_key_columns = ['Local Authority Code', 'Region', 'Local Authority [Note 4]', 'Month-Year']
combined_df10 = combined_df10.drop(columns=transport_key_columns + timestamp_pair)

info_logger.info("Combined table 1 to 10, 13, 17 to 29, 31  with table 32 - Transport")
# Showing the first rows of the merged DataFrame
print(combined_df10.head())


     District Transfer Month-Year   Town/City      County       Price  \
0  HARTLEPOOL            Apr-2023  BILLINGHAM  HARTLEPOOL   7714092.0   
1  HARTLEPOOL            Aug-2023  HARTLEPOOL  HARTLEPOOL  18222858.0   
2  HARTLEPOOL            Dec-2023  BILLINGHAM  HARTLEPOOL  10575089.0   
3  HARTLEPOOL            Feb-2023  BILLINGHAM  HARTLEPOOL  10700793.5   
4  HARTLEPOOL            Jan-2023  BILLINGHAM  HARTLEPOOL   9209592.0   

  Property Type Old/New Duration PPD Category Type Record Status  ...  \
0             D       N        F                 A             A  ...   
1             T       N        F                 A             A  ...   
2             D       N        F                 A             A  ...   
3             D       N        F                 A             A  ...   
4             D       N        F                 A             A  ...   

  Petrol cars total HGV - \nMotorways HGV total Diesel LGV total  \
0         17.376559               0.0   5.94557       

In [28]:
# Binding the fully merged frame to its final name (this is the same object as combined_df10, not a copy)
final_extracted_df = combined_df10

In [29]:
# Displaying the final extracted dataset as the cell output
final_extracted_df

Unnamed: 0,District,Transfer Month-Year,Town/City,County,Price,Property Type,Old/New,Duration,PPD Category Type,Record Status,...,Petrol cars total,HGV - \nMotorways,HGV total,Diesel LGV total,Petrol LGV total,LPG LGV total,"Personal transport (buses, cars and motorcycles)",Freight transport (HGV and LGV)\n[Note 5],Fuel consumption by all vehicles,extraction_timestamp
0,HARTLEPOOL,Apr-2023,BILLINGHAM,HARTLEPOOL,7.714092e+06,D,N,F,A,A,...,17.376559,0.0,5.94557,7.966492,0.259175,0.000132,31.807181,14.17137,45.97855,2024-09-12 12:57:54.473735
1,HARTLEPOOL,Aug-2023,HARTLEPOOL,HARTLEPOOL,1.822286e+07,T,N,F,A,A,...,17.376559,0.0,5.94557,7.966492,0.259175,0.000132,31.807181,14.17137,45.97855,2024-09-12 12:57:54.473735
2,HARTLEPOOL,Dec-2023,BILLINGHAM,HARTLEPOOL,1.057509e+07,D,N,F,A,A,...,17.376559,0.0,5.94557,7.966492,0.259175,0.000132,31.807181,14.17137,45.97855,2024-09-12 12:57:54.473735
3,HARTLEPOOL,Feb-2023,BILLINGHAM,HARTLEPOOL,1.070079e+07,D,N,F,A,A,...,17.376559,0.0,5.94557,7.966492,0.259175,0.000132,31.807181,14.17137,45.97855,2024-09-12 12:57:54.473735
4,HARTLEPOOL,Jan-2023,BILLINGHAM,HARTLEPOOL,9.209592e+06,D,N,F,A,A,...,17.376559,0.0,5.94557,7.966492,0.259175,0.000132,31.807181,14.17137,45.97855,2024-09-12 12:57:54.473735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6423,WREKIN,Sep-2023,NEWPORT,WREKIN,2.423886e+07,D,N,F,A,A,...,418.676662,32.19039,246.867138,69.795177,0.862299,0.001168,990.353915,317.525782,1307.879696,2024-09-12 12:57:51.426346
6424,WREXHAM,Sep-2023,LLANGOLLEN,WREXHAM,9.443366e+06,D,N,F,A,A,...,612.980547,75.448465,269.532931,408.144414,12.917964,0.006811,1367.290489,690.60212,2057.892609,2024-09-12 12:57:51.426346
6425,WREXHAM,Sep-2023,LLANGOLLEN,WREXHAM,9.443366e+06,D,N,F,A,A,...,1071.503964,246.997711,634.335188,696.832295,22.236031,0.01161,2155.79192,1353.415124,3509.207044,2024-09-12 12:57:51.426346
6426,WREXHAM,Sep-2023,LLANGOLLEN,WREXHAM,9.443366e+06,D,N,F,A,A,...,11294.584426,3057.434575,6209.878673,5870.456868,175.833021,0.090506,20807.608615,12256.259068,33063.867683,2024-09-12 12:57:51.426346


In [30]:
# Writing the final dataset to Excel without the index column.
# The output folder is created first (same pattern as the log directory setup),
# because the write fails when the target directory does not exist.
output_path = '../Data/Output/Data_Extraction_combined_df.xlsx'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
final_extracted_df.to_excel(output_path, index=False)