In [1]:
# Install necessary packages for SQLAlchemy and PostgreSQL support
! pip install sqlalchemy psycopg2

! pip install psycopg2-binary

import numpy as np  # Importing NumPy for numerical operations
import pandas as pd  # Importing Pandas for data manipulation and analysis

from datetime import datetime, timedelta  # Importing datetime and timedelta for handling dates and time deltas

from sqlalchemy import create_engine  # Importing Create_engine for connecting to SQL databases
import joblib  # Importing Joblib for saving and loading Python objects efficiently
import psycopg2  # Importing psycopg2 provides PostgreSQL database adapter for Python
from contextlib import contextmanager  # Importing contextmanager for creating and managing a context manager

from sklearn.base import BaseEstimator, TransformerMixin  # Importing Base classes for custom Transformers
from sklearn.compose import ColumnTransformer  # Importing ColumnTransformer for applying transformers to specific columns
from sklearn.ensemble import RandomForestRegressor  # Importing RandomForest for regression tasks
from sklearn.impute import SimpleImputer  # Importing SimpleImputer for handling missing values
from sklearn.model_selection import train_test_split  # Importing train_test_split to split data into train and test sets
from sklearn.pipeline import Pipeline, FeatureUnion  # Importing Pipeline and FeatureUnion for creating machine learning workflows
from sklearn.preprocessing import (  
    RobustScaler,  # Importing RobustScaler for scaling features robust to outliers
    StandardScaler,  # Importing StandardScaler for standardizing features
    OrdinalEncoder,  # Importing OrdinalEncoder for encoding categorical features
    FunctionTransformer  # Importing FunctionTransformer for creating custom data transformations
)

import logging  # Importing  logging module for event tracking and debugging
import os  # Importing OS module for interacting with the operating system
import warnings  # Warnings module to control warning messages

# Suppress all warnings
warnings.filterwarnings("ignore")

import sys  # Importing  Sys module to interact with the Python runtime environment
sys.path.append('../Scripts/')  # Adding the directory containing .py modules to the system path

# Specific imports from custom scripts
from data_transformations_1 import main_processing_pipeline  # Importing the specific function

from data_transformations_2 import (  # Importing custom transformers and utilities
    group_mean_imputer1,
    group_mean_imputer2,
    feature_creation,
    cap_floor_transformer,
    encoding_transformer,
    robust_scaler_price,
    standard_scaler,
    scaling_normalizing_transformer,
    desired_column_order,
    GroupMeanImputer,
    FeatureCreator,
    EncodingWithNames,
    ColumnTransformerDf,
    ColumnOrderTransformer,
    drop_columns1,
    cap_floor,
    cap_floor_func,
    drop_columns2,
    feature_manipulation,
    handle_infinite_values
)

from data_transformations_3 import apply_imputations, prepare_data  # Importing functions for data preparation






In [2]:

notebook_name = 'Data Incremental Loading'

# Path of the INFO-level log file written by this notebook
info_log_path = f'../Logs/info/{notebook_name}_info.log'

# Creating the log directory if it doesn't exist
os.makedirs(os.path.dirname(info_log_path), exist_ok=True)

# Clearing any previous handlers on the root logger if re-running this setup
logger = logging.getLogger()
while logger.handlers:
    logger.handlers.pop()

# Configuring the named logger used by every cell of the notebook
info_logger = logging.getLogger('info_logger')

# logging.getLogger returns the same logger object on every call, so re-running
# this cell used to stack one more FileHandler on 'info_logger' each time and
# every message was then written once per re-run. Detach (and close) the
# handlers left over from a previous run before attaching a fresh one.
for stale_handler in list(info_logger.handlers):
    info_logger.removeHandler(stale_handler)
    stale_handler.close()

info_handler = logging.FileHandler(info_log_path, mode='a')  # Append mode
info_handler.setLevel(logging.INFO)

# Timestamp - level - message layout for every record written to the file
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
info_handler.setFormatter(formatter)

# Attaching the file handler and letting INFO (and above) through
info_logger.addHandler(info_handler)
info_logger.setLevel(logging.INFO)


## Data Extraction

In [3]:

# Reading the Data_Extraction_combined_df Excel file into a DataFrame
Data_Extraction_combined_df = pd.read_excel('../Data/Output/Data_Extraction_combined_df.xlsx')

# Creating an independent copy of the Data_Extraction_combined_df DataFrame for further ETL processing
ETL_extraction_load=Data_Extraction_combined_df.copy()

# Selecting the 'Transfer Month-Year' and 'Date' columns from the ETL_extraction_load DataFrame.
# NOTE: this expression is not the last statement of the cell, so its result is
# discarded and nothing is rendered; its only effect is a KeyError if either
# column is missing.
ETL_extraction_load[['Transfer Month-Year', 'Date']]

info_logger.info("Read the source data")

### Sample incremental data for Jan 2024

In [4]:

# Defining a function to sample an equal fraction of rows within each group
def sample_equal_within_group(group, frac=0.1, random_state=42):
    """Return a reproducible random sample of a fraction of the rows of ``group``.

    Args:
        group: DataFrame (or Series) to sample from; when used with
            ``groupby(...).apply`` this is the sub-frame of one group.
        frac: Fraction of the rows to keep (default 0.1).
        random_state: Seed forwarded to ``sample``. The default of 42 is the
            value that used to be hard-coded, so existing calls draw the same
            rows as before.

    Returns:
        The sampled rows, as returned by ``group.sample``.
    """
    return group.sample(frac=frac, random_state=random_state)

# Building the simulated increment in one chained expression:
#   1. sample within every 'Local authority code ' group (the trailing space in
#      the key is kept exactly as the column is spelled at this stage),
#   2. discard the group index produced by apply,
#   3. stamp both date columns as 'Jan-2024' and record the extraction moment.
grouped_sampled_df = (
    ETL_extraction_load
    .groupby(['Local authority code '])
    .apply(sample_equal_within_group)
    .reset_index(drop=True)
    .assign(**{
        'Transfer Month-Year': 'Jan-2024',
        'Date': 'Jan-2024',
        'extraction_timestamp': datetime.now(),
    })
)

# Independent copy that the following cells treat as the incoming incremental batch
new_incremental_data = grouped_sampled_df.copy()

info_logger.info("Sample data generation completed")

#### Metadata timestamp check for filtering required data

In [5]:

# Establishing a connection to the PostgreSQL database.
# Each setting can be overridden through the standard libpq environment
# variables (PGDATABASE, PGUSER, PGPASSWORD, PGHOST, PGPORT); the literals are
# only fallbacks so the notebook keeps running unchanged when none are set.
# FIXME(security): the fallback password is still stored in the notebook -
# export PGPASSWORD and remove the literal before sharing or committing.
connection = psycopg2.connect(
    dbname=os.environ.get('PGDATABASE', 'UK Real Estate DB'),
    user=os.environ.get('PGUSER', 'postgres'),
    password=os.environ.get('PGPASSWORD', '123!@*qweQWE'),
    host=os.environ.get('PGHOST', 'localhost'),
    port=os.environ.get('PGPORT', '5432')
)

# Listing target tables in the database to check for the last extracted date
target_tables = ['region_dimension', 'date_dimension', 'sales_transactions_fact',
                 'rental_dimension', 'vehicle_dimension', 'property_type_dimension',
                 'demographics_dimension', 'education_employment_dimension']

# Initialising a variable to store the minimum last extracted date across all target tables
min_last_extracted_date = None

# Retrieving the minimum last extracted date across all specified target tables.
# try/finally guarantees the connection is closed even when the query fails.
try:
    with connection.cursor() as cursor:
        # The table names are passed as a bound parameter: psycopg2 renders a
        # Python tuple as a parenthesised SQL list suitable for IN.
        query_metadata = """
        SELECT MIN(last_extracted_date) as min_last_extracted_date
        FROM metadata
        WHERE table_name IN %s;
        """
        cursor.execute(query_metadata, (tuple(target_tables),))
        result = cursor.fetchone()
        # MIN() over zero matching rows still yields one row holding NULL, so
        # a first-ever load ends up with None here.
        min_last_extracted_date = result[0] if result else None
finally:
    # Closing the database connection after retrieving the date
    connection.close()

# Filtering the incremental data based on the retrieved minimum last extraction date
if min_last_extracted_date is not None:
    # Filtering to get only new data since the minimum last extraction date across all target tables
    filtered_data_for_processing = new_incremental_data[new_incremental_data['extraction_timestamp'] > min_last_extracted_date]
    record_count = filtered_data_for_processing.shape[0]

    if record_count > 0:
        print(f"Processing {record_count} new records after filtering by the minimum last extraction date: {min_last_extracted_date}.")
    else:
        print("No new data to process after filtering.")
else:
    print("No previous extraction date found for any target table. Processing all available data.")
    # This branch used to bind `filtered_data_for_transformation` and then read
    # `filtered_data_for_processing`, which raised NameError on a first load and
    # left the next cell without its input. Both branches now bind the same name.
    filtered_data_for_processing = new_incremental_data
    record_count = filtered_data_for_processing.shape[0]

    if record_count > 0:
        print(f"Processing {record_count} records.")
    else:
        print("No data available for processing.")

# `filtered_data_for_processing` now holds either the records whose
# extraction_timestamp is greater than min_last_extracted_date, or every
# record when no previous extraction date exists.

info_logger.info("Required data identified and filtered")

Processing 348 new records after filtering by the minimum last extraction date: 2024-09-12 13:39:05.010491.


In [6]:

# Creating a copy of the filtered data for further transformations and store it in `final_extracted_df`
# (a copy, so later changes to `final_extracted_df` leave `filtered_data_for_processing` untouched)
final_extracted_df = filtered_data_for_processing.copy()


In [7]:
# Persisting the extracted increment to Excel without the index column; the
# preprocessing stage that follows is given this same file path as its input
final_extracted_df.to_excel("../Data/Output/final_extracted_df.xlsx", index=False)

## Data Transformation

### Preprocessing steps

In [8]:
    
# calling saved preprocessing pipeline on sampled extracted df
# (the function receives the path of the Excel file written from
# final_extracted_df, not the in-memory DataFrame)
original_cleaned_df = main_processing_pipeline("../Data/Output/final_extracted_df.xlsx")

info_logger.info("Applied preprocessing pipeline stage")


[1mDropping Unnecessary column:[0m
Dropped 'extraction_timestamp' column.

[1mDuplicate check:[0m
No duplicate rows found in the DataFrame.

[1mColumn Renaming:[0m
Renamed columns.

[1mNull values in Each record:[0m
Column: District, Number of Null Records: 0
Column: Transfer Month-Year, Number of Null Records: 0
Column: Town/City, Number of Null Records: 55
Column: County, Number of Null Records: 55
Column: Price, Number of Null Records: 55
Column: Property Type, Number of Null Records: 55
Column: Old/New, Number of Null Records: 55
Column: Duration, Number of Null Records: 55
Column: PPD Category Type, Number of Null Records: 55
Column: Record Status, Number of Null Records: 55
Column: Region code, Number of Null Records: 0
Column: Region name, Number of Null Records: 0
Column: Local authority code, Number of Null Records: 0
Column: Local authority name, Number of Null Records: 0
Column: Date, Number of Null Records: 0
Column: RegionName, Number of Null Records: 26
Column: A

Data saved to '../Data/Output/original_cleaned_df.xlsx'


In [9]:
# Displaying the cleaned DataFrame returned by main_processing_pipeline
original_cleaned_df 

Unnamed: 0,District,Transfer Month-Year,Town/City,County,Price,Property Type,Old/New,Duration,PPD Category Type,Record Status,...,Diesel cars total,Petrol cars total,HGV - Motorways,HGV total,Diesel LGV total,Petrol LGV total,LPG LGV total,"Personal transport (buses, cars and motorcycles)",Freight transport (HGV and LGV),Fuel consumption by all vehicles
0,HARTLEPOOL,Jan-2024,BILLINGHAM,HARTLEPOOL,8.218558e+06,D,N,F,A,A,...,12.728113,17.376559,0.00000,5.945570,7.966492,0.259175,1.321797e-04,31.807181,14.171370,45.978550
1,MIDDLESBROUGH,Jan-2024,MIDDLESBROUGH,MIDDLESBROUGH,2.853917e+07,T,N,F,A,A,...,27.592611,40.284392,0.00000,9.553060,16.150149,0.554463,2.714230e-04,71.986032,26.257943,98.243975
2,REDCAR AND CLEVELAND,Jan-2024,GUISBOROUGH,REDCAR AND CLEVELAND,6.770916e+06,S,N,F,A,A,...,18.833614,24.777259,0.00000,6.076368,10.949645,0.357697,1.841292e-04,46.393679,17.383894,63.777573
3,STOCKTON-ON-TEES,Jan-2024,BILLINGHAM,STOCKTON-ON-TEES,1.022323e+07,D,N,F,A,A,...,28.473491,38.617891,0.00000,15.601339,17.406072,0.557174,2.874927e-04,70.363456,33.564873,103.928329
4,DARLINGTON,Jan-2024,DARLINGTON,DARLINGTON,1.498214e+07,T,N,F,A,A,...,16.422963,21.811948,5.67299,10.391988,10.159270,0.327314,1.695507e-04,40.606431,20.878742,61.485173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316,SUTTON,Jan-2024,BANSTEAD,GREATER LONDON,1.106853e+07,F,N,F,A,A,...,16.226703,27.953133,0.00000,2.649249,10.255186,0.126071,6.002093e-10,47.318465,13.030506,60.348972
317,TOWER HAMLETS,Jan-2024,LONDON,GREATER LONDON,1.081940e+08,F,N,L,A,A,...,16.663450,27.981879,0.00000,8.356507,15.295616,0.260907,0.000000e+00,49.841541,23.913031,73.754572
318,WALTHAM FOREST,Jan-2024,LONDON,GREATER LONDON,5.870112e+07,F,N,F,A,A,...,14.199792,24.431921,0.00000,6.094199,10.936386,0.151264,9.536322e-09,41.783746,17.181848,58.965594
319,WANDSWORTH,Jan-2024,LONDON,GREATER LONDON,2.927223e+08,F,N,L,A,A,...,14.453811,23.925686,0.00000,4.476196,11.080560,0.149713,0.000000e+00,44.308423,15.706469,60.014892


In [10]:

ETL_Database_Df = original_cleaned_df.copy() # Creating a copy of the cleaned DataFrame to use for the ETL process.

# Loading the saved first stage pipeline for preprocessing
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load pipeline files that come from a trusted location.
ETL_first_stage_pipeline = joblib.load('../Models/first_stage_pipeline.pkl')

# Applying the first stage of preprocessing to the ETL database DataFrame.
# Only .transform() is called, so nothing is re-fitted on the incremental data
# (presumably the pipeline was fitted before it was saved - verify).
ETL_first_stage_output = ETL_first_stage_pipeline.transform(ETL_Database_Df)

# Loading the saved second stage pipeline for further preprocessing
# (same unpickling caveat as for the first stage)
ETL_second_stage_pipeline = joblib.load('../Models/second_stage_pipeline.pkl')

# Applying the second stage of preprocessing to the output from the first stage
ETL_second_stage_output = ETL_second_stage_pipeline.transform(ETL_first_stage_output)

info_logger.info("Applied full stage pipeline")

In [11]:

# Using the function from the script to perform imputations.
# The result is bound back to ETL_second_stage_output, so re-running this cell
# on its own feeds already-imputed data through apply_imputations again.
ETL_second_stage_output = apply_imputations(ETL_second_stage_output)

# Preparing data by splitting into features and targets and performing a train-test split.
# prepare_data's return value is unpacked into seven names; ETL_X is reassigned
# in the "Append Predicted Price" cell before it is passed to the model.
ETL_second_stage_output, ETL_X, ETL_y, X_train, X_test, y_train, y_test = prepare_data(ETL_second_stage_output)

### Append Predicted Price

In [12]:

# Working on a copy so the feature subset below is taken from a frame that is
# not affected by the 'Predicted_Price' column added further down
ETL_output = ETL_second_stage_output.copy()

# Loading the final combined features DataFrame from the earlier saved excel file
final_combined_features_df = pd.read_excel('../Data/Output/final_combined_features.xlsx')

# Extracting the list of selected feature names from the 'Feature' column of that file
ETL_selected_features = final_combined_features_df['Feature'].tolist()

# Subsetting the unseen data to include only the selected features
# (this replaces the ETL_X produced by prepare_data)
ETL_X = ETL_output[ETL_selected_features]  

# Loading the pre-trained Random Forest model
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load model files that come from a trusted location.
rf_model = joblib.load('../Models/RandomForest_house_price_prediction_model.pkl')

# Predicting the target variable using the Random Forest model
predictions = rf_model.predict(ETL_X)

# Adding the predictions to the DataFrame (one prediction per row of ETL_X)
ETL_second_stage_output['Predicted_Price'] = predictions

# Displaying the predictions; the cell after this one passes them through the
# inverse of the price scaler, so they are presumably on the scaled price axis
print("Predicted Prices:")
print(predictions)

info_logger.info("Applied Price prediction model to predict price on incremental data")

Predicted Prices:
[-1.91902154e-01  4.62258262e-01 -2.97732569e-01 -5.78272797e-02
 -3.61363657e-02 -2.86150357e-01  1.20127991e+00 -2.45971435e-01
  1.44290673e-01  8.18628783e-01 -7.42279321e-03 -1.38141372e-01
 -4.41899806e-01  2.90982648e+00  2.12396331e+00  2.06903734e+00
 -1.53917496e-01  3.17242494e+00 -2.92411953e-01  1.55151184e+00
  1.25940198e+00  3.57780808e-01  8.47843639e-01  2.97289485e+00
  1.34613177e-01  3.25033036e+00  8.44634204e-01  1.78455247e+00
  5.31607998e-01 -1.86378707e-01  9.06543592e-01  8.93117083e-02
  3.56359451e-01  3.21471266e+00  1.22838291e+00  9.12970881e-02
  9.39141911e-01  8.98222269e-01  3.24286955e+00  8.72378095e-01
  2.42751044e+00 -4.27767024e-01  7.80796644e-02 -1.98721015e-02
 -1.00067548e-01 -1.06701118e-01 -4.34724707e-02  4.88563049e-02
  7.82486097e-01 -1.45104678e-01 -1.22846847e-01  1.47835657e+00
  4.65669644e-02 -1.09046944e-02  3.13098490e-02  2.59068634e-01
 -1.25589977e-01 -3.93325725e-01 -5.70818459e-03 -3.69380272e-03
  3.133

In [13]:
# Loading the pipeline used for scaling (the same file that was loaded as
# ETL_second_stage_pipeline for the second preprocessing stage)
# NOTE(review): joblib.load unpickles the file - only load trusted files.
pipeline = joblib.load('../Models/second_stage_pipeline.pkl')

# Accessing the transformer registered as 'robust_scaler_price' inside the
# 'scale_normalize' step (presumably the RobustScaler fitted on Price - verify)
price_scaler = pipeline.named_steps['scale_normalize'].named_transformers_['robust_scaler_price']

# Reshaping predictions into a single-column 2-D array for inverse_transform
predictions_scaled_reshaped = predictions.reshape(-1, 1)

# Reversing the scaling transformation and flattening back to a 1-D array
predictions_unscaled = price_scaler.inverse_transform(predictions_scaled_reshaped).flatten()

# The unscaled predictions are attached to a copy of the first-stage output
ETL_stage_output = ETL_first_stage_output.copy()

# Adding the unscaled predictions to the DataFrame
# NOTE(review): assumes `predictions` has one entry per row of
# ETL_first_stage_output, in the same order - verify that apply_imputations /
# prepare_data do not drop or reorder rows.
ETL_stage_output['Predicted_Price_Unscaled'] = predictions_unscaled

# Selecting the actual and predicted (unscaled) prices.
# NOTE: this expression is not the last statement of the cell, so its result is
# discarded and nothing is rendered.
ETL_stage_output[['Price','Predicted_Price_Unscaled']]

info_logger.info("Applied reverse scaling and appended the predicted price to original dataframe")

In [14]:

# Percentage-change columns whose missing values are filled below
null_columns = [
    'AveragePrice_PctChange',
    'DetachedPrice_PctChange',
    'SemiDetachedPrice_PctChange',
    'TerracedPrice_PctChange',
    'FlatPrice_PctChange'
]

# Two-pass group-mean imputation, from the finest grouping to the coarsest:
# first the mean within (local authority, region), then the mean within the
# region for whatever is still missing after the first pass.
for group_keys in (['Local authority code', 'Region code'], ['Region code']):
    ETL_stage_output[null_columns] = (
        ETL_stage_output
        .groupby(group_keys)[null_columns]
        .transform(lambda x: x.fillna(x.mean()))
    )


### Column renaming and Feature addition

In [15]:
# Mapping from the source column headers to the snake_case names used from here on
column_rename_map = {
    'District': 'district',
    'Town/City': 'town_city',
    'County': 'county',
    'Price': 'price',
    'Property Type': 'property_type',
    'Old/New': 'old_new',
    'Duration': 'duration',
    'PPD Category Type': 'ppd_category_type',
    'Record Status': 'record_status',
    'Region code': 'region_code',
    'Region name': 'region_name',
    'Local authority code': 'local_authority_code',
    'Local authority name': 'local_authority_name',
    'Date': 'date',
    'Month': 'month',
    'Quarter': 'quarter',
    'Year': 'year',
    'Transfer Month-Year': 'transfer_month_year',
    'Index': 'index',
    'AveragePrice': 'average_price',
    'All ages': 'all_ages',
    '0-20': 'age_0_20',
    '20-40': 'age_20_40',
    '40-60': 'age_40_60',
    '60+': 'age_60_plus',
    'Female population': 'female_population',
    'Male population': 'male_population',
    'Area (sq km)': 'area_sq_km',
    'Qualification index score': 'qualification_index_score',
    'Qualification index rank (1 to 331)': 'qualification_index_rank',
    'No qualifications': 'no_qualifications',
    'Level 1 and entry level qualifications': 'level_1_and_entry_level_qualifications',
    'Level 2 qualifications': 'level_2_qualifications',
    'Apprenticeship': 'apprenticeship',
    'Level 3 qualifications': 'level_3_qualifications',
    'Level 4 qualifications and above': 'level_4_qualifications_and_above',
    'Other qualifications': 'other_qualifications',
    'Estimated number of households with at least 1 early-years or school age child': 'est_num_households_with_child',
    'Deprivation Average Score': 'deprivation_average_score',
    'Number of those aged 16+ who are unemployed': 'num_aged_16_plus_unemployed',
    'Number of those aged 16+ in employment who are employees': 'num_aged_16_plus_employed',
    'Number of those aged 16+ in employment who are self-employed': 'num_aged_16_plus_self_employed',
    'One Bedroom Rent': 'one_bedroom_rent',
    'Two Bedrooms Rent': 'two_bedrooms_rent',
    'Three Bedrooms Rent': 'three_bedrooms_rent',
    'Four or more Bedrooms Rent': 'four_or_more_bedrooms_rent',
    'All categories Rent': 'all_categories_rent',
    'GDHI': 'gdhi',
    'Buses total': 'buses_total',
    'Diesel cars total': 'diesel_cars_total',
    'Petrol cars total': 'petrol_cars_total',
    'HGV - Motorways': 'hgv_motorways',
    'HGV total': 'hgv_total',
    'Diesel LGV total': 'diesel_lgv_total',
    'Petrol LGV total': 'petrol_lgv_total',
    'LPG LGV total': 'lpg_lgv_total',
    'Personal transport (buses, cars and motorcycles)': 'personal_transport',
    'Freight transport (HGV and LGV)': 'freight_transport',
    'Fuel consumption by all vehicles': 'fuel_consumption',
    '1m%Change': 'one_m_percent_change',
    'Annual change (%)': 'annual_change_percent',
    'Rental price (£)': 'rental_price',
    'SalesVolume': 'sales_volume',
    'DetachedPrice': 'detached_price',
    'SemiDetachedPrice': 'semi_detached_price',
    'TerracedPrice': 'terraced_price',
    'FlatPrice': 'flat_price',
    'CashPrice': 'cash_price',
    'MortgagePrice': 'mortgage_price',
    'MortgageIndex': 'mortgage_index',
    'FTBPrice': 'ftb_price',
    'FOOPrice': 'foo_price',
    'NewPrice': 'new_price',
    'OldPrice': 'old_price',
    'OldSalesVolume': 'old_sales_volume',
    'Detached_SemiDetached_Ratio': 'detached_semi_detached_ratio',
    'Detached_Terraced_Ratio': 'detached_terraced_ratio',
    'Detached_Flat_Ratio': 'detached_flat_ratio',
    'AveragePrice_PctChange': 'average_price_pct_change',
    'DetachedPrice_PctChange': 'detached_price_pct_change',
    'SemiDetachedPrice_PctChange': 'semi_detached_price_pct_change',
    'TerracedPrice_PctChange': 'terraced_price_pct_change',
    'FlatPrice_PctChange': 'flat_price_pct_change',
    'SalesVolume_log': 'sales_volume_log',
    'DetachedPrice_log': 'detached_price_log',
    'SemiDetachedPrice_log': 'semi_detached_price_log',
    'TerracedPrice_log': 'terraced_price_log',
    'FlatPrice_log': 'flat_price_log',
    'AveragePrice_log': 'average_price_log',
    'Predicted_Price_Unscaled': 'predicted_price_unscaled'
}

# Applying the renaming; headers that are absent from the frame are ignored by rename
ETL_stage_output = ETL_stage_output.rename(columns=column_rename_map)

# Appending the derived ratio features. Each one is computed from the renamed
# source columns only (none depends on another derived feature) and they are
# added in the order listed.
ETL_stage_output = ETL_stage_output.assign(
    deprivation_adjusted_gdhi=lambda frame: frame['gdhi'] / frame['deprivation_average_score'],
    gdhi_per_capita=lambda frame: frame['gdhi'] / (
        frame['age_0_20'] + frame['age_20_40'] + frame['age_40_60'] + frame['age_60_plus']
    ),
    deprivation_employment_ratio=lambda frame: (
        frame['num_aged_16_plus_employed'] + frame['num_aged_16_plus_self_employed']
    ) / frame['deprivation_average_score'],
    qualification_adjusted_employment_rate=lambda frame: (
        frame['num_aged_16_plus_employed'] / frame['qualification_index_score']
    ),
    housing_demand_indicator=lambda frame: (
        frame['rental_price'] + frame['sales_volume']
    ) / frame['area_sq_km'],
    age_dependency_ratio=lambda frame: (
        frame['age_0_20'] + frame['age_60_plus']
    ) / (frame['age_20_40'] + frame['age_40_60']),
    deprivation_reduction_potential=lambda frame: (
        frame['qualification_index_score'] / frame['deprivation_average_score']
    ),
)

info_logger.info("Column renaming and column addition completed ")

## Data Loading

In [16]:

# Database credentials.
# Each setting can be overridden through the standard libpq environment
# variables, so secrets do not have to live in the notebook; the literals are
# only fallbacks that keep the previous behaviour when no variable is set.
# FIXME(security): the fallback password is still stored in the notebook -
# export PGPASSWORD and remove the literal before sharing or committing.
DB_CONFIG = {
    'dbname': os.environ.get('PGDATABASE', 'UK Real Estate DB'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD', '123!@*qweQWE'),
    'host': os.environ.get('PGHOST', 'localhost'),
    'port': os.environ.get('PGPORT', '5432')
}

@contextmanager
def get_db_connection():
    """Yield a psycopg2 connection built from ``DB_CONFIG`` and always close it.

    The connection is closed when the ``with`` block exits, whether it finishes
    normally or raises. Nothing is committed here; callers commit themselves.
    """
    db_conn = psycopg2.connect(**DB_CONFIG)
    try:
        yield db_conn
    finally:
        # Runs on both the normal and the exceptional exit path
        db_conn.close()

def create_temp_table(cursor, table_name, schema):
    """Issue a ``CREATE TEMPORARY TABLE`` statement through ``cursor``.

    Args:
        cursor: Object exposing ``execute(sql)``.
        table_name: Name of the temporary table; interpolated into the SQL as-is.
        schema: Column definitions placed between the parentheses, as-is.
    """
    ddl_statement = "CREATE TEMPORARY TABLE {} ({});".format(table_name, schema)
    cursor.execute(ddl_statement)

def insert_data(cursor, table_name, columns, data_frame):
    """Inserts the distinct rows of the selected columns into the temporary table.

    Rows are read with ``itertuples`` instead of ``iterrows``: ``iterrows``
    builds one Series per row, which is slow and upcasts integer columns to
    float whenever every selected column is numeric. ``itertuples`` hands each
    value over as a plain Python scalar of its own column's type.

    Args:
        cursor: Object exposing ``execute(sql, params)``.
        table_name: Target table; interpolated into the SQL as-is.
        columns: List of column names, used both to select from ``data_frame``
            and as the INSERT column list.
        data_frame: DataFrame that contains every name in ``columns``.
    """
    columns_str = ', '.join(columns)
    values_str = ', '.join(['%s'] * len(columns))
    insert_query = f"INSERT INTO {table_name} ({columns_str}) VALUES ({values_str});"

    # Only one INSERT per distinct combination of the selected columns
    unique_rows = data_frame[columns].drop_duplicates()
    for row_values in unique_rows.itertuples(index=False, name=None):
        cursor.execute(insert_query, row_values)

def update_existing_records(cursor, update_query):
    """Execute ``update_query`` through ``cursor``.

    The statement is passed to ``cursor.execute`` unchanged and without bound
    parameters.
    """
    statement = update_query
    cursor.execute(statement)

def insert_new_records(cursor, insert_query):
    """Execute ``insert_query`` through ``cursor``.

    The statement is passed to ``cursor.execute`` unchanged and without bound
    parameters.
    """
    statement = insert_query
    cursor.execute(statement)

def drop_temp_table(cursor, table_name):
    """Issue ``DROP TABLE <table_name>;`` through ``cursor``.

    ``table_name`` is interpolated into the statement as-is.
    """
    drop_statement = "DROP TABLE {};".format(table_name)
    cursor.execute(drop_statement)

def load_region_dimension(incremental_df):
    """Performs incremental load for the region_dimension table.

    The distinct region columns of ``incremental_df`` are staged in the
    temporary table ``region_dimension_temp``. Rows of ``region_dimension``
    that share a ``local_authority_code`` with a staged row but differ in
    ``region_code``, ``region_name`` or ``local_authority_name`` are
    overwritten in place; staged authorities without a match are inserted.
    Everything is committed once at the end, so a failure in any step leaves
    the target table unchanged.

    Change detection uses ``IS DISTINCT FROM`` rather than ``!=``: with ``!=``
    a comparison involving NULL evaluates to NULL, so a value changing to or
    from NULL was never picked up.

    Args:
        incremental_df: DataFrame containing the columns ``region_code``,
            ``region_name``, ``local_authority_code`` and
            ``local_authority_name``.
    """
    with get_db_connection() as connection:
        # The cursor is closed as soon as the statements have run
        with connection.cursor() as cursor:
            # Creating a temporary table to hold new and updated region data
            create_temp_table(cursor, 'region_dimension_temp',
                'region_code VARCHAR(255), region_name VARCHAR(255), local_authority_code VARCHAR(255), local_authority_name VARCHAR(255)')

            # Inserting data into the temporary table
            insert_data(cursor, 'region_dimension_temp',
                        ['region_code', 'region_name', 'local_authority_code', 'local_authority_name'],
                        incremental_df)

            # Updating existing records in the region_dimension table
            update_existing_records(cursor, """
            UPDATE region_dimension
            SET
                region_code = temp.region_code,
                region_name = temp.region_name,
                local_authority_name = temp.local_authority_name
            FROM region_dimension_temp temp
            WHERE region_dimension.local_authority_code = temp.local_authority_code
            AND (
                region_dimension.region_code IS DISTINCT FROM temp.region_code OR
                region_dimension.region_name IS DISTINCT FROM temp.region_name OR
                region_dimension.local_authority_name IS DISTINCT FROM temp.local_authority_name
            );
            """)

            # Inserting new records into the region_dimension table
            # (anti-join: only authorities that have no row yet)
            insert_new_records(cursor, """
            INSERT INTO region_dimension (region_code, region_name, local_authority_code, local_authority_name)
            SELECT temp.region_code, temp.region_name, temp.local_authority_code, temp.local_authority_name
            FROM region_dimension_temp temp
            LEFT JOIN region_dimension rd ON rd.local_authority_code = temp.local_authority_code
            WHERE rd.local_authority_code IS NULL;
            """)

            # Dropping the temporary table
            drop_temp_table(cursor, 'region_dimension_temp')

        connection.commit()
        print("Incremental load for region_dimension completed.")
        info_logger.info("Testing completed for Region dimension")


def load_date_dimension(incremental_df):
    """Insert dates from ``incremental_df`` that ``date_dimension`` does not hold yet.

    The distinct date columns are staged in ``date_dimension_temp``; staged rows
    whose ``date`` has no match in ``date_dimension`` are inserted. Existing
    rows are never updated. The work is committed once at the end.

    Args:
        incremental_df: DataFrame containing the columns ``date``, ``month``,
            ``quarter``, ``year`` and ``transfer_month_year``.
    """
    staging_table = 'date_dimension_temp'
    date_columns = ['date', 'month', 'quarter', 'year', 'transfer_month_year']

    with get_db_connection() as db_conn:
        db_cursor = db_conn.cursor()

        # Anti-join: keep only staged dates that are absent from date_dimension
        insert_missing_dates_sql = """
        INSERT INTO date_dimension (date, month, quarter, year, transfer_month_year)
        SELECT temp.date, temp.month, temp.quarter, temp.year, temp.transfer_month_year
        FROM date_dimension_temp temp
        LEFT JOIN date_dimension dd ON dd.date = temp.date
        WHERE dd.date IS NULL;
        """

        # Stage -> insert -> clean up
        create_temp_table(db_cursor, staging_table,
            'date DATE, month INT, quarter INT, year INT, transfer_month_year VARCHAR(255)')
        insert_data(db_cursor, staging_table, date_columns, incremental_df)
        insert_new_records(db_cursor, insert_missing_dates_sql)
        drop_temp_table(db_cursor, staging_table)

        db_conn.commit()
        print("Incremental load for date_dimension completed.")
        info_logger.info("Testing completed for Date dimension")


def load_vehicle_dimension(incremental_df):
    """Performs incremental load for the vehicle_dimension table.

    The distinct vehicle columns of ``incremental_df`` are staged in
    ``vehicle_dimension_temp``. Rows of ``vehicle_dimension`` that share a
    ``local_authority_code`` with a staged row and differ in any staged
    measure are overwritten (``region_id`` is refreshed from
    ``region_dimension`` at the same time); staged authorities without a row
    are inserted, provided they exist in ``region_dimension``. The work is
    committed once at the end.

    Change detection uses ``IS DISTINCT FROM`` rather than ``!=``: with ``!=``
    a comparison involving NULL evaluates to NULL, so a measure changing to or
    from NULL was never picked up.

    Args:
        incremental_df: DataFrame containing ``local_authority_code`` and the
            measures ``buses_total``, ``petrol_cars_total``, ``hgv_total``,
            ``petrol_lgv_total``, ``lpg_lgv_total``, ``hgv_motorways`` and
            ``personal_transport``.
    """
    with get_db_connection() as connection:
        # The cursor is closed as soon as the statements have run
        with connection.cursor() as cursor:
            # Creating a temporary table to hold new and updated vehicle data
            create_temp_table(cursor, 'vehicle_dimension_temp',
                'local_authority_code VARCHAR(255), buses_total FLOAT, petrol_cars_total FLOAT, hgv_total FLOAT, petrol_lgv_total FLOAT, lpg_lgv_total FLOAT, hgv_motorways FLOAT, personal_transport FLOAT')

            # Inserting data into the temporary table
            insert_data(cursor, 'vehicle_dimension_temp',
                        ['local_authority_code', 'buses_total', 'petrol_cars_total', 'hgv_total', 'petrol_lgv_total', 'lpg_lgv_total', 'hgv_motorways', 'personal_transport'],
                        incremental_df)

            # Updating existing records in the vehicle_dimension table
            update_existing_records(cursor, """
            UPDATE vehicle_dimension
            SET
                region_id = rd.region_id,
                buses_total = temp.buses_total,
                petrol_cars_total = temp.petrol_cars_total,
                hgv_total = temp.hgv_total,
                petrol_lgv_total = temp.petrol_lgv_total,
                lpg_lgv_total = temp.lpg_lgv_total,
                hgv_motorways = temp.hgv_motorways,
                personal_transport = temp.personal_transport
            FROM vehicle_dimension_temp temp
            JOIN region_dimension rd ON temp.local_authority_code = rd.local_authority_code
            WHERE vehicle_dimension.local_authority_code = temp.local_authority_code
            AND (
                vehicle_dimension.buses_total IS DISTINCT FROM temp.buses_total OR
                vehicle_dimension.petrol_cars_total IS DISTINCT FROM temp.petrol_cars_total OR
                vehicle_dimension.hgv_total IS DISTINCT FROM temp.hgv_total OR
                vehicle_dimension.petrol_lgv_total IS DISTINCT FROM temp.petrol_lgv_total OR
                vehicle_dimension.lpg_lgv_total IS DISTINCT FROM temp.lpg_lgv_total OR
                vehicle_dimension.hgv_motorways IS DISTINCT FROM temp.hgv_motorways OR
                vehicle_dimension.personal_transport IS DISTINCT FROM temp.personal_transport );
            """)

            # Inserting rows for local authorities that have no vehicle_dimension row yet
            insert_new_records(cursor, """
            INSERT INTO vehicle_dimension (local_authority_code, region_id, buses_total, petrol_cars_total, hgv_total, petrol_lgv_total, lpg_lgv_total, hgv_motorways, personal_transport)
            SELECT temp.local_authority_code, rd.region_id, temp.buses_total, temp.petrol_cars_total, temp.hgv_total, temp.petrol_lgv_total, temp.lpg_lgv_total, temp.hgv_motorways, temp.personal_transport
            FROM vehicle_dimension_temp temp
            JOIN region_dimension rd ON temp.local_authority_code = rd.local_authority_code
            LEFT JOIN vehicle_dimension vd ON vd.local_authority_code = temp.local_authority_code
            WHERE vd.local_authority_code IS NULL;
            """)

            # Dropping the temporary table
            drop_temp_table(cursor, 'vehicle_dimension_temp')

        connection.commit()
        print("Incremental load for vehicle_dimension completed.")
        info_logger.info("Testing completed for Vechile dimension")


def load_rental_dimension(incremental_df):
    """Performs incremental load for the rental_dimension table.

    The table is versioned through ``start_date`` / ``end_date`` /
    ``is_current``:

    1. The distinct rental columns of ``incremental_df`` are staged in
       ``rental_dimension_temp``.
    2. Current rows whose authority matches a staged row and that differ from
       it in ``date`` or any rent column are closed (``end_date = NOW()``,
       ``is_current = FALSE``).
    3. For each authority the most recent staged row (by ``date``) is inserted
       as the new current version when the authority has no current row left,
       or when its current row differs in a rent column.

    The work is committed once at the end.

    Change detection uses ``IS DISTINCT FROM`` rather than ``!=``: with ``!=``
    a comparison involving NULL evaluates to NULL, so a value changing to or
    from NULL was never picked up.

    NOTE(review): step 2 compares the current row with every staged row of the
    authority, not only the most recent one - confirm that is intended when an
    increment carries several dates per authority.

    Args:
        incremental_df: DataFrame containing ``local_authority_code``,
            ``date``, ``rental_price``, ``one_bedroom_rent``,
            ``two_bedrooms_rent``, ``three_bedrooms_rent``,
            ``four_or_more_bedrooms_rent`` and ``all_categories_rent``.
    """
    with get_db_connection() as connection:
        # The cursor is closed as soon as the statements have run
        with connection.cursor() as cursor:
            # Creating a temporary table to hold new and updated rental data
            create_temp_table(cursor, 'rental_dimension_temp',
                'local_authority_code VARCHAR(255), date DATE, rental_price FLOAT, one_bedroom_rent FLOAT, two_bedrooms_rent FLOAT, three_bedrooms_rent FLOAT, four_or_more_bedrooms_rent FLOAT, all_categories_rent FLOAT')

            # Inserting data into the temporary table
            insert_data(cursor, 'rental_dimension_temp',
                        ['local_authority_code', 'date', 'rental_price', 'one_bedroom_rent', 'two_bedrooms_rent', 'three_bedrooms_rent', 'four_or_more_bedrooms_rent', 'all_categories_rent'],
                        incremental_df)

            # Updating existing records to set end_date and is_current status
            update_existing_records(cursor, """
            UPDATE rental_dimension rd
            SET end_date = NOW(), is_current = FALSE
            FROM rental_dimension_temp temp
            WHERE rd.local_authority_code = temp.local_authority_code
            AND rd.is_current = TRUE
            AND (
                rd.date IS DISTINCT FROM temp.date OR
                rd.rental_price IS DISTINCT FROM temp.rental_price OR
                rd.one_bedroom_rent IS DISTINCT FROM temp.one_bedroom_rent OR
                rd.two_bedrooms_rent IS DISTINCT FROM temp.two_bedrooms_rent OR
                rd.three_bedrooms_rent IS DISTINCT FROM temp.three_bedrooms_rent OR
                rd.four_or_more_bedrooms_rent IS DISTINCT FROM temp.four_or_more_bedrooms_rent OR
                rd.all_categories_rent IS DISTINCT FROM temp.all_categories_rent
            );
            """)

            # Inserting new or updated records into the rental_dimension table.
            # rn = 1 keeps only the most recent staged row per authority.
            insert_new_records(cursor, """
            INSERT INTO rental_dimension (local_authority_code, date, rental_price, one_bedroom_rent, two_bedrooms_rent, three_bedrooms_rent, four_or_more_bedrooms_rent, all_categories_rent, start_date, end_date, is_current)
            SELECT temp.local_authority_code, temp.date, temp.rental_price, temp.one_bedroom_rent, temp.two_bedrooms_rent, temp.three_bedrooms_rent, temp.four_or_more_bedrooms_rent, temp.all_categories_rent, NOW() AS start_date, NULL AS end_date, TRUE AS is_current
            FROM (
                SELECT temp.*,
                       ROW_NUMBER() OVER (PARTITION BY temp.local_authority_code ORDER BY temp.date DESC) AS rn
                FROM rental_dimension_temp temp
            ) temp
            LEFT JOIN rental_dimension rd ON rd.local_authority_code = temp.local_authority_code
            AND rd.is_current = TRUE
            WHERE (rd.local_authority_code IS NULL OR (
                    rd.rental_price IS DISTINCT FROM temp.rental_price OR
                    rd.one_bedroom_rent IS DISTINCT FROM temp.one_bedroom_rent OR
                    rd.two_bedrooms_rent IS DISTINCT FROM temp.two_bedrooms_rent OR
                    rd.three_bedrooms_rent IS DISTINCT FROM temp.three_bedrooms_rent OR
                    rd.four_or_more_bedrooms_rent IS DISTINCT FROM temp.four_or_more_bedrooms_rent OR
                    rd.all_categories_rent IS DISTINCT FROM temp.all_categories_rent
                ))
            AND temp.rn = 1;
            """)

            # Dropping the temporary table
            drop_temp_table(cursor, 'rental_dimension_temp')

        connection.commit()
        print("Incremental load for rental_dimension completed.")
        info_logger.info("Testing completed for Rental dimension")


def load_district_dimension(incremental_df):
    """Performs incremental load for the district_dimension table.

    Stages ``incremental_df`` in ``district_dimension_temp``, expires
    (``end_date = NOW()``, ``is_current = FALSE``) the current row of each
    local authority whose staged data differs, inserts the most recent staged
    row per local authority as the new current row, then drops the staging
    table and commits.

    Comparisons use ``IS DISTINCT FROM`` rather than ``!=`` so that a value
    changing to or from NULL is detected as a change (``!=`` evaluates to NULL,
    i.e. "not changed", whenever either side is NULL).

    Args:
        incremental_df: Data passed to ``insert_data``; it has to provide the
            columns local_authority_code, date, district, town_city and county.
    """
    with get_db_connection() as connection:
        cursor = connection.cursor()
        try:
            # Creating a temporary table to hold new and updated district data
            create_temp_table(cursor, 'district_dimension_temp',
                'local_authority_code VARCHAR(255), date DATE, district VARCHAR(255), town_city VARCHAR(255), county VARCHAR(255)')

            # Inserting data into the temporary table
            insert_data(cursor, 'district_dimension_temp',
                        ['local_authority_code', 'date', 'district', 'town_city', 'county'],
                        incremental_df)

            # Updating existing records to set end_date and is_current status
            update_existing_records(cursor, """
            UPDATE district_dimension dd
            SET end_date = NOW(), is_current = FALSE
            FROM district_dimension_temp temp
            WHERE dd.local_authority_code = temp.local_authority_code
            AND dd.is_current = TRUE
            AND (
                dd.date IS DISTINCT FROM temp.date OR
                dd.district IS DISTINCT FROM temp.district OR
                dd.town_city IS DISTINCT FROM temp.town_city OR
                dd.county IS DISTINCT FROM temp.county
            );
            """)

            # Inserting new or updated records into the district_dimension table.
            # Only the latest staged row per local authority (rn = 1) is inserted,
            # and only when no identical current row is left after the update above.
            insert_new_records(cursor, """
            INSERT INTO district_dimension (local_authority_code, date, district, town_city, county, start_date, end_date, is_current)
            SELECT temp.local_authority_code, temp.date, temp.district, temp.town_city, temp.county, NOW() AS start_date, NULL AS end_date, TRUE AS is_current
            FROM (
                SELECT temp.*,
                       ROW_NUMBER() OVER (PARTITION BY temp.local_authority_code ORDER BY temp.date DESC) AS rn
                FROM district_dimension_temp temp
            ) temp
            LEFT JOIN district_dimension dd ON dd.local_authority_code = temp.local_authority_code
            AND dd.is_current = TRUE
            WHERE (dd.local_authority_code IS NULL OR (
                    dd.district IS DISTINCT FROM temp.district OR
                    dd.town_city IS DISTINCT FROM temp.town_city OR
                    dd.county IS DISTINCT FROM temp.county
                ))
            AND temp.rn = 1;
            """)

            # Dropping the temporary table
            drop_temp_table(cursor, 'district_dimension_temp')
            connection.commit()
        finally:
            # Release the cursor even when one of the statements fails
            cursor.close()

        print("Incremental load for district_dimension completed.")
        info_logger.info("Testing completed for District dimension")


def load_property_type_dimension(incremental_df):
    """Performs incremental load for the property_type_dimension table.

    Stages ``incremental_df`` in ``property_type_dimension_temp``, expires
    (``end_date = NOW()``, ``is_current = FALSE``) the current rows of each
    local authority whose staged data differs, inserts the most recent staged
    row per (local_authority_code, property_type, duration) as a new current
    row, then drops the staging table and commits.

    Change detection compares every tracked column with ``IS DISTINCT FROM``.
    The previous version tested ``property_type`` and ``duration`` with ``=``,
    which expired a row precisely when those attributes were unchanged, and
    used ``!=`` elsewhere, which never reports a change to or from NULL.

    Args:
        incremental_df: Data passed to ``insert_data``; it has to provide the
            columns local_authority_code, date, property_type, duration,
            detached_price, semi_detached_price, terraced_price and flat_price.
    """
    with get_db_connection() as connection:
        cursor = connection.cursor()
        try:
            # Creating a temporary table to hold new and updated property type data
            create_temp_table(cursor, 'property_type_dimension_temp',
                'local_authority_code VARCHAR(255), date DATE, property_type VARCHAR(255), duration VARCHAR(255), detached_price FLOAT, semi_detached_price FLOAT, terraced_price FLOAT, flat_price FLOAT')

            # Inserting data into the temporary table
            insert_data(cursor, 'property_type_dimension_temp',
                        ['local_authority_code', 'date', 'property_type', 'duration', 'detached_price', 'semi_detached_price', 'terraced_price', 'flat_price'],
                        incremental_df)

            # Updating existing records to set end_date and is_current status
            update_existing_records(cursor, """
            UPDATE property_type_dimension pd
            SET end_date = NOW(), is_current = FALSE
            FROM property_type_dimension_temp temp
            WHERE pd.local_authority_code = temp.local_authority_code
            AND pd.is_current = TRUE
            AND (
                pd.property_type IS DISTINCT FROM temp.property_type OR
                pd.duration IS DISTINCT FROM temp.duration OR
                pd.date IS DISTINCT FROM temp.date OR
                pd.detached_price IS DISTINCT FROM temp.detached_price OR
                pd.semi_detached_price IS DISTINCT FROM temp.semi_detached_price OR
                pd.terraced_price IS DISTINCT FROM temp.terraced_price OR
                pd.flat_price IS DISTINCT FROM temp.flat_price
            );
            """)

            # Inserting new or updated records into the property_type_dimension table.
            # Only the latest staged row per (local authority, property type, duration)
            # is inserted, and only when no identical current row is left after the
            # update above.
            insert_new_records(cursor, """
            INSERT INTO property_type_dimension (local_authority_code, date, property_type, duration, detached_price, semi_detached_price, terraced_price, flat_price, start_date, end_date, is_current)
            SELECT temp.local_authority_code, temp.date, temp.property_type, temp.duration, temp.detached_price, temp.semi_detached_price, temp.terraced_price, temp.flat_price, NOW() AS start_date, NULL AS end_date, TRUE AS is_current
            FROM (
                SELECT temp.*,
                       ROW_NUMBER() OVER (PARTITION BY temp.local_authority_code, temp.property_type, temp.duration ORDER BY temp.date DESC) AS rn
                FROM property_type_dimension_temp temp
            ) temp
            LEFT JOIN property_type_dimension pd ON pd.local_authority_code = temp.local_authority_code
            AND pd.is_current = TRUE
            WHERE (pd.local_authority_code IS NULL OR (
                    pd.property_type IS DISTINCT FROM temp.property_type OR
                    pd.duration IS DISTINCT FROM temp.duration OR
                    pd.detached_price IS DISTINCT FROM temp.detached_price OR
                    pd.semi_detached_price IS DISTINCT FROM temp.semi_detached_price OR
                    pd.terraced_price IS DISTINCT FROM temp.terraced_price OR
                    pd.flat_price IS DISTINCT FROM temp.flat_price
                ))
            AND temp.rn = 1;
            """)

            # Dropping the temporary table
            drop_temp_table(cursor, 'property_type_dimension_temp')
            connection.commit()
        finally:
            # Release the cursor even when one of the statements fails
            cursor.close()

        print("Incremental load for property_type_dimension completed.")
        info_logger.info("Testing completed for Property type dimension")


def load_education_employment_dimension(incremental_df):
    """Performs incremental load for the education_employment_dimension table.

    Stages ``incremental_df`` in ``education_employment_dimension_temp``,
    expires (``end_date = NOW()``, ``is_current = FALSE``) the current row of
    each local authority whose staged data differs, inserts the most recent
    staged row per local authority as the new current row, then drops the
    staging table and commits.

    Comparisons use ``IS DISTINCT FROM`` rather than ``!=`` so that a measure
    changing to or from NULL is detected as a change (``!=`` evaluates to NULL,
    i.e. "not changed", whenever either side is NULL).

    Args:
        incremental_df: Data passed to ``insert_data``; it has to provide
            local_authority_code, date and the qualification / employment /
            deprivation columns listed in the staging-table definition below.
    """
    with get_db_connection() as connection:
        cursor = connection.cursor()
        try:
            # Creating a temporary table to hold new and updated education and employment data
            create_temp_table(cursor, 'education_employment_dimension_temp',
                'local_authority_code VARCHAR(255), date DATE, qualification_index_score FLOAT, qualification_index_rank FLOAT, no_qualifications FLOAT, level_1_and_entry_level_qualifications FLOAT, level_2_qualifications FLOAT, level_3_qualifications FLOAT, apprenticeship FLOAT, level_4_qualifications_and_above FLOAT, other_qualifications FLOAT, num_aged_16_plus_unemployed FLOAT, num_aged_16_plus_employed FLOAT, num_aged_16_plus_self_employed FLOAT, deprivation_average_score FLOAT, deprivation_employment_ratio FLOAT, qualification_adjusted_employment_rate FLOAT')

            # Inserting data into the temporary table
            insert_data(cursor, 'education_employment_dimension_temp',
                        ['local_authority_code', 'date', 'qualification_index_score', 'qualification_index_rank', 'no_qualifications', 'level_1_and_entry_level_qualifications', 'level_2_qualifications', 'level_3_qualifications', 'apprenticeship', 'level_4_qualifications_and_above', 'other_qualifications', 'num_aged_16_plus_unemployed', 'num_aged_16_plus_employed', 'num_aged_16_plus_self_employed', 'deprivation_average_score', 'deprivation_employment_ratio', 'qualification_adjusted_employment_rate'],
                        incremental_df)

            # Updating existing records to set end_date and is_current status
            update_existing_records(cursor, """
            UPDATE education_employment_dimension ed
            SET end_date = NOW(), is_current = FALSE
            FROM education_employment_dimension_temp temp
            WHERE ed.local_authority_code = temp.local_authority_code
            AND ed.is_current = TRUE
            AND (
                ed.date IS DISTINCT FROM temp.date OR
                ed.qualification_index_score IS DISTINCT FROM temp.qualification_index_score OR
                ed.qualification_index_rank IS DISTINCT FROM temp.qualification_index_rank OR
                ed.no_qualifications IS DISTINCT FROM temp.no_qualifications OR
                ed.level_1_and_entry_level_qualifications IS DISTINCT FROM temp.level_1_and_entry_level_qualifications OR
                ed.level_2_qualifications IS DISTINCT FROM temp.level_2_qualifications OR
                ed.level_3_qualifications IS DISTINCT FROM temp.level_3_qualifications OR
                ed.apprenticeship IS DISTINCT FROM temp.apprenticeship OR
                ed.level_4_qualifications_and_above IS DISTINCT FROM temp.level_4_qualifications_and_above OR
                ed.other_qualifications IS DISTINCT FROM temp.other_qualifications OR
                ed.num_aged_16_plus_unemployed IS DISTINCT FROM temp.num_aged_16_plus_unemployed OR
                ed.num_aged_16_plus_employed IS DISTINCT FROM temp.num_aged_16_plus_employed OR
                ed.num_aged_16_plus_self_employed IS DISTINCT FROM temp.num_aged_16_plus_self_employed OR
                ed.deprivation_average_score IS DISTINCT FROM temp.deprivation_average_score OR
                ed.deprivation_employment_ratio IS DISTINCT FROM temp.deprivation_employment_ratio OR
                ed.qualification_adjusted_employment_rate IS DISTINCT FROM temp.qualification_adjusted_employment_rate
            );
            """)

            # Inserting new or updated records into the education_employment_dimension table.
            # Only the latest staged row per local authority (rn = 1) is inserted,
            # and only when no identical current row is left after the update above.
            insert_new_records(cursor, """
            INSERT INTO education_employment_dimension (local_authority_code, date, qualification_index_score,
                qualification_index_rank, no_qualifications, level_1_and_entry_level_qualifications, level_2_qualifications,
                level_3_qualifications, apprenticeship, level_4_qualifications_and_above, other_qualifications,
                num_aged_16_plus_unemployed, num_aged_16_plus_employed, num_aged_16_plus_self_employed,
                deprivation_average_score, deprivation_employment_ratio, qualification_adjusted_employment_rate,
                start_date, end_date, is_current)
            SELECT temp.local_authority_code, temp.date, temp.qualification_index_score, temp.qualification_index_rank,
                temp.no_qualifications, temp.level_1_and_entry_level_qualifications, temp.level_2_qualifications,
                temp.level_3_qualifications, temp.apprenticeship, temp.level_4_qualifications_and_above, temp.other_qualifications,
                temp.num_aged_16_plus_unemployed, temp.num_aged_16_plus_employed, temp.num_aged_16_plus_self_employed,
                temp.deprivation_average_score, temp.deprivation_employment_ratio, temp.qualification_adjusted_employment_rate,
                NOW() AS start_date, NULL AS end_date, TRUE AS is_current
            FROM (
                SELECT temp.*,
                       ROW_NUMBER() OVER (PARTITION BY temp.local_authority_code ORDER BY temp.date DESC) AS rn
                FROM education_employment_dimension_temp temp
            ) temp
            LEFT JOIN education_employment_dimension ed ON ed.local_authority_code = temp.local_authority_code
            AND ed.is_current = TRUE
            WHERE (ed.local_authority_code IS NULL OR (
                    ed.qualification_index_score IS DISTINCT FROM temp.qualification_index_score OR
                    ed.qualification_index_rank IS DISTINCT FROM temp.qualification_index_rank OR
                    ed.no_qualifications IS DISTINCT FROM temp.no_qualifications OR
                    ed.level_1_and_entry_level_qualifications IS DISTINCT FROM temp.level_1_and_entry_level_qualifications OR
                    ed.level_2_qualifications IS DISTINCT FROM temp.level_2_qualifications OR
                    ed.level_3_qualifications IS DISTINCT FROM temp.level_3_qualifications OR
                    ed.apprenticeship IS DISTINCT FROM temp.apprenticeship OR
                    ed.level_4_qualifications_and_above IS DISTINCT FROM temp.level_4_qualifications_and_above OR
                    ed.other_qualifications IS DISTINCT FROM temp.other_qualifications OR
                    ed.num_aged_16_plus_unemployed IS DISTINCT FROM temp.num_aged_16_plus_unemployed OR
                    ed.num_aged_16_plus_employed IS DISTINCT FROM temp.num_aged_16_plus_employed OR
                    ed.num_aged_16_plus_self_employed IS DISTINCT FROM temp.num_aged_16_plus_self_employed OR
                    ed.deprivation_average_score IS DISTINCT FROM temp.deprivation_average_score OR
                    ed.deprivation_employment_ratio IS DISTINCT FROM temp.deprivation_employment_ratio OR
                    ed.qualification_adjusted_employment_rate IS DISTINCT FROM temp.qualification_adjusted_employment_rate
                ))
            AND temp.rn = 1;
            """)

            # Dropping the temporary table
            drop_temp_table(cursor, 'education_employment_dimension_temp')
            connection.commit()
        finally:
            # Release the cursor even when one of the statements fails
            cursor.close()

        print("Incremental load for education_employment_dimension completed.")
        info_logger.info("Testing completed for Education employment dimension")


def load_demographics_dimension(incremental_df):
    """Performs incremental load for the demographics_dimension table.

    Stages ``incremental_df`` in ``demographics_dimension_temp``, expires
    (``end_date = NOW()``, ``is_current = FALSE``) the current row of each
    local authority whose staged data differs, inserts the most recent staged
    row per local authority as the new current row, then drops the staging
    table and commits.

    Comparisons use ``IS DISTINCT FROM`` rather than ``!=`` so that a measure
    changing to or from NULL is detected as a change (``!=`` evaluates to NULL,
    i.e. "not changed", whenever either side is NULL).

    Args:
        incremental_df: Data passed to ``insert_data``; it has to provide
            local_authority_code, date and the area / age-band / population
            columns listed in the staging-table definition below.
    """
    with get_db_connection() as connection:
        cursor = connection.cursor()
        try:
            # Creating a temporary table to hold new and updated demographic data
            create_temp_table(cursor, 'demographics_dimension_temp',
                'local_authority_code VARCHAR(255), date DATE, area_sq_km FLOAT, age_0_20 FLOAT, age_20_40 FLOAT, age_40_60 FLOAT, age_60_plus FLOAT, female_population FLOAT, all_ages FLOAT, male_population FLOAT, age_dependency_ratio FLOAT, est_num_households_with_child FLOAT')

            # Inserting data into the temporary table
            insert_data(cursor, 'demographics_dimension_temp',
                        ['local_authority_code', 'date', 'area_sq_km', 'age_0_20', 'age_20_40', 'age_40_60', 'age_60_plus', 'female_population', 'all_ages', 'male_population', 'age_dependency_ratio', 'est_num_households_with_child'],
                        incremental_df)

            # Updating existing records to set end_date and is_current status
            update_existing_records(cursor, """
            UPDATE demographics_dimension dd
            SET end_date = NOW(), is_current = FALSE
            FROM demographics_dimension_temp temp
            WHERE dd.local_authority_code = temp.local_authority_code
            AND dd.is_current = TRUE
            AND (
                dd.date IS DISTINCT FROM temp.date OR
                dd.area_sq_km IS DISTINCT FROM temp.area_sq_km OR
                dd.age_0_20 IS DISTINCT FROM temp.age_0_20 OR
                dd.age_20_40 IS DISTINCT FROM temp.age_20_40 OR
                dd.age_40_60 IS DISTINCT FROM temp.age_40_60 OR
                dd.age_60_plus IS DISTINCT FROM temp.age_60_plus OR
                dd.female_population IS DISTINCT FROM temp.female_population OR
                dd.all_ages IS DISTINCT FROM temp.all_ages OR
                dd.male_population IS DISTINCT FROM temp.male_population OR
                dd.age_dependency_ratio IS DISTINCT FROM temp.age_dependency_ratio OR
                dd.est_num_households_with_child IS DISTINCT FROM temp.est_num_households_with_child
            );
            """)

            # Inserting new or updated records into the demographics_dimension table.
            # Only the latest staged row per local authority (rn = 1) is inserted,
            # and only when no identical current row is left after the update above.
            insert_new_records(cursor, """
            INSERT INTO demographics_dimension (local_authority_code, date, area_sq_km, age_0_20, age_20_40, age_40_60, age_60_plus, female_population, all_ages, male_population, age_dependency_ratio, est_num_households_with_child, start_date, end_date, is_current)
            SELECT temp.local_authority_code, temp.date, temp.area_sq_km, temp.age_0_20, temp.age_20_40, temp.age_40_60, temp.age_60_plus, temp.female_population, temp.all_ages, temp.male_population, temp.age_dependency_ratio, temp.est_num_households_with_child, NOW() AS start_date, NULL AS end_date, TRUE AS is_current
            FROM (
                SELECT temp.*,
                       ROW_NUMBER() OVER (PARTITION BY temp.local_authority_code ORDER BY temp.date DESC) AS rn
                FROM demographics_dimension_temp temp
            ) temp
            LEFT JOIN demographics_dimension dd ON dd.local_authority_code = temp.local_authority_code
            AND dd.is_current = TRUE
            WHERE (dd.local_authority_code IS NULL OR (
                    dd.area_sq_km IS DISTINCT FROM temp.area_sq_km OR
                    dd.age_0_20 IS DISTINCT FROM temp.age_0_20 OR
                    dd.age_20_40 IS DISTINCT FROM temp.age_20_40 OR
                    dd.age_40_60 IS DISTINCT FROM temp.age_40_60 OR
                    dd.age_60_plus IS DISTINCT FROM temp.age_60_plus OR
                    dd.female_population IS DISTINCT FROM temp.female_population OR
                    dd.all_ages IS DISTINCT FROM temp.all_ages OR
                    dd.male_population IS DISTINCT FROM temp.male_population OR
                    dd.age_dependency_ratio IS DISTINCT FROM temp.age_dependency_ratio OR
                    dd.est_num_households_with_child IS DISTINCT FROM temp.est_num_households_with_child
                ))
            AND temp.rn = 1;
            """)

            # Dropping the temporary table
            drop_temp_table(cursor, 'demographics_dimension_temp')
            connection.commit()
        finally:
            # Release the cursor even when one of the statements fails
            cursor.close()

        print("Incremental load for demographics_dimension completed.")
        info_logger.info("Testing completed for Demographics dimension")


def load_sales_transaction_fact(incremental_df):
    """Performs incremental load for the sales_transactions_fact table.

    Stages ``incremental_df`` in ``sales_transactions_fact_temp``, expires
    (``end_date = NOW()``, ``is_current = FALSE``) the current fact rows whose
    measures changed, inserts a new current row for every staged
    (local_authority_code, date) that has no current fact row, then drops the
    staging table and commits.

    A fact row is matched to its staged row on (local_authority_code, date) --
    the same key the insert's anti-join uses. The previous version OR-ed
    ``stf.date = temp.date`` into the change list, which expired every current
    fact on a staged date even when nothing changed, and compared facts on
    other dates against unrelated staged rows, expiring them with no
    replacement. Measures are compared with ``IS DISTINCT FROM`` so that a
    change to or from NULL is detected.

    Args:
        incremental_df: Data passed to ``insert_data``; it has to provide
            local_authority_code, date and the measure columns listed in the
            staging-table definition below.
    """
    with get_db_connection() as connection:
        cursor = connection.cursor()
        try:
            # Creating a temporary table to hold new and updated sales transaction data
            create_temp_table(cursor, 'sales_transactions_fact_temp',
                'local_authority_code VARCHAR(255), date DATE, price NUMERIC, average_price FLOAT, predicted_price_unscaled FLOAT, index FLOAT, average_price_pct_change FLOAT, annual_change_percent FLOAT, new_price FLOAT, old_price FLOAT, sales_volume FLOAT, sales_volume_log FLOAT, old_sales_volume FLOAT, detached_flat_ratio FLOAT, detached_terraced_ratio FLOAT, semi_detached_price_pct_change FLOAT, detached_semi_detached_ratio FLOAT, detached_price_log FLOAT, semi_detached_price_log FLOAT, flat_price_log FLOAT, terraced_price_pct_change FLOAT, terraced_price_log FLOAT, gdhi FLOAT, deprivation_adjusted_gdhi FLOAT, gdhi_per_capita FLOAT, foo_price FLOAT, cash_price FLOAT, mortgage_price FLOAT, housing_demand_indicator FLOAT, deprivation_reduction_potential FLOAT, flat_price_pct_change FLOAT, detached_price_pct_change FLOAT, average_price_log FLOAT, ftb_price FLOAT')

            # Inserting data into the temporary table
            insert_data(cursor, 'sales_transactions_fact_temp',
                        ['local_authority_code', 'date', 'price', 'average_price', 'predicted_price_unscaled', 'index', 'average_price_pct_change', 'annual_change_percent', 'new_price', 'old_price', 'sales_volume', 'sales_volume_log', 'old_sales_volume', 'detached_flat_ratio', 'detached_terraced_ratio', 'semi_detached_price_pct_change', 'detached_semi_detached_ratio', 'detached_price_log', 'semi_detached_price_log', 'flat_price_log', 'terraced_price_pct_change', 'terraced_price_log', 'gdhi', 'deprivation_adjusted_gdhi', 'gdhi_per_capita', 'foo_price', 'cash_price', 'mortgage_price', 'housing_demand_indicator', 'deprivation_reduction_potential', 'flat_price_pct_change', 'detached_price_pct_change', 'average_price_log', 'ftb_price'],
                        incremental_df)

            # Expiring current fact rows whose measures differ from the staged row
            # for the same local authority and date
            update_existing_records(cursor, """
            UPDATE sales_transactions_fact AS stf
            SET end_date = NOW(), is_current = FALSE
            FROM sales_transactions_fact_temp AS temp
            WHERE stf.local_authority_code = temp.local_authority_code
            AND stf.date = temp.date
            AND stf.is_current = TRUE
            AND (
                stf.price IS DISTINCT FROM temp.price OR
                stf.average_price IS DISTINCT FROM temp.average_price OR
                stf.predicted_price_unscaled IS DISTINCT FROM temp.predicted_price_unscaled OR
                stf.index IS DISTINCT FROM temp.index OR
                stf.average_price_pct_change IS DISTINCT FROM temp.average_price_pct_change OR
                stf.annual_change_percent IS DISTINCT FROM temp.annual_change_percent OR
                stf.new_price IS DISTINCT FROM temp.new_price OR
                stf.old_price IS DISTINCT FROM temp.old_price OR
                stf.sales_volume IS DISTINCT FROM temp.sales_volume OR
                stf.sales_volume_log IS DISTINCT FROM temp.sales_volume_log OR
                stf.old_sales_volume IS DISTINCT FROM temp.old_sales_volume OR
                stf.detached_flat_ratio IS DISTINCT FROM temp.detached_flat_ratio OR
                stf.detached_terraced_ratio IS DISTINCT FROM temp.detached_terraced_ratio OR
                stf.semi_detached_price_pct_change IS DISTINCT FROM temp.semi_detached_price_pct_change OR
                stf.detached_semi_detached_ratio IS DISTINCT FROM temp.detached_semi_detached_ratio OR
                stf.detached_price_log IS DISTINCT FROM temp.detached_price_log OR
                stf.semi_detached_price_log IS DISTINCT FROM temp.semi_detached_price_log OR
                stf.flat_price_log IS DISTINCT FROM temp.flat_price_log OR
                stf.terraced_price_pct_change IS DISTINCT FROM temp.terraced_price_pct_change OR
                stf.terraced_price_log IS DISTINCT FROM temp.terraced_price_log OR
                stf.gdhi IS DISTINCT FROM temp.gdhi OR
                stf.deprivation_adjusted_gdhi IS DISTINCT FROM temp.deprivation_adjusted_gdhi OR
                stf.gdhi_per_capita IS DISTINCT FROM temp.gdhi_per_capita OR
                stf.foo_price IS DISTINCT FROM temp.foo_price OR
                stf.cash_price IS DISTINCT FROM temp.cash_price OR
                stf.mortgage_price IS DISTINCT FROM temp.mortgage_price OR
                stf.housing_demand_indicator IS DISTINCT FROM temp.housing_demand_indicator OR
                stf.deprivation_reduction_potential IS DISTINCT FROM temp.deprivation_reduction_potential OR
                stf.flat_price_pct_change IS DISTINCT FROM temp.flat_price_pct_change OR
                stf.detached_price_pct_change IS DISTINCT FROM temp.detached_price_pct_change OR
                stf.average_price_log IS DISTINCT FROM temp.average_price_log OR
                stf.ftb_price IS DISTINCT FROM temp.ftb_price
            );
            """)

            # Inserting new records with current status as active and end_date as NULL.
            # The LEFT JOIN ... WHERE stf.sales_id IS NULL anti-join skips staged rows
            # that still have a current fact row for the same local authority and date.
            # NOTE(review): the dimension joins are not restricted to current
            # dimension versions, so a dimension holding several rows for the same
            # join key would multiply the inserted facts -- TODO confirm.
            insert_new_records(cursor, """
            INSERT INTO sales_transactions_fact (
                local_authority_code, date, district_id, region_id, property_type_id, vehicle_id,
                rental_id, demographics_id, education_employment_id, date_key, price, average_price,
                predicted_price_unscaled, index, average_price_pct_change, annual_change_percent,
                new_price, old_price, sales_volume, sales_volume_log,
                old_sales_volume, detached_flat_ratio, detached_terraced_ratio, semi_detached_price_pct_change,
                detached_semi_detached_ratio, detached_price_log, semi_detached_price_log,
                flat_price_log, terraced_price_pct_change, terraced_price_log, gdhi,
                deprivation_adjusted_gdhi, gdhi_per_capita, foo_price, cash_price, mortgage_price,
                housing_demand_indicator, deprivation_reduction_potential, flat_price_pct_change, detached_price_pct_change, average_price_log, ftb_price, start_date, end_date, is_current
            )
            SELECT
                temp.local_authority_code, temp.date,
                dd.district_id, rd.region_id, ptd.property_type_id, vd.vehicle_id,
                rentald.rental_id, ddemo.demographics_id, eed.education_employment_id, d.date_key,
                temp.price, temp.average_price, temp.predicted_price_unscaled, temp.index,
                temp.average_price_pct_change, temp.annual_change_percent,
                temp.new_price, temp.old_price, temp.sales_volume, temp.sales_volume_log,
                temp.old_sales_volume, temp.detached_flat_ratio, temp.detached_terraced_ratio,
                temp.semi_detached_price_pct_change, temp.detached_semi_detached_ratio,
                temp.detached_price_log, temp.semi_detached_price_log, temp.flat_price_log,
                temp.terraced_price_pct_change, temp.terraced_price_log, temp.gdhi,
                temp.deprivation_adjusted_gdhi, temp.gdhi_per_capita, temp.foo_price, temp.cash_price,
                temp.mortgage_price, temp.housing_demand_indicator, temp.deprivation_reduction_potential,
                temp.flat_price_pct_change, temp.detached_price_pct_change, temp.average_price_log, temp.ftb_price,
                NOW() AS start_date, NULL AS end_date, TRUE AS is_current
            FROM
                sales_transactions_fact_temp temp
            JOIN
                district_dimension dd ON dd.local_authority_code = temp.local_authority_code AND dd.date = temp.date
            JOIN
                region_dimension rd ON rd.local_authority_code = temp.local_authority_code
            JOIN
                property_type_dimension ptd ON ptd.local_authority_code = temp.local_authority_code AND ptd.date = temp.date
            JOIN
                vehicle_dimension vd ON vd.local_authority_code = temp.local_authority_code
            JOIN
                rental_dimension rentald ON rentald.local_authority_code = temp.local_authority_code AND rentald.date = temp.date
            JOIN
                demographics_dimension ddemo ON ddemo.local_authority_code = temp.local_authority_code AND ddemo.date = temp.date
            JOIN
                education_employment_dimension eed ON eed.local_authority_code = temp.local_authority_code AND eed.date = temp.date
            JOIN
                date_dimension d ON d.date = temp.date
            LEFT JOIN
                sales_transactions_fact stf ON stf.local_authority_code = temp.local_authority_code AND stf.date = temp.date AND stf.is_current = TRUE
            WHERE
                stf.sales_id IS NULL;
            """)

            # Dropping the temporary table
            drop_temp_table(cursor, 'sales_transactions_fact_temp')

            # Commiting changes to the database
            connection.commit()
        finally:
            # Release the cursor even when one of the statements fails
            cursor.close()

        print("Incremental load for sales_transactions_fact completed.")
        info_logger.info("Testing completed for Sales Transaction Fact dimension")




In [17]:

def main():
    """
    Run the incremental load for every dimension table and then the fact table.

    Each loader is called with the module-level `incremental_load_df`. The fact
    loader goes last because its insert joins against the dimension tables.
    """
    # Loaders in execution order: the eight dimensions first, the fact table last
    loaders_in_order = (
        load_region_dimension,
        load_date_dimension,
        load_vehicle_dimension,
        load_rental_dimension,
        load_district_dimension,
        load_property_type_dimension,
        load_education_employment_dimension,
        load_demographics_dimension,
        load_sales_transaction_fact,
    )

    for run_loader in loaders_in_order:
        run_loader(incremental_load_df)

    print("All dimensions and facts have been incrementally loaded successfully.")

# Entry point for script execution: the guard passes when this code runs as the
# top-level module (the captured output below shows it does when the cell runs).
if __name__ == "__main__":
    
    # main() reads the module-level name `incremental_load_df`, so it must be bound
    # before the call. ETL_stage_output is defined outside this section (presumably
    # the DataFrame produced by the ETL stage); a copy is taken rather than the
    # original object.
    incremental_load_df = ETL_stage_output.copy()
    # Executing main function
    main()


Incremental load for region_dimension completed.
Incremental load for date_dimension completed.
Incremental load for vehicle_dimension completed.


Incremental load for rental_dimension completed.
Incremental load for district_dimension completed.
Incremental load for property_type_dimension completed.


Incremental load for education_employment_dimension completed.
Incremental load for demographics_dimension completed.


Incremental load for sales_transactions_fact completed.
All dimensions and facts have been incrementally loaded successfully.


In [18]:
##incremental call
# Stamp the `metadata` table with the time of this incremental run.
#
# The connection comes from the notebook's get_db_connection() helper, as in the
# load_* functions, instead of a second psycopg2.connect() call carrying its own
# copy of the database password.
# NOTE(review): assumes get_db_connection() targets the database that holds
# `metadata` and closes the connection on exit -- confirm against its definition.

# Listing the target tables whose metadata rows are refreshed
tables_metadata = [
    {"table_name": "region_dimension"},
    {"table_name": "date_dimension"},
    {"table_name": "sales_transactions_fact"},
    {"table_name": "vehicle_dimension"},
    {"table_name": "rental_dimension"},
    {"table_name": "demographics_dimension"},
    {"table_name": "education_employment_dimension"},
    {"table_name": "property_type_dimension"},
    {"table_name": "district_dimension"}
]

# Setting the extraction timestamp as the current time
last_extracted_date = datetime.now()

# Parameterised statement, built once and reused for every table
update_metadata_query = """
    UPDATE metadata
    SET last_extracted_date = %s,
        last_modified_date = %s
    WHERE table_name = %s;
    """

with get_db_connection() as connection:
    cursor = connection.cursor()
    try:
        # Updating metadata records
        for table in tables_metadata:
            cursor.execute(update_metadata_query, (
                last_extracted_date,
                last_extracted_date,
                table['table_name']
            ))

        # Committing the transaction
        connection.commit()
    finally:
        # Release the cursor even when an update fails
        cursor.close()

print("Metadata for all tables updated successfully.")
info_logger.info("Metadata for all tables updated successfully")



Metadata for all tables updated successfully.


In [19]:

from urllib.parse import quote_plus  # only this cell needs it

# Connection settings for the staging load. Every value can be overridden with an
# environment variable; the literals are the previous hard-coded values, kept as
# fallbacks so the notebook still runs unchanged.
# NOTE(review): a password literal in a notebook is a credential leak -- provide
# DB_PASSWORD through the environment and delete the fallback once that is in place.
db_user = os.environ.get('DB_USER', 'postgres')
# The raw password is percent-encoded here, because '@', '!' and similar characters
# are not valid inside the password part of a connection URL.
db_password = quote_plus(os.environ.get('DB_PASSWORD', '123!@*qweQWE'))
db_host = os.environ.get('DB_HOST', 'localhost')
db_port = os.environ.get('DB_PORT', '5432')
db_name = os.environ.get('DB_NAME', 'UK Real Estate DB')

# Creating the database connection
engine = create_engine(f'postgresql+psycopg2://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}')

# Loading the incremental DataFrame into a staging table in the database.
# if_exists='replace' drops and recreates the table on every run; the call stays
# the last expression of the cell so its return value is still displayed.
incremental_load_df.to_sql('incremental_load_etl_source_data', engine, index=False, if_exists='replace')

292