In [1]:
import numpy as np  # Importing NumPy is used for numerical operations
import pandas as pd  # Importing Pandas provides powerful data structures for data analysis

# sklearn's import statements for machine learning models and utilities
from sklearn.ensemble import RandomForestRegressor  # Importing RandomForest for regression tasks
from sklearn.base import BaseEstimator, TransformerMixin  # Importing Base classes for custom transformers
from sklearn.pipeline import Pipeline, FeatureUnion  # Importing Pipeline and FeatureUnion for creating ML workflows
from sklearn.compose import ColumnTransformer  # Importing ColumnTransformer for applying transformers to specific columns
from sklearn.preprocessing import (  # Importing Preprocessing methods for data transformation
    RobustScaler,  # RobustScaler for scaling features robust to outliers
    StandardScaler,  # StandardScaler for standardizing features
    OrdinalEncoder,  # OrdinalEncoder for encoding categorical features
    FunctionTransformer  # FunctionTransformer for custom data transformations
)
from sklearn.model_selection import train_test_split  # Splitting data into training and testing sets
from sklearn.impute import SimpleImputer  # Importing SimpleImputer for handling missing values

import logging  # Importing logging module for event tracking and debugging
import os  # Importing OS module for interacting with the operating system
import joblib  # Importing Joblib for saving and loading models efficiently
import warnings  # Importing Warnings module to control warning messages

# Suppress every warning category for the rest of the session.
# NOTE(review): this also silences deprecation notices and pandas chained-assignment
# warnings raised by later cells - consider narrowing the filter.
warnings.filterwarnings("ignore")

import sys  # Gives access to sys.path so the project scripts can be imported
sys.path.append('../Scripts/')  # Relative entry: resolved against the kernel's working directory

# importing specific custom scripts for data transformation and preparation
from data_transformations_2 import (
    group_mean_imputer1,
    group_mean_imputer2,
    feature_creation,
    cap_floor_transformer,
    encoding_transformer,
    robust_scaler_price,
    standard_scaler,
    scaling_normalizing_transformer,
    desired_column_order,
    GroupMeanImputer,
    FeatureCreator,
    EncodingWithNames,
    ColumnTransformerDf,
    ColumnOrderTransformer,
    drop_columns1,
    cap_floor,
    cap_floor_func,
    drop_columns2,
    feature_manipulation,
    handle_infinite_values
)

from data_transformations_3 import apply_imputations, prepare_data  # Importing functions for data preparation


In [2]:

notebook_name = 'Data Transformation'

# Path of the INFO log file written by this notebook
info_log_path = f'../Logs/info/{notebook_name}_info.log'

# Creating the log directory if it doesn't exist
os.makedirs(os.path.dirname(info_log_path), exist_ok=True)

# Clearing any previous handlers on the root logger if re-running this setup
logger = logging.getLogger()
while logger.handlers:
    logger.handlers.pop()

# Named logger used by every cell of this notebook
info_logger = logging.getLogger('info_logger')

# The file handler is attached to 'info_logger', not to the root logger, so the
# root clean-up above does not reach it. Detach and close handlers left by an
# earlier run of this cell; otherwise each re-run adds one more FileHandler and
# every message is written to the log file multiple times.
for stale_handler in list(info_logger.handlers):
    info_logger.removeHandler(stale_handler)
    stale_handler.close()

info_handler = logging.FileHandler(info_log_path, mode='a')  # Append mode
info_handler.setLevel(logging.INFO)

# Timestamp - level - message layout for the file handler
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
info_handler.setFormatter(formatter)

# Attaching the single file handler and enabling INFO-level records
info_logger.addHandler(info_handler)
info_logger.setLevel(logging.INFO)


In [3]:

# Cleaned dataset exported by the previous phase
original_cleaned_df = pd.read_excel(io='../Data/Output/original_cleaned_df.xlsx')

# Feature list exported by the previous phase (its 'Feature' column is read later on)
final_selected_features_details = pd.read_excel(io='../Data/Output/final_combined_features.xlsx')

info_logger.info("Reading Extracted data")

## Calculate the predicted Price column on the whole DataFrame and append it to the original DataFrame for use in the ETL schema

In [4]:

# Working copy of the cleaned data, so original_cleaned_df itself is left untouched
ETL_Database_Df = original_cleaned_df.copy()

# Restoring both persisted preprocessing stages up front.
# NOTE(review): joblib.load unpickles arbitrary objects - only load artifacts produced by this project.
ETL_first_stage_pipeline = joblib.load('../Models/first_stage_pipeline.pkl')
ETL_second_stage_pipeline = joblib.load('../Models/second_stage_pipeline.pkl')

# Stage 1 runs on the working copy; stage 2 consumes the stage-1 result
ETL_first_stage_output = ETL_first_stage_pipeline.transform(ETL_Database_Df)
ETL_second_stage_output = ETL_second_stage_pipeline.transform(ETL_first_stage_output)

info_logger.info("Applied previously identified full stage pipeline")

In [5]:
# Per-column tally of missing values in the second-stage output
null_counts = ETL_second_stage_output.isna().sum()

# One report line per column
for column_name, missing in null_counts.items():
    print(f"Column: {column_name}, Number of Null Records: {missing}")

# Grand total, obtained by adding up the per-column tallies
total_null_records = null_counts.sum()
print(f"Total Number of Null Records in DataFrame: {total_null_records}")

# Writing the frame to the output directory for use during the Loading phase
ETL_second_stage_output.to_excel(
    excel_writer="../Data/Output/ETL_second_stage_output.xlsx",
    index=False,
)

print("Data saved to 'ETL_second_stage_output.xlsx'")

Column: Price, Number of Null Records: 0
Column: Property Type, Number of Null Records: 0
Column: Duration, Number of Null Records: 0
Column: PPD Category Type, Number of Null Records: 0
Column: Region code, Number of Null Records: 0
Column: Local authority code, Number of Null Records: 0
Column: AveragePrice, Number of Null Records: 0
Column: Index, Number of Null Records: 0
Column: 1m%Change, Number of Null Records: 0
Column: SalesVolume, Number of Null Records: 0
Column: DetachedPrice, Number of Null Records: 0
Column: SemiDetachedPrice, Number of Null Records: 0
Column: TerracedPrice, Number of Null Records: 0
Column: FlatPrice, Number of Null Records: 0
Column: CashPrice, Number of Null Records: 0
Column: MortgagePrice, Number of Null Records: 0
Column: MortgageIndex, Number of Null Records: 0
Column: FTBPrice, Number of Null Records: 0
Column: FOOPrice, Number of Null Records: 0
Column: NewPrice, Number of Null Records: 0
Column: OldPrice, Number of Null Records: 0
Column: OldSal

Data saved to 'ETL_second_stage_output.xlsx'


In [6]:

# Running the imputation helper imported from data_transformations_3; the result replaces the frame
ETL_second_stage_output = apply_imputations(ETL_second_stage_output)

# prepare_data hands back seven objects; they are unpacked here one name per line
# (presumably the prepared frame, features/target and a train-test split - confirm against the helper)
(
    ETL_second_stage_output,
    ETL_X,
    ETL_y,
    X_train,
    X_test,
    y_train,
    y_test,
) = prepare_data(ETL_second_stage_output)

In [7]:

# Snapshot of the prepared frame; the feature matrix below is taken from this copy
ETL_output = ETL_second_stage_output.copy()

# Names of the model inputs, taken from the 'Feature' column of the selected-features table
ETL_selected_features = final_selected_features_details['Feature'].to_list()

# Feature matrix restricted to those inputs
ETL_X = ETL_output.loc[:, ETL_selected_features]

# Restoring the persisted Random Forest model and scoring every row
rf_model = joblib.load('../Models/RandomForest_house_price_prediction_model.pkl')
predictions = rf_model.predict(ETL_X)

# Storing the predictions next to the prepared data
ETL_second_stage_output['Predicted_Price'] = predictions

print("Predicted Prices:")
print(predictions)

info_logger.info("Predicted price on entire dataset")

Predicted Prices:
[-0.18342536 -0.16069205 -0.17538145 ...  4.50022816  4.50500712
  4.54870722]


#### Because the predictions were made on scaled data, the scaling must be reversed before the predictions are appended to the original DataFrame

In [8]:
# Restoring the second-stage pipeline to get at the fitted price scaler
pipeline = joblib.load('../Models/second_stage_pipeline.pkl')

# The scaler sits under the 'scale_normalize' step as the 'robust_scaler_price' transformer
price_scaler = (
    pipeline
    .named_steps['scale_normalize']
    .named_transformers_['robust_scaler_price']
)

# Turning the predictions into a single-column 2-D array before handing them to the scaler
predictions_scaled_reshaped = predictions.reshape(-1, 1)

# Undoing the scaling and flattening the result back to one dimension
predictions_unscaled = price_scaler.inverse_transform(
    predictions_scaled_reshaped
).flatten()

info_logger.info("Reverse scaled predicted price")

In [9]:
# Working on a copy of the first-stage output so that later column additions
# do not modify ETL_first_stage_output itself
ETL_stage_output = ETL_first_stage_output.copy()

In [10]:
# Attaching the unscaled predictions to the first-stage data.
# NOTE(review): the values are assigned by position - this assumes the rows scored by the
# model are still in the same order as ETL_first_stage_output; confirm that
# apply_imputations / prepare_data neither drop nor reorder rows.
ETL_stage_output['Predicted_Price_Unscaled'] = predictions_unscaled

info_logger.info("Appended predicted price with original extracted, preprocessed data ")

# Preview of actual vs. predicted price. It is kept as the LAST expression of the cell
# because Jupyter only renders the final expression; in the middle of the cell it was
# evaluated and silently discarded.
ETL_stage_output[['Price', 'Predicted_Price_Unscaled']].head()

In [11]:

# Listing columns where null values need to be imputed
null_columns = [
    'AveragePrice_PctChange',
    'DetachedPrice_PctChange',
    'SemiDetachedPrice_PctChange',
    'TerracedPrice_PctChange',
    'FlatPrice_PctChange'
]

# Two imputation passes, from the finest grouping to the coarsest:
#   1. mean within each ('Local authority code', 'Region code') group
#   2. mean within each 'Region code' group, for anything still missing
# The built-in 'mean' transform is vectorised (no Python lambda per group), and
# fillna only touches cells that are missing, so existing values are never
# overwritten - including rows whose group key is itself null.
for group_keys in (['Local authority code', 'Region code'], ['Region code']):
    group_means = ETL_stage_output.groupby(group_keys)[null_columns].transform('mean')
    ETL_stage_output[null_columns] = ETL_stage_output[null_columns].fillna(group_means)


In [12]:
# Missing-value tally for each column of ETL_stage_output
null_counts = ETL_stage_output.isna().sum()

# Emitting one line per column with its tally
for name, n_null in null_counts.items():
    print(f"Column: {name}, Number of Null Records: {n_null}")

# Overall number of missing cells, i.e. the sum of the per-column tallies
total_null_records = null_counts.sum()
print(f"Total Number of Null Records in DataFrame: {total_null_records}")


Column: District, Number of Null Records: 0
Column: Transfer Month-Year, Number of Null Records: 0
Column: Town/City, Number of Null Records: 0
Column: County, Number of Null Records: 0
Column: Price, Number of Null Records: 0
Column: Property Type, Number of Null Records: 0
Column: Old/New, Number of Null Records: 0
Column: Duration, Number of Null Records: 0
Column: PPD Category Type, Number of Null Records: 0
Column: Record Status, Number of Null Records: 0
Column: Region code, Number of Null Records: 0
Column: Region name, Number of Null Records: 0
Column: Local authority code, Number of Null Records: 0
Column: Local authority name, Number of Null Records: 0
Column: Date, Number of Null Records: 0
Column: RegionName, Number of Null Records: 0
Column: AreaCode, Number of Null Records: 0
Column: AveragePrice, Number of Null Records: 0
Column: Index, Number of Null Records: 0
Column: 1m%Change, Number of Null Records: 0
Column: SalesVolume, Number of Null Records: 0
Column: DetachedPr

In [13]:
# Exporting the frame (without its index) to the output directory
ETL_stage_output.to_excel(
    excel_writer="../Data/Output/ETL_stage_output.xlsx",
    index=False,
)

print("Data saved to 'ETL_stage_output.xlsx'")

Data saved to 'ETL_stage_output.xlsx'


## Perform the rest of the transformation steps - Column renaming

In [14]:
# Mapping from source / display feature names to snake_case names.
# Each key appears exactly once (the earlier version repeated 26 keys, which Python
# resolves silently by keeping the last occurrence). The mapping also covers every
# name handled by the ETL_stage_output column rename ('PPD Category Type',
# 'Record Status', 'Fuel consumption by all vehicles', 'Predicted_Price_Unscaled'),
# so a feature name and its DataFrame column always end up with the same new name.
feature_mapping = {
    # Transaction, location and calendar fields
    'District': 'district',
    'Transfer month year': 'transfer_month_year',
    'Transfer Month-Year': 'transfer_month_year',
    'Town city': 'town_city',
    'Town/City': 'town_city',
    'County': 'county',
    'Price': 'price',
    'Property Type': 'property_type',
    'Property type': 'property_type',
    'Old new': 'old_new',
    'Old/New': 'old_new',
    'Duration': 'duration',
    'PPD category type': 'ppd_category_type',
    'PPD Category Type': 'ppd_category_type',
    'Record status': 'record_status',
    'Record Status': 'record_status',
    'Region code': 'region_code',
    'Region name': 'region_name',
    'Local authority code': 'local_authority_code',
    'Local authority name': 'local_authority_name',
    'Date': 'date',
    'Month': 'month',
    'Quarter': 'quarter',
    'Year': 'year',

    # Price, index and sales-volume series
    'AveragePrice': 'average_price',
    'Index': 'index',
    '1m%Change': 'one_m_percent_change',
    'One m percent change': 'one_m_percent_change',
    'Annual change percent': 'annual_change_percent',
    'Annual change (%)': 'annual_change_percent',
    'SalesVolume': 'sales_volume',
    'Sales volume': 'sales_volume',
    'OldSalesVolume': 'old_sales_volume',
    'Old sales volume': 'old_sales_volume',
    'DetachedPrice': 'detached_price',
    'Detached price': 'detached_price',
    'SemiDetachedPrice': 'semi_detached_price',
    'Semi detached price': 'semi_detached_price',
    'TerracedPrice': 'terraced_price',
    'FlatPrice': 'flat_price',
    'Flat price': 'flat_price',
    'CashPrice': 'cash_price',
    'Cash price': 'cash_price',
    'MortgagePrice': 'mortgage_price',
    'Mortgage price': 'mortgage_price',
    'MortgageIndex': 'mortgage_index',
    'Mortgage index': 'mortgage_index',
    'FTBPrice': 'ftb_price',
    'FTB price': 'ftb_price',
    'FOOPrice': 'foo_price',
    'FOO price': 'foo_price',
    'NewPrice': 'new_price',
    'New price': 'new_price',
    'OldPrice': 'old_price',
    'Old price': 'old_price',

    # Rent fields
    'Rental price (£)': 'rental_price',
    'Rental price': 'rental_price',
    'One Bedroom Rent': 'one_bedroom_rent',
    'One bedroom rent': 'one_bedroom_rent',
    'Two Bedrooms Rent': 'two_bedrooms_rent',
    'Two bedrooms rent': 'two_bedrooms_rent',
    'Three Bedrooms Rent': 'three_bedrooms_rent',
    'Four or more Bedrooms Rent': 'four_or_more_bedrooms_rent',
    'All categories Rent': 'all_categories_rent',
    'All categories rent': 'all_categories_rent',

    # Population and area fields
    'All ages': 'all_ages',
    '0-20': 'age_0_20',
    'Age 0 20': 'age_0_20',
    '20-40': 'age_20_40',
    'Age 20 40': 'age_20_40',
    '40-60': 'age_40_60',
    'Age 40 60': 'age_40_60',
    '60+': 'age_60_plus',
    'Age 60 plus': 'age_60_plus',
    'Female population': 'female_population',
    'Male population': 'male_population',
    'Area (sq km)': 'area_sq_km',
    'Area sq km': 'area_sq_km',

    # Qualification fields
    'Qualification index score': 'qualification_index_score',
    'Qualification index rank (1 to 331)': 'qualification_index_rank',
    'Qualification index rank': 'qualification_index_rank',
    'No qualifications': 'no_qualifications',
    'Level 1 and entry level qualifications': 'level_1_and_entry_level_qualifications',
    'Level 2 qualifications': 'level_2_qualifications',
    'Apprenticeship': 'apprenticeship',
    'Level 3 qualifications': 'level_3_qualifications',
    'Level 4 qualifications and above': 'level_4_qualifications_and_above',
    'Other qualifications': 'other_qualifications',

    # Household, deprivation, employment and income fields
    'Est num households with child': 'est_num_households_with_child',
    'Estimated number of households with at least 1 early-years or school age child': 'est_num_households_with_child',
    'Deprivation Average Score': 'deprivation_average_score',
    'Deprivation average score': 'deprivation_average_score',
    'Number of those aged 16+ who are unemployed': 'num_aged_16_plus_unemployed',
    'Num aged 16 plus unemployed': 'num_aged_16_plus_unemployed',
    'Number of those aged 16+ in employment who are employees': 'num_aged_16_plus_employed',
    'Num aged 16 plus employed': 'num_aged_16_plus_employed',
    'Number of those aged 16+ in employment who are self-employed': 'num_aged_16_plus_self_employed',
    'Num aged 16 plus self employed': 'num_aged_16_plus_self_employed',
    'GDHI': 'gdhi',

    # Vehicle and transport fields
    'Buses total': 'buses_total',
    'Diesel cars total': 'diesel_cars_total',
    'Petrol cars total': 'petrol_cars_total',
    'HGV motorways': 'hgv_motorways',
    'HGV - Motorways': 'hgv_motorways',
    'HGV total': 'hgv_total',
    'Diesel LGV total': 'diesel_lgv_total',
    'Petrol LGV total': 'petrol_lgv_total',
    'LPG LGV total': 'lpg_lgv_total',
    'Personal transport': 'personal_transport',
    'Personal transport (buses, cars and motorcycles)': 'personal_transport',
    'Freight transport': 'freight_transport',
    'Freight transport (HGV and LGV)': 'freight_transport',
    'Fuel consumption': 'fuel_consumption',
    'Fuel consumption by all vehicles': 'fuel_consumption',

    # Ratio, percentage-change, log and prediction columns
    'Detached_SemiDetached_Ratio': 'detached_semi_detached_ratio',
    'Detached_Terraced_Ratio': 'detached_terraced_ratio',
    'Detached_Flat_Ratio': 'detached_flat_ratio',
    'AveragePrice_PctChange': 'average_price_pct_change',
    'DetachedPrice_PctChange': 'detached_price_pct_change',
    'SemiDetachedPrice_PctChange': 'semi_detached_price_pct_change',
    'TerracedPrice_PctChange': 'terraced_price_pct_change',
    'FlatPrice_PctChange': 'flat_price_pct_change',
    'SalesVolume_log': 'sales_volume_log',
    'DetachedPrice_log': 'detached_price_log',
    'SemiDetachedPrice_log': 'semi_detached_price_log',
    'TerracedPrice_log': 'terraced_price_log',
    'FlatPrice_log': 'flat_price_log',
    'AveragePrice_log': 'average_price_log',
    'Predicted Price Unscaled': 'predicted_price_unscaled',
    'Predicted_Price_Unscaled': 'predicted_price_unscaled',
}

# Converting all column names to lowercase for uniformity. This is done FIRST so the
# cell can be re-run: after one execution the column is already called 'feature', and
# looking up 'Feature' again would raise a KeyError.
final_selected_features_details = final_selected_features_details.rename(columns=str.lower)

# Applying the feature mapping to the 'feature' column for renaming
# (values that are already mapped are left unchanged)
final_selected_features_details['feature'] = final_selected_features_details['feature'].replace(feature_mapping)

print("Final Selected Features Details:")
# .items() yields (column name, column Series) pairs; each pair is printed as-is
for column_item in final_selected_features_details.items():
    print(f"Column: {column_item}")

# Source column name -> snake_case column name for ETL_stage_output
etl_column_renames = {
    # Location and calendar columns
    'District': 'district',
    'Town/City': 'town_city',
    'County': 'county',
    'Region code': 'region_code',
    'Region name': 'region_name',
    'Local authority code': 'local_authority_code',
    'Local authority name': 'local_authority_name',
    'Date': 'date',
    'Month': 'month',
    'Quarter': 'quarter',
    'Year': 'year',
    'Transfer Month-Year': 'transfer_month_year',

    # Transaction columns
    'Price': 'price',
    'Property Type': 'property_type',
    'Old/New': 'old_new',
    'Duration': 'duration',
    'PPD Category Type': 'ppd_category_type',
    'Record Status': 'record_status',

    # Price, index and sales-volume columns
    'Index': 'index',
    'AveragePrice': 'average_price',
    '1m%Change': 'one_m_percent_change',
    'Annual change (%)': 'annual_change_percent',
    'SalesVolume': 'sales_volume',
    'DetachedPrice': 'detached_price',
    'SemiDetachedPrice': 'semi_detached_price',
    'TerracedPrice': 'terraced_price',
    'FlatPrice': 'flat_price',
    'CashPrice': 'cash_price',
    'MortgagePrice': 'mortgage_price',
    'MortgageIndex': 'mortgage_index',
    'FTBPrice': 'ftb_price',
    'FOOPrice': 'foo_price',
    'NewPrice': 'new_price',
    'OldPrice': 'old_price',
    'OldSalesVolume': 'old_sales_volume',

    # Rent columns
    'Rental price (£)': 'rental_price',
    'One Bedroom Rent': 'one_bedroom_rent',
    'Two Bedrooms Rent': 'two_bedrooms_rent',
    'Three Bedrooms Rent': 'three_bedrooms_rent',
    'Four or more Bedrooms Rent': 'four_or_more_bedrooms_rent',
    'All categories Rent': 'all_categories_rent',

    # Population and area columns
    'All ages': 'all_ages',
    '0-20': 'age_0_20',
    '20-40': 'age_20_40',
    '40-60': 'age_40_60',
    '60+': 'age_60_plus',
    'Female population': 'female_population',
    'Male population': 'male_population',
    'Area (sq km)': 'area_sq_km',

    # Qualification columns
    'Qualification index score': 'qualification_index_score',
    'Qualification index rank (1 to 331)': 'qualification_index_rank',
    'No qualifications': 'no_qualifications',
    'Level 1 and entry level qualifications': 'level_1_and_entry_level_qualifications',
    'Level 2 qualifications': 'level_2_qualifications',
    'Apprenticeship': 'apprenticeship',
    'Level 3 qualifications': 'level_3_qualifications',
    'Level 4 qualifications and above': 'level_4_qualifications_and_above',
    'Other qualifications': 'other_qualifications',

    # Household, deprivation, employment and income columns
    'Estimated number of households with at least 1 early-years or school age child': 'est_num_households_with_child',
    'Deprivation Average Score': 'deprivation_average_score',
    'Number of those aged 16+ who are unemployed': 'num_aged_16_plus_unemployed',
    'Number of those aged 16+ in employment who are employees': 'num_aged_16_plus_employed',
    'Number of those aged 16+ in employment who are self-employed': 'num_aged_16_plus_self_employed',
    'GDHI': 'gdhi',

    # Vehicle and transport columns
    'Buses total': 'buses_total',
    'Diesel cars total': 'diesel_cars_total',
    'Petrol cars total': 'petrol_cars_total',
    'HGV - Motorways': 'hgv_motorways',
    'HGV total': 'hgv_total',
    'Diesel LGV total': 'diesel_lgv_total',
    'Petrol LGV total': 'petrol_lgv_total',
    'LPG LGV total': 'lpg_lgv_total',
    'Personal transport (buses, cars and motorcycles)': 'personal_transport',
    'Freight transport (HGV and LGV)': 'freight_transport',
    'Fuel consumption by all vehicles': 'fuel_consumption',

    # Ratio, percentage-change, log and prediction columns
    'Detached_SemiDetached_Ratio': 'detached_semi_detached_ratio',
    'Detached_Terraced_Ratio': 'detached_terraced_ratio',
    'Detached_Flat_Ratio': 'detached_flat_ratio',
    'AveragePrice_PctChange': 'average_price_pct_change',
    'DetachedPrice_PctChange': 'detached_price_pct_change',
    'SemiDetachedPrice_PctChange': 'semi_detached_price_pct_change',
    'TerracedPrice_PctChange': 'terraced_price_pct_change',
    'FlatPrice_PctChange': 'flat_price_pct_change',
    'SalesVolume_log': 'sales_volume_log',
    'DetachedPrice_log': 'detached_price_log',
    'SemiDetachedPrice_log': 'semi_detached_price_log',
    'TerracedPrice_log': 'terraced_price_log',
    'FlatPrice_log': 'flat_price_log',
    'AveragePrice_log': 'average_price_log',
    'Predicted_Price_Unscaled': 'predicted_price_unscaled',
}

# Applying the renames; columns not listed above keep their current names
ETL_stage_output = ETL_stage_output.rename(columns=etl_column_renames)

# Showing the resulting column index to confirm the renaming
print("ETL_stage_output columns: \n",ETL_stage_output.columns)

info_logger.info("Completed Feature renaming")

Final Selected Features Details:
Column: ('feature', 0     level_4_qualifications_and_above
1                            age_20_40
2                     one_bedroom_rent
3                  all_categories_rent
4                    two_bedrooms_rent
                    ...               
66             detached_terraced_ratio
67                 detached_flat_ratio
68                             quarter
69                       hgv_motorways
70                  personal_transport
Name: feature, Length: 71, dtype: object)
Column: ('importance', 0     0.530738
1     0.521151
2     0.514827
3     0.494157
4     0.493181
        ...   
66    0.001923
67    0.000836
68    0.000648
69    0.000615
70    0.000514
Name: importance, Length: 71, dtype: float64)
ETL_stage_output columns: 
 Index(['district', 'transfer_month_year', 'town_city', 'county', 'price',
       'property_type', 'old_new', 'duration', 'ppd_category_type',
       'record_status', 'region_code', 'region_name', 'local_authority_c

## Perform the rest of the transformation steps - Feature addition

In [15]:
# Extracting the list of selected features from the final_selected_features_details DataFrame
selected_features = final_selected_features_details['feature'].tolist()

# Listing additional columns based on data setup to include in the data_load_df for comprehensive analysis
additional_columns = [
    'district', 'transfer_month_year', 'town_city', 'county', 'price', 
    'region_name', 'local_authority_name', 'date', 'year', 'predicted_price_unscaled'
]

# Combining the selected features with the additional columns. dict.fromkeys keeps the
# first occurrence of every name in order, so a column that appears in both lists is
# not selected twice (duplicate column labels would break the arithmetic below).
all_columns = list(dict.fromkeys(selected_features + additional_columns))

# Taking an explicit copy: the derived columns below are written into data_load_df, and
# writing into a bare selection of ETL_stage_output is chained assignment
# (SettingWithCopyWarning - currently hidden by the global warnings filter).
data_load_df = ETL_stage_output[all_columns].copy()

# Additionally derived features for analysis and additional insight.
# NOTE(review): a zero denominator yields inf/NaN here - confirm whether that needs handling.
data_load_df['deprivation_adjusted_gdhi'] = data_load_df['gdhi'] / data_load_df['deprivation_average_score']
data_load_df['gdhi_per_capita'] = data_load_df['gdhi'] / (
    data_load_df['age_0_20'] + data_load_df['age_20_40'] + data_load_df['age_40_60'] + data_load_df['age_60_plus']
)
data_load_df['deprivation_employment_ratio'] = (
    data_load_df['num_aged_16_plus_employed'] + data_load_df['num_aged_16_plus_self_employed']
) / data_load_df['deprivation_average_score']
data_load_df['qualification_adjusted_employment_rate'] = (
    data_load_df['num_aged_16_plus_employed'] / data_load_df['qualification_index_score']
)
data_load_df['housing_demand_indicator'] = (
    data_load_df['rental_price'] + data_load_df['sales_volume']
) / data_load_df['area_sq_km']
data_load_df['age_dependency_ratio'] = (
    data_load_df['age_0_20'] + data_load_df['age_60_plus']
) / (data_load_df['age_20_40'] + data_load_df['age_40_60'])
data_load_df['deprivation_reduction_potential'] = (
    data_load_df['qualification_index_score'] / data_load_df['deprivation_average_score']
)

# Redefining the desired order of columns for the DataFrame to ensure consistency and structural view.
# Columns of data_load_df that are not listed here are dropped by the selection below.
ordered_columns = [
    'district', 'region_code', 'region_name', 'local_authority_code', 'local_authority_name', 
    'transfer_month_year', 'date', 'month', 'quarter', 'year', 'town_city', 'county', 
    'area_sq_km', 'all_ages', 'male_population', 'female_population', 
    'age_0_20', 'age_20_40', 'age_40_60', 'age_60_plus', 'age_dependency_ratio', 'property_type', 
    'duration', 'price', 'predicted_price_unscaled', 'average_price', 'average_price_log', 
    'average_price_pct_change', 'annual_change_percent', 'detached_price', 'semi_detached_price', 
    'terraced_price', 'flat_price', 'detached_price_log', 'semi_detached_price_log', 
    'terraced_price_log', 'flat_price_log', 'detached_price_pct_change', 
    'semi_detached_price_pct_change', 'terraced_price_pct_change', 'flat_price_pct_change', 
    'detached_semi_detached_ratio', 'detached_terraced_ratio', 'detached_flat_ratio', 
    'sales_volume', 'sales_volume_log', 'old_sales_volume', 'old_price', 'new_price', 
    'ftb_price', 'foo_price', 'cash_price', 'mortgage_price', 'index', 'rental_price', 
    'one_bedroom_rent', 'two_bedrooms_rent', 'three_bedrooms_rent', 'four_or_more_bedrooms_rent', 
    'all_categories_rent', 'qualification_index_score', 'qualification_index_rank', 
    'no_qualifications', 'level_1_and_entry_level_qualifications', 'level_2_qualifications', 
    'apprenticeship', 'level_3_qualifications', 'level_4_qualifications_and_above', 
    'other_qualifications', 'num_aged_16_plus_unemployed', 'num_aged_16_plus_employed', 
    'num_aged_16_plus_self_employed', 'est_num_households_with_child', 'gdhi', 'gdhi_per_capita', 
    'deprivation_average_score', 'deprivation_adjusted_gdhi', 'deprivation_employment_ratio', 
    'qualification_adjusted_employment_rate', 'deprivation_reduction_potential', 
    'housing_demand_indicator', 'buses_total', 'petrol_cars_total', 'petrol_lgv_total', 
    'hgv_total', 'lpg_lgv_total', 'hgv_motorways', 'personal_transport'
]
# Reordering the columns of the DataFrame according to the specified order
data_load_df = data_load_df[ordered_columns]

# Displaying the first few rows to verify that the column order is correct
print(data_load_df.head())

info_logger.info("Completed Feature addition")

     district region_code region_name local_authority_code  \
0  HARTLEPOOL   E12000001  North East            E06000001   
1  HARTLEPOOL   E12000001  North East            E06000001   
2  HARTLEPOOL   E12000001  North East            E06000001   
3  HARTLEPOOL   E12000001  North East            E06000001   
4  HARTLEPOOL   E12000001  North East            E06000001   

  local_authority_name transfer_month_year       date  month  quarter  year  \
0           HARTLEPOOL            Apr-2023 2023-04-01      4        2  2023   
1           HARTLEPOOL            Aug-2023 2023-08-01      8        3  2023   
2           HARTLEPOOL            Dec-2023 2023-12-01     12        4  2023   
3           HARTLEPOOL            Feb-2023 2023-02-01      2        1  2023   
4           HARTLEPOOL            Jan-2023 2023-01-01      1        1  2023   

   ... qualification_adjusted_employment_rate deprivation_reduction_potential  \
0  ...                           16183.181818                        0.

In [16]:
# Number of missing values in each column of data_load_df
null_counts = data_load_df.isna().sum()

# Listing the tally column by column
for field, null_total in null_counts.items():
    print(f"Column: {field}, Number of Null Records: {null_total}")

# The frame-wide total is the sum of the per-column tallies
total_null_records = null_counts.sum()
print(f"Total Number of Null Records in DataFrame: {total_null_records}")


Column: district, Number of Null Records: 0
Column: region_code, Number of Null Records: 0
Column: region_name, Number of Null Records: 0
Column: local_authority_code, Number of Null Records: 0
Column: local_authority_name, Number of Null Records: 0
Column: transfer_month_year, Number of Null Records: 0
Column: date, Number of Null Records: 0
Column: month, Number of Null Records: 0
Column: quarter, Number of Null Records: 0
Column: year, Number of Null Records: 0
Column: town_city, Number of Null Records: 0
Column: county, Number of Null Records: 0
Column: area_sq_km, Number of Null Records: 0
Column: all_ages, Number of Null Records: 0
Column: male_population, Number of Null Records: 0
Column: female_population, Number of Null Records: 0
Column: age_0_20, Number of Null Records: 0
Column: age_20_40, Number of Null Records: 0
Column: age_40_60, Number of Null Records: 0
Column: age_60_plus, Number of Null Records: 0
Column: age_dependency_ratio, Number of Null Records: 0
Column: prope

In [17]:
# Exporting the load-ready frame (index omitted) for use during the Loading phase
data_load_df.to_excel(
    excel_writer="../Data/Output/Processed_data_load_df.xlsx",
    index=False,
)

print("Data saved to 'Processed_data_load_df.xlsx'")

Data saved to 'Processed_data_load_df.xlsx'


In [18]:
# Exporting the selected-features table (index omitted) for use during the Loading phase
final_selected_features_details.to_excel(
    excel_writer="../Data/Output/processed_selected_features.xlsx",
    index=False,
)

# Confirming on screen that the export finished
print("Data saved to 'processed_selected_features.xlsx'")

info_logger.info("Completed preprocessing")

Data saved to 'processed_selected_features.xlsx'
