In [1]:
import sys # Importing the sys module to interact with the Python runtime environment
import numpy as np # Importing NumPy for numerical operations
import pandas as pd # Importing Pandas for its powerful data analysis structures
import joblib # Importing Joblib for saving and loading models efficiently
import psycopg2 # Importing PostgreSQL database adapter for Python
from contextlib import contextmanager  # Importing contextmanager for creating context managers
from sklearn.ensemble import RandomForestRegressor # Importing RandomForestRegressor for regression tasks
from sklearn.impute import SimpleImputer  # Importing SimpleImputer to handle missing values
from sklearn.model_selection import train_test_split # Importing train_test_split to split data into train and test sets
from sklearn.pipeline import Pipeline, FeatureUnion  # Importing Pipeline and FeatureUnion for creating machine learning workflows
from sklearn.compose import ColumnTransformer  # Importing ColumnTransformer for applying transformers to specific columns
from sklearn.preprocessing import (  
    RobustScaler,  # Importing RobustScaler for scaling features robust to outliers
    StandardScaler,  # Importing StandardScaler for standardizing features
    OrdinalEncoder,  # Importing OrdinalEncoder for encoding categorical features
    FunctionTransformer  # Importing FunctionTransformer for creating custom data transformations
)
# Suppress all warnings
import warnings
warnings.filterwarnings("ignore")

# Path adjustments for modules
sys.path.append('../Scripts')

# Import custom transformation modules
from data_transformations_1 import main_processing_pipeline
from data_transformations_2 import GroupMeanImputer, FeatureCreator, EncodingWithNames, ColumnTransformerDf, ColumnOrderTransformer
from data_transformations_3 import apply_imputations, prepare_data



### Preparation of default value list

In [2]:

def read_and_prepare_data(data_file_path, feature_order=None):
    """Build a per-feature table of default values from the cleaned dataset.

    Numeric columns default to their mean; every other column defaults to its
    most frequent value (the first row returned by ``DataFrame.mode``).

    Parameters
    ----------
    data_file_path : str or path-like
        Excel workbook to load with ``pd.read_excel``.
    feature_order : sequence of str, optional
        Feature names in the order they should appear in the result. When
        omitted, the built-in 77-feature ordering defined below is used.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``feature`` and ``default_value``, with one row per entry
        of the feature order. Features that are absent from the workbook get a
        missing ``default_value``; workbook columns that are not part of the
        feature order are dropped.
    """
    # Load the main dataset
    data_df = pd.read_excel(data_file_path)

    # Numeric features default to their column mean
    defaults = data_df.select_dtypes(include=[np.number]).mean().to_dict()

    # Non-numeric features default to their most frequent value. mode() can
    # come back empty (no non-numeric columns, or none of them holds a value),
    # in which case taking its first row would raise IndexError.
    modes = data_df.select_dtypes(exclude=[np.number]).mode()
    if not modes.empty:
        for feature, value in modes.iloc[0].items():
            # A numeric default wins if the same feature name shows up twice
            defaults.setdefault(feature, value)

    # Converting the merged defaults to a two-column DataFrame
    combined_defaults = pd.DataFrame(list(defaults.items()), columns=['feature', 'default_value'])

    if feature_order is None:
        # Default column order
        feature_order = [
            'District', 'Transfer Month-Year', 'Town/City', 'County', 'Price',
            'Property Type', 'Old/New', 'Duration', 'PPD Category Type', 'Record Status',
            'Region code', 'Region name', 'Local authority code', 'Local authority name',
            'Date', 'RegionName', 'AreaCode', 'AveragePrice', 'Index', '1m%Change',
            'SalesVolume', 'DetachedPrice', 'SemiDetachedPrice', 'TerracedPrice',
            'FlatPrice', 'CashPrice', 'MortgagePrice', 'MortgageIndex', 'FTBPrice',
            'FOOPrice', 'NewPrice', 'NewSalesVolume', 'OldPrice', 'OldSalesVolume',
            'Annual change (%)', 'Rental price (£)', 'One Bedroom Rent', 'Two Bedrooms Rent',
            'Three Bedrooms Rent', 'Four or more Bedrooms Rent', 'All categories Rent',
            'All ages', '0-20', '20-40', '40-60', '60+', 'Female population',
            'Male population', 'Area (sq km)', 'Qualification index score',
            'Qualification index rank (1 to 331)', 'No qualifications',
            'Level 1 and entry level qualifications', 'Level 2 qualifications', 'Apprenticeship',
            'Level 3 qualifications', 'Level 4 qualifications and above', 'Other qualifications',
            'Estimated number of households with at least 1 early-years or school age child',
            'Deprivation Average Score', 'Number of those aged 16+ who are unemployed',
            'Number of those aged 16+ in employment who are employees',
            'Number of those aged 16+ in employment who are self-employed', 'GDHI',
            'Number of Schools', 'Headcount of Pupils(school)', 'Buses total',
            'Diesel cars total', 'Petrol cars total', 'HGV - Motorways', 'HGV total',
            'Diesel LGV total', 'Petrol LGV total', 'LPG LGV total',
            'Personal transport (buses, cars and motorcycles)', 'Freight transport (HGV and LGV)',
            'Fuel consumption by all vehicles'
        ]

    # Reindexing both orders the rows and inserts a row (with a missing value)
    # for every requested feature that the workbook does not contain
    return combined_defaults.set_index('feature').reindex(list(feature_order)).reset_index()

# Location of the cleaned source dataset
data_file_path = '../Data/Output/original_cleaned_df.xlsx'

# Derive one default value per feature from the cleaned dataset
default_values_df = read_and_prepare_data(data_file_path)

# Persist the defaults to disk as an Excel workbook (row index omitted)
default_values_df.to_excel(excel_writer='../Data/Output/combined_default_values.xlsx', index=False)

# Show the resulting feature/default table
print(default_values_df)


                                             feature    default_value
0                                           District             ADUR
1                                Transfer Month-Year         Jan-2024
2                                          Town/City           LONDON
3                                             County   GREATER LONDON
4                                              Price  26457690.957171
..                                               ...              ...
72                                  Petrol LGV total         0.594976
73                                     LPG LGV total         0.000306
74  Personal transport (buses, cars and motorcycles)         70.43712
75                   Freight transport (HGV and LGV)        41.595804
76                  Fuel consumption by all vehicles       112.032924

[77 rows x 2 columns]


### Execution of Price prediction suite

In [3]:

def predict_with_dynamic_features(input_features):
    """Predict a price from the stored feature defaults plus caller overrides.

    Parameters
    ----------
    input_features : dict
        Maps feature names to the values that should replace the stored
        defaults. Keys that do not appear in the defaults workbook are
        silently ignored.

    Returns
    -------
    scalar
        The first model prediction after it has been passed through the price
        scaler's ``inverse_transform``.

    Notes
    -----
    Every call re-reads its inputs from disk: three Excel workbooks under
    ``../Data/Output`` and three pickled artefacts under ``../Models``.
    ``joblib.load`` unpickles its input, which can execute arbitrary code, so
    only artefacts from a trusted source should be placed in ``../Models``.
    """
    # Loading default values as a {feature: default_value} mapping
    data_defaults = pd.read_excel('../Data/Output/combined_default_values.xlsx')
    default_data = data_defaults.set_index('feature')['default_value'].to_dict()

    # Updating defaults with dynamic values from input_features; names that
    # are not known features are skipped rather than added as new columns
    for key, value in input_features.items():
        if key in default_data:
            default_data[key] = value

    # Creating a single-row DataFrame from the updated defaults
    updated_input_df = pd.DataFrame([default_data])

    # Preprocessing steps
    ETL_first_stage_pipeline = joblib.load('../Models/first_stage_pipeline.pkl')
    ETL_first_stage_output = ETL_first_stage_pipeline.transform(updated_input_df)
    ETL_second_stage_pipeline = joblib.load('../Models/second_stage_pipeline.pkl')
    ETL_second_stage_output = ETL_second_stage_pipeline.transform(ETL_first_stage_output)

    # Filling NaN values: each percentage-change field that is still missing
    # takes the mean of the same field in the saved second-stage output
    null_data_fix = pd.read_excel('../Data/Output/ETL_second_stage_output.xlsx')
    price_change_fields = ['AveragePrice_PctChange', 'DetachedPrice_PctChange', 'SemiDetachedPrice_PctChange', 'TerracedPrice_PctChange', 'FlatPrice_PctChange']
    fill_values = null_data_fix[price_change_fields].mean().to_dict()
    # Rebind instead of filling in place so the object returned by the
    # pipeline is not mutated
    ETL_second_stage_output = ETL_second_stage_output.fillna(value=fill_values)

    # Preparing data for prediction: keep only the listed features, in the
    # order given by the workbook
    final_combined_features_df = pd.read_excel('../Data/Output/final_combined_features.xlsx')
    ETL_selected_features = final_combined_features_df['Feature'].tolist()
    ETL_X = ETL_second_stage_output[ETL_selected_features]

    # Loading model and making predictions
    rf_model = joblib.load('../Models/RandomForest_house_price_prediction_model.pkl')
    predictions = rf_model.predict(ETL_X)

    # Undo the price scaling. The scaler lives inside the second-stage
    # pipeline that is already loaded above, so it is reused here instead of
    # unpickling the same file a second time.
    price_scaler = ETL_second_stage_pipeline.named_steps['scale_normalize'].named_transformers_['robust_scaler_price']
    predictions_unscaled = price_scaler.inverse_transform(predictions.reshape(-1, 1)).flatten()

    return predictions_unscaled[0]
# Prompt shown for each feature the user may override, keyed by feature name.
# Prompts are asked in the order listed here.
FEATURE_PROMPTS = {
    'Level 4 qualifications and above': "Enter 'Level 4 qualifications and above': ",
    '20-40': "Enter '20-40' population: ",
    'Number of those aged 16+ who are unemployed': "Enter 'Number of those aged 16+ who are unemployed': ",
    'HGV - Motorways': "Enter 'HGV - Motorways': ",
}


def _prompt_for_int(prompt):
    """Ask for a value with ``input`` until the reply parses as an integer."""
    while True:
        reply = input(prompt)
        try:
            return int(reply)
        except ValueError:
            # A typo should not abort the whole session; ask again instead
            print(f"'{reply.strip()}' is not a whole number, please try again.")


# NOTE: this cell blocks on input() and therefore needs an interactive kernel.
while True:
    # Interactive input
    input_features = {
        feature: _prompt_for_int(prompt)
        for feature, prompt in FEATURE_PROMPTS.items()
    }

    result = predict_with_dynamic_features(input_features)

    # Displaying the result (ANSI escape codes switch bold on and off)
    print(f"\033[1mPredicted Price: £{result:.2f}\033[0m")

    continue_predicting = input("\nDo you want to make another prediction? (y/n): ")
    # Anything other than y/Y (surrounding whitespace ignored) ends the loop
    if continue_predicting.strip().lower() != 'y':
        break

Enter 'Level 4 qualifications and above':  100000
Enter '20-40' population:  120000
Enter 'Number of those aged 16+ who are unemployed':  10000
Enter 'HGV - Motorways':  200000


[1mPredicted Price: £46725795.49[0m



Do you want to make another prediction? (y/n):  y
Enter 'Level 4 qualifications and above':  110000
Enter '20-40' population:  110000
Enter 'Number of those aged 16+ who are unemployed':  5000
Enter 'HGV - Motorways':  5000


[1mPredicted Price: £45351683.09[0m



Do you want to make another prediction? (y/n):  y
Enter 'Level 4 qualifications and above':  1000
Enter '20-40' population:  1000
Enter 'Number of those aged 16+ who are unemployed':  100000
Enter 'HGV - Motorways':  100000


[1mPredicted Price: £15954600.08[0m



Do you want to make another prediction? (y/n):  n
