## Data Preprocessing

In [45]:
### Load Libraries
import pandas as pd
import numpy as np

In [46]:
# Read the merged dataset from disk into a DataFrame
file_path = "merged_dataset.csv"
merged_df = pd.read_csv(filepath_or_buffer=file_path)

In [47]:
# Display data info
print("Initial Dataset Overview:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
merged_df.info()
print("\nMissing Values per Column:")
print(merged_df.isnull().sum())

Initial Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 326 entries, 0 to 325
Data columns (total 26 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Organisation Name                              326 non-null    object 
 1   Effective Date                                 326 non-null    object 
 2   Property ID                                    326 non-null    int64  
 3   Property Type                                  326 non-null    object 
 4   Property Name                                  326 non-null    object 
 5   Property Name/Address (Where no UPRN)          317 non-null    object 
 6   Street Number                                  93 non-null     float64
 7   Street                                         220 non-null    object 
 8   Town / Post Town                               220 non-null    object 
 9   Postcode                    

#### 1. Handling Missing Values

In [48]:
# Impute gaps in the numeric columns with each column's own median
num_cols = ['Building Size - GIA (M2)', 'New Price']
for col in num_cols:
    # A column with no observed values has no median to impute with
    if not merged_df[col].notna().any():
        print(f"Skipping {col} as it has no non-null values to calculate the median.")
        continue

    median_value = merged_df[col].median()
    # Reassign the filled column rather than mutating in place
    merged_df[col] = merged_df[col].fillna(median_value)
    print(f"Filled missing values in {col} with median: {median_value}")

Filled missing values in Building Size - GIA (M2) with median: 95.0
Skipping New Price as it has no non-null values to calculate the median.


In [49]:
# Replace missing categorical values with the placeholder label 'Unknown'
cat_cols = ['Tenure Type', 'Holding Type', 'Property Type']
# Fill all listed columns in one step (reassignment, no inplace mutation)
merged_df[cat_cols] = merged_df[cat_cols].fillna('Unknown')
for col in cat_cols:
    print(f"Filled missing values in {col} with 'Unknown'")

Filled missing values in Tenure Type with 'Unknown'
Filled missing values in Holding Type with 'Unknown'
Filled missing values in Property Type with 'Unknown'


#### 2. Data Cleaning & Formatting

In [50]:
# Convert 'Effective Date' to datetime format.
# Values that cannot be parsed become NaT (errors='coerce'). The reported count
# is the number of NaT values introduced by this conversion only, so entries
# that were already missing (or a re-run of this cell) are not miscounted as
# "invalid".
# NOTE(review): the recorded run coerced 25 values; if the source stores
# day-first dates, pd.to_datetime(..., dayfirst=True) may recover them —
# confirm against the raw data.
missing_before = merged_df['Effective Date'].isna().sum()
merged_df['Effective Date'] = pd.to_datetime(merged_df['Effective Date'], errors='coerce')
coerced_count = merged_df['Effective Date'].isna().sum() - missing_before
print(f"Converted 'Effective Date' to datetime format. {coerced_count} invalid entries were coerced to NaT.")

Converted 'Effective Date' to datetime format. 25 invalid entries were coerced to NaT.


In [51]:
# Normalise the categorical text columns: title-case, then trim outer whitespace
cat_cols = ['Tenure Type', 'Holding Type', 'Property Type']
for col in cat_cols:
    formatted = merged_df[col].str.title().str.strip()
    merged_df[col] = formatted
    unique_count = formatted.nunique()
    print(f"Standardized text formatting for '{col}'. Unique values after formatting: {unique_count}")

Standardized text formatting for 'Tenure Type'. Unique values after formatting: 7
Standardized text formatting for 'Holding Type'. Unique values after formatting: 6
Standardized text formatting for 'Property Type'. Unique values after formatting: 29


In [52]:
# Remove duplicate entries based on 'Property ID' (the first occurrence is kept,
# which is drop_duplicates' default).
initial_rows = merged_df.shape[0]
# Reassign instead of using inplace=True, consistent with the fill steps earlier
# in this notebook; the result is the same de-duplicated frame.
merged_df = merged_df.drop_duplicates(subset=['Property ID'])
removed_rows = initial_rows - merged_df.shape[0]
print(f"Removed {removed_rows} duplicate entries based on 'Property ID'. New total row count: {merged_df.shape[0]}.")

Removed 0 duplicate entries based on 'Property ID'. New total row count: 326.


#### 3. Feature Engineering

In [53]:
# Derive the calendar year of 'Effective Date' as its own column
effective_year = merged_df['Effective Date'].dt.year
merged_df['Effective Year'] = effective_year
year_count = effective_year.nunique()
print(f"Extracted 'Effective Year' from 'Effective Date'. Number of unique years: {year_count}")

Extracted 'Effective Year' from 'Effective Date'. Number of unique years: 2


In [54]:
# Calculate 'Price Per Sq Meter' (Handle Zero-Division)
# Dividing by a zero building size yields +/-inf (or NaN for 0/0); those are
# normalised to NaN first so every undefined ratio is handled uniformly.
price_per_sqm = merged_df['Base Price'] / merged_df['Building Size - GIA (M2)']
price_per_sqm = price_per_sqm.replace([np.inf, -np.inf], np.nan)

# Count the undefined ratios BEFORE filling them; counting after fillna(0)
# would always report 0.
replaced_count = price_per_sqm.isna().sum()
merged_df['Price Per Sq Meter'] = price_per_sqm.fillna(0)

# Output the number of zero or missing values replaced
print(f"Calculated 'Price Per Sq Meter'. Number of zero or missing values replaced: {replaced_count}")


Calculated 'Price Per Sq Meter'. Number of zero or missing values replaced: 0


In [55]:
# Inspect the current column labels; by this point the frame also carries the
# derived 'Effective Year' and 'Price Per Sq Meter' columns.
print(merged_df.columns)

Index(['Organisation Name', 'Effective Date', 'Property ID', 'Property Type',
       'Property Name', 'Property Name/Address (Where no UPRN)',
       'Street Number', 'Street', 'Town / Post Town', 'Postcode', 'Ward',
       'Geo X (Easting)', 'Geo Y (Northing)', 'Tenure Type', 'Vacant',
       'Holding Type', 'Building Size - GIA (M2)', 'Site Area (Hectares)',
       'Occupied by Council / Direct Service Property',
       'Purpose / Asset Category', 'EPC Rating', 'Council Tax', 'Key Features',
       'Property Rent/Sale History', 'Base Price', 'New Price',
       'Effective Year', 'Price Per Sq Meter'],
      dtype='object')


In [56]:
# Create 'Property History Count' based on past records
def _history_entry_count(value):
    """Return the number of comma-separated entries in ``value``; 0 when it is missing."""
    if pd.isna(value):
        return 0
    return len(str(value).split(','))


merged_df['Property History Count'] = merged_df['Property Rent/Sale History'].apply(_history_entry_count)
no_history_count = merged_df['Property History Count'].eq(0).sum()
print(f"Created 'Property History Count' feature. Number of properties with no history: {no_history_count}")

Created 'Property History Count' feature. Number of properties with no history: 0


In [57]:
# Bucket EPC ratings into 'High', 'Medium', 'Low'; anything unmapped becomes 'Unknown'
epc_bands = {'High': 'AB', 'Medium': 'CD', 'Low': 'EFG'}
# Expand the band -> grades table into the per-grade lookup used by .map()
epc_mapping = {grade: band for band, grades in epc_bands.items() for grade in grades}
epc_category = merged_df['EPC Rating'].map(epc_mapping)
merged_df['EPC Category'] = epc_category.fillna('Unknown')
unknown_count = merged_df['EPC Category'].eq('Unknown').sum()
print(f"Categorized EPC Ratings into 'High', 'Medium', 'Low'. Number of 'Unknown' EPC Ratings: {unknown_count}")

Categorized EPC Ratings into 'High', 'Medium', 'Low'. Number of 'Unknown' EPC Ratings: 187


#### 4. Save Cleaned Dataset

In [58]:
# Persist the cleaned frame to CSV (without the index column)
cleaned_file_path = "cleaned_dataset.csv"
merged_df.to_csv(cleaned_file_path, index=False)
print(f"Cleaned dataset saved successfully at {cleaned_file_path}")

# Display final dataset info
print("\nFinal Dataset Overview:")
# DataFrame.info() prints its own report and returns None, so call it directly
# instead of wrapping it in print(), which would add a stray "None" line.
merged_df.info()

Cleaned dataset saved successfully at cleaned_dataset.csv

Final Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 326 entries, 0 to 325
Data columns (total 30 columns):
 #   Column                                         Non-Null Count  Dtype         
---  ------                                         --------------  -----         
 0   Organisation Name                              326 non-null    object        
 1   Effective Date                                 301 non-null    datetime64[ns]
 2   Property ID                                    326 non-null    int64         
 3   Property Type                                  326 non-null    object        
 4   Property Name                                  326 non-null    object        
 5   Property Name/Address (Where no UPRN)          317 non-null    object        
 6   Street Number                                  93 non-null     float64       
 7   Street                                         220 non-null 