In [None]:
import os
import pandas as pd
import boto3
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer

In [None]:
# Step 1: Load CSV from S3, keep the trailing 30% of rows, and cache it locally

# S3 URI of the original CSV dataset
s3_csv_uri = 's3://bucketjdrygalska/price_paid_records.csv'

# Read the full CSV straight from S3.
# NOTE(review): pandas needs an S3-capable filesystem backend (e.g. s3fs) and
# valid AWS credentials in the environment for an s3:// URI — confirm setup.
print("Loading CSV from S3...")
df = pd.read_csv(s3_csv_uri)
print("CSV loaded successfully!")

# Drop the first 70% of rows (by position) so only the last 30% remain
percent_to_remove = 70
rows_to_remove = int(len(df) * (percent_to_remove / 100))
df_subset = df.iloc[rows_to_remove:]
print(f"Removed the first {percent_to_remove}% of rows. Remaining rows: {len(df_subset)}")

# Local directory (relative to the notebook's working directory) for cached files
local_path = './datasets/'

# exist_ok=True makes this idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(local_path, exist_ok=True)

# Save the subset DataFrame as CSV in the local directory; the index is not
# written, so reloading yields a fresh default index.
local_csv_path = os.path.join(local_path, 'price_paid_records_subset.csv')
print("Saving subset dataset as CSV locally...")
df_subset.to_csv(local_csv_path, index=False)
print("Subset dataset saved as CSV locally successfully!")


In [10]:
# Step 2: Reload the locally saved subset and shrink its memory footprint

# Read the subset back from the CSV written to the local directory
print("Loading subset dataset from CSV...")
df = pd.read_csv(local_csv_path)
print("Subset dataset loaded from CSV successfully!")

# Re-type columns in bulk before any further processing.
# Object (text) columns become pandas categoricals to save memory.
object_columns = df.select_dtypes(include='object').columns
df = df.astype({name: 'category' for name in object_columns})

# float64 columns are narrowed to float32 (half the storage per value).
float64_columns = df.select_dtypes(include='float64').columns
df = df.astype({name: 'float32' for name in float64_columns})

# From here on `df` is the dtype-optimized subset used by later steps
print("Optimized subset dataset is ready for further processing.")

Subset dataset loaded from CSV successfully!
Optimized subset dataset is ready for further processing.


In [11]:
# Step 3: Clean the Data

# Rename columns to remove spaces and make all lowercase
df.columns = df.columns.str.replace(' ', '_').str.lower()
print("Renamed columns to remove spaces and convert to lowercase.")

# Check for missing values (NaNs) per column
print("Checking for missing values...")
missing_values = df.isna().sum()
print(missing_values)

# Split into numeric and non-numeric parts for imputation.
# 'number' matches every numeric dtype; listing only float64/int64 would skip
# the float32 columns produced by the Step 2 downcast and leave them unimputed.
df_numeric = df.select_dtypes(include='number')
df_non_numeric = df.select_dtypes(exclude='number')

imputer_mean = SimpleImputer(strategy='mean')
if df_numeric.empty:
    # SimpleImputer cannot be fitted on zero columns or zero rows; nothing to impute.
    df_numeric_imputed = df_numeric.copy()
    print("No numeric data to impute; skipping mean imputation.")
else:
    # Impute missing values for numeric columns using the column mean.
    # The original index is passed through so the concat below aligns rows
    # correctly even when `df` does not have a default RangeIndex.
    # NOTE(review): the imputer returns a float array, so integer columns
    # come back as floats — confirm this is acceptable downstream.
    df_numeric_imputed = pd.DataFrame(
        imputer_mean.fit_transform(df_numeric),
        columns=df_numeric.columns,
        index=df_numeric.index,
    )
    print("Imputed missing values in numeric columns using the mean strategy.")

# Combine imputed numeric data with non-numeric data (numeric columns first)
df = pd.concat([df_numeric_imputed, df_non_numeric], axis=1)

# Convert any remaining object columns to categoricals. Step 2 already converts
# object columns, so this loop normally finds nothing to do.
print("Handling categorical columns...")
for column in df.select_dtypes(include=['object']).columns:
    df[column] = df[column].astype('category')
    print(f"Converted column {column} to categorical type.")

# Label selected categoricals as ordered
ordered_columns = ['duration']  # Example: Assuming 'duration' is an ordered categorical column
for column in ordered_columns:
    if column in df.columns:
        # The .cat accessor only exists on categorical data; convert first if needed.
        if not isinstance(df[column].dtype, pd.CategoricalDtype):
            df[column] = df[column].astype('category')
        df[column] = df[column].cat.as_ordered()
        print(f"Labeled column {column} as ordered categorical.")


Renamed columns to remove spaces and convert to lowercase.
Checking for missing values...
transaction_unique_identifier        0
price                                0
date_of_transfer                     0
property_type                        0
old/new                              0
duration                             0
town/city                            0
district                             0
county                               0
ppdcategory_type                     0
record_status_-_monthly_file_only    0
dtype: int64
Imputed missing values in numeric columns using the mean strategy.
Handling categorical columns...
Labeled column duration as ordered categorical.
