In [None]:
import polars as pl

In [None]:
# Read the two parquet inputs: `df` from pp_data_man, `df2` from pc_man.
df, df2 = (
    pl.read_parquet(path)
    for path in ("data/pp_data_man.parquet", "data/pc_man.parquet")
)

In [None]:
# Preview the first five rows of df (read from data/pp_data_man.parquet).
df.head(5)

In [None]:
# Preview the first five rows of df2 (read from data/pc_man.parquet).
df2.head(5)

In [None]:
# Column names for df are assigned purely by position, so this list has to
# match the number and order of the columns in the loaded frame.
pp_column_names = [
    "id",
    "price",
    "date",
    "postcode",
    "property_type",
    "new",
    "duration",
    "paon",
    "saon",
    "street",
    "locality",
    "town_city",
    "district",
    "county",
]
df.columns = pp_column_names

# df2 is renamed by name so that it shares the "postcode" column name with df.
postcode_lookup_renames = {"pcds": "postcode", "lat": "latitude", "long": "longitude"}
df2 = df2.rename(postcode_lookup_renames)

In [None]:
# Look at the first ten values of the saon column.
df.select("saon").head(10)


In [None]:
# Per-column null counts for df.
df.null_count()

In [None]:

# Per-column null counts for df2.
df2.null_count()

In [None]:
# Column names and dtypes of df.
df.schema

In [None]:
# Column names and dtypes of df2.
df2.schema

In [None]:
def _normalized_postcode() -> pl.Expr:
    """Build the expression that canonicalises the ``postcode`` column.

    The value is cast to a string, trimmed, upper-cased, and every run of
    whitespace is collapsed to a single space. The result keeps the name
    ``postcode`` so it overwrites the existing column.
    """
    return (
        pl.col("postcode")
        .cast(pl.Utf8)
        .str.strip_chars()
        .str.to_uppercase()
        .str.replace_all(r"\s+", " ")
        .alias("postcode")
    )


# Both frames get the identical treatment so their postcode values are
# comparable when they are joined on that column.
df = df.with_columns(_normalized_postcode())
df2 = df2.with_columns(_normalized_postcode())

In [None]:
# Attach latitude/longitude from df2 to every row of df by postcode.
# A left join keeps rows of df whose postcode has no match (coordinates stay null).
# NOTE(review): assumes each postcode appears once in df2; duplicates there
# would duplicate the matching rows of df. Confirm against the lookup data.
postcode_coords = df2.select("postcode", "latitude", "longitude")
df = df.join(postcode_coords, on="postcode", how="left")

In [None]:
# Every string-typed column: turn empty strings into nulls so that missing
# values are represented one way only.
string_cols = [name for name, dtype in df.schema.items() if dtype == pl.Utf8]
blank_to_null_exprs = [
    pl.when(pl.col(name) == "").then(None).otherwise(pl.col(name)).alias(name)
    for name in string_cols
]
df = df.with_columns(blank_to_null_exprs)

In [None]:
# Re-check per-column null counts now that empty strings have been converted to nulls.
df.null_count()


In [None]:
# Fold saon into paon as "<paon>, <saon>", then drop the saon column.
paon_expr = pl.col("paon")
# An empty saon is treated the same as a missing one (becomes null here).
saon_expr = pl.when(pl.col("saon") != "").then(pl.col("saon"))

df = df.with_columns(
    pl.when(paon_expr.is_not_null() & saon_expr.is_not_null())
      .then(pl.concat_str([paon_expr, saon_expr], separator=", "))
      # At most one side is present: keep whichever exists. Previously a row
      # with a saon but no paon went through concat_str, which yields null when
      # any input is null, so the saon value was lost once the column was dropped.
      .otherwise(pl.coalesce(paon_expr, saon_expr))
      .alias("paon")
).drop("saon")



In [None]:
# Rows of df that have no street value.
null_street = df.filter(pl.col("street").is_null())
# The original name said "paon" although the filter is on "street"; the old
# name is kept as an alias in case another cell still refers to it.
null_paon = null_street
# Bare last expression: the notebook renders the frame with its rich display.
null_street


In [None]:
# Discard rows without a postcode, then remove the locality column.
df = df.drop_nulls(subset=["postcode"]).drop("locality")


In [None]:
# Re-apply the postcode clean-up to df: trim, upper-case, and collapse runs of
# whitespace to a single space.
postcode_trimmed_upper = pl.col("postcode").str.strip_chars().str.to_uppercase()
df = df.with_columns(
    postcode_trimmed_upper.str.replace_all(r"\s+", " ").alias("postcode")
)



In [None]:
# Preview the first five rows of df after the clean-up steps so far.
df.head(5)

In [None]:
# Keep only the first 10 characters of the date string (the part matched by
# the "%Y-%m-%d" format) and parse them as a Date. strict=True makes any value
# that does not match the format raise instead of becoming null.
date_prefix = pl.col("date").str.slice(0, 10)
df = df.with_columns(
    date_prefix.str.strptime(pl.Date, format="%Y-%m-%d", strict=True).alias("date")
)



In [None]:
# Remove commas from the price strings (presumably thousands separators),
# then convert the column to a 64-bit float.
price_without_commas = pl.col("price").str.replace_all(",", "")
df = df.with_columns(price_without_commas.cast(pl.Float64).alias("price"))



In [None]:
# Derive calendar features (year, month, quarter) from the parsed sale date.
sale_date = pl.col("date")
df = df.with_columns(
    sale_year=sale_date.dt.year(),
    sale_month=sale_date.dt.month(),
    sale_quarter=sale_date.dt.quarter(),
)

In [None]:
# Re-apply the postcode clean-up before grouping by postcode. The whitespace
# collapse is included so this cell performs the same steps as the other
# postcode normalization cells in this notebook (it previously omitted it).
df = df.with_columns(
    pl.col("postcode")
    .str.strip_chars()
    .str.to_uppercase()
    .str.replace_all(r"\s+", " ")
    .alias("postcode")
)

In [None]:
# Rows that do have a street value; these feed the per-postcode street mode.
df_non_null_street = df.drop_nulls(subset=["street"])

In [None]:
# Most frequent street per postcode, kept as a list column because a tie
# produces several modes. The modes are sorted so that taking the first entry
# of the list afterwards is reproducible; without the sort the order of tied
# modes is not guaranteed, so the imputed street could change between runs.
postcode_mode_list = (
    df_non_null_street
    .group_by("postcode")
    .agg(
        pl.col("street").mode().sort().alias("street_modes")
    )
)

In [None]:
# One row per postcode: the first entry of its mode list becomes the street
# used for imputation; the list column itself is not carried over.
postcode_fill = postcode_mode_list.select(
    "postcode",
    pl.col("street_modes").list.first().alias("imputed_street"),
)

In [None]:
# Bring the per-postcode imputed street alongside every row of df; postcodes
# with no entry in postcode_fill get a null imputed_street (left join).
df_joined = df.join(
    other=postcode_fill,
    on="postcode",
    how="left",
)

In [None]:
# Where street is null, take the imputed street; existing street values are
# left untouched. The helper column is removed afterwards.
street_or_imputed = pl.col("street").fill_null(pl.col("imputed_street"))
df_filled = df_joined.with_columns(street_or_imputed.alias("street")).drop(
    "imputed_street"
)

In [None]:
# Per-column null counts after filling missing streets from the per-postcode mode.
df_filled.null_count()

In [None]:
# Any paon or street value that is still null gets the placeholder "N/A".
df_filled = df_filled.with_columns(
    pl.col("paon", "street").fill_null("N/A")
)

In [None]:
categorical_cols = ["property_type", "duration", "new"]

# Only touch the columns that actually exist in the frame. Each one is trimmed,
# lower-cased, and then stored as a Categorical.
present_categoricals = [
    name for name in categorical_cols if name in df_filled.columns
]
df_filled = df_filled.with_columns(
    [
        pl.col(name)
        .str.strip_chars()
        .str.to_lowercase()
        .cast(pl.Categorical)
        .alias(name)
        for name in present_categoricals
    ]
)

In [None]:
# Preview the first five rows of the cleaned frame.
df_filled.head(5)

In [None]:
# Column names and dtypes of the cleaned frame.
df_filled.schema