# Import Dependencies 

In [1]:
# Data Management
import polars as pl
import numpy as np
from pandas_datareader.data import DataReader
from ta import add_all_ta_features

#Statistics
from statsmodels.tsa.stattools import adfuller

#Data Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler

# Supervised Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.ensemble import RandomForestRegressor

#Reporting
import matplotlib.pyplot as plt

# Data Ingestion 

In [2]:
# Set the file path
file_path = "/Users/okitrader/OneDrive/py_crypto_stock/SydneyHousePrices.csv"

# Read the CSV file
df = pl.read_csv(file_path)

# Display the first 5 rows
print(f'Length of Data: {len(df)}')
print(df.head())


Length of Data: 199504
shape: (5, 9)
┌────────────┬─────┬──────────────┬────────────┬───┬─────┬──────┬─────┬──────────┐
│ Date       ┆ Id  ┆ suburb       ┆ postalCode ┆ … ┆ bed ┆ bath ┆ car ┆ propType │
│ ---        ┆ --- ┆ ---          ┆ ---        ┆   ┆ --- ┆ ---  ┆ --- ┆ ---      │
│ str        ┆ i64 ┆ str          ┆ i64        ┆   ┆ f64 ┆ i64  ┆ f64 ┆ str      │
╞════════════╪═════╪══════════════╪════════════╪═══╪═════╪══════╪═════╪══════════╡
│ 2019-06-19 ┆ 1   ┆ Avalon Beach ┆ 2107       ┆ … ┆ 4.0 ┆ 2    ┆ 2.0 ┆ house    │
│ 2019-06-13 ┆ 2   ┆ Avalon Beach ┆ 2107       ┆ … ┆ 4.0 ┆ 3    ┆ 4.0 ┆ house    │
│ 2019-06-07 ┆ 3   ┆ Whale Beach  ┆ 2107       ┆ … ┆ 3.0 ┆ 3    ┆ 2.0 ┆ house    │
│ 2019-05-28 ┆ 4   ┆ Avalon Beach ┆ 2107       ┆ … ┆ 3.0 ┆ 1    ┆ 2.0 ┆ house    │
│ 2019-05-22 ┆ 5   ┆ Whale Beach  ┆ 2107       ┆ … ┆ 5.0 ┆ 4    ┆ 4.0 ┆ house    │
└────────────┴─────┴──────────────┴────────────┴───┴─────┴──────┴─────┴──────────┘


In [3]:


print(df.columns)

['Date', 'Id', 'suburb', 'postalCode', 'sellPrice', 'bed', 'bath', 'car', 'propType']


In [4]:
# Display DataFrame schema
print("Schema:")
print(df.schema)

# Display number of rows and columns
print("\nNumber of rows:", df.height)
print("Number of columns:", df.width)

# Display null counts for each column
print("\nNull counts for each column:")
null_counts = df.select([pl.col(col).is_null().sum().alias(f"{col}_null_count") for col in df.columns])
print(null_counts)


Schema:
OrderedDict({'Date': String, 'Id': Int64, 'suburb': String, 'postalCode': Int64, 'sellPrice': Int64, 'bed': Float64, 'bath': Int64, 'car': Float64, 'propType': String})

Number of rows: 199504
Number of columns: 9

Null counts for each column:
shape: (1, 9)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ Date_null ┆ Id_null_c ┆ suburb_nu ┆ postalCod ┆ … ┆ bed_null_ ┆ bath_null ┆ car_null_ ┆ propType │
│ _count    ┆ ount      ┆ ll_count  ┆ e_null_co ┆   ┆ count     ┆ _count    ┆ count     ┆ _null_co │
│ ---       ┆ ---       ┆ ---       ┆ unt       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ unt      │
│ u32       ┆ u32       ┆ u32       ┆ ---       ┆   ┆ u32       ┆ u32       ┆ u32       ┆ ---      │
│           ┆           ┆           ┆ u32       ┆   ┆           ┆           ┆           ┆ u32      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 0         ┆ 0         ┆ 0

# Feature Engineering - Common Tasks

## Handle Non-Numerical Data

In [5]:
# Count unique items for 'suburb'
suburb_text_unique = df['suburb'].unique()
suburb_text_unique_list = suburb_text_unique.to_list() # prints the full list for viewing
print('Unique Suburbs: ', len(suburb_text_unique))
print('Preform label encoding')

Unique Suburbs:  685
Preform label encoding


In [6]:
# Count unique items for propType
prop_type_text_unique = df['propType'].unique()
print('Unique Prop Types: ', len(prop_type_text_unique))
print('Preform OneHotEncoding encoding')

Unique Prop Types:  8
Preform OneHotEncoding encoding


In [7]:
# Label Encoding for 'suburb'
labelencoder = LabelEncoder()

# Perform label encoding on the 'suburb' column to convert categorical text data into numerical values
# The LabelEncoder's fit_transform method fits the encoder and returns the transformed values as a NumPy array
# This step is necessary for machine learning models which require numerical input data
encoded_suburbs = labelencoder.fit_transform(df['suburb'].to_numpy())


# Add the encoded column to the DataFrame
df = df.with_columns(pl.Series(encoded_suburbs, dtype=pl.UInt32).alias("suburbs_encoded"))

# Display the first 5 rows after encoding
print(df.head())


shape: (5, 10)
┌────────────┬─────┬──────────────┬────────────┬───┬──────┬─────┬──────────┬─────────────────┐
│ Date       ┆ Id  ┆ suburb       ┆ postalCode ┆ … ┆ bath ┆ car ┆ propType ┆ suburbs_encoded │
│ ---        ┆ --- ┆ ---          ┆ ---        ┆   ┆ ---  ┆ --- ┆ ---      ┆ ---             │
│ str        ┆ i64 ┆ str          ┆ i64        ┆   ┆ i64  ┆ f64 ┆ str      ┆ u32             │
╞════════════╪═════╪══════════════╪════════════╪═══╪══════╪═════╪══════════╪═════════════════╡
│ 2019-06-19 ┆ 1   ┆ Avalon Beach ┆ 2107       ┆ … ┆ 2    ┆ 2.0 ┆ house    ┆ 22              │
│ 2019-06-13 ┆ 2   ┆ Avalon Beach ┆ 2107       ┆ … ┆ 3    ┆ 4.0 ┆ house    ┆ 22              │
│ 2019-06-07 ┆ 3   ┆ Whale Beach  ┆ 2107       ┆ … ┆ 3    ┆ 2.0 ┆ house    ┆ 654             │
│ 2019-05-28 ┆ 4   ┆ Avalon Beach ┆ 2107       ┆ … ┆ 1    ┆ 2.0 ┆ house    ┆ 22              │
│ 2019-05-22 ┆ 5   ┆ Whale Beach  ┆ 2107       ┆ … ┆ 4    ┆ 4.0 ┆ house    ┆ 654             │
└────────────┴─────┴──────────────┴

In [9]:
# One-Hot Encoding for 'propType' using Polars
oneshot_encoded = df.with_columns([
    pl.when(pl.col('propType') == pt).then(1).otherwise(0).alias(f'pt_{pt}')
    for pt in df['propType'].cast(pl.Categorical).unique()
])

# Drop the original 'propType' column
oneshot_encoded = oneshot_encoded.drop('propType')

# Display the first 5 rows after one-hot encoding
print(oneshot_encoded.head())

# Display the list of columns to verify one-hot encoding
print("\nColumns after one-hot encoding:")
print(oneshot_encoded.columns)

# Check the unique values in one of the new columns
pt_columns = [col for col in oneshot_encoded.columns if col.startswith('pt_')]
if pt_columns:
    first_pt_column = pt_columns[0]
    print(f"\nUnique values in '{first_pt_column}' column:")
    print(oneshot_encoded[first_pt_column].unique())
else:
    print("\nNo 'pt_' columns found. One-hot encoding may have failed.")

shape: (5, 17)
┌────────────┬─────┬────────┬────────────┬───┬──────────┬────────────┬────────────────┬────────────┐
│ Date       ┆ Id  ┆ suburb ┆ postalCode ┆ … ┆ pt_other ┆ pt_terrace ┆ pt_warehouse   ┆ pt_acreage │
│ ---        ┆ --- ┆ ---    ┆ ---        ┆   ┆ ---      ┆ ---        ┆ ---            ┆ ---        │
│ str        ┆ i64 ┆ str    ┆ i64        ┆   ┆ i32      ┆ i32        ┆ i32            ┆ i32        │
╞════════════╪═════╪════════╪════════════╪═══╪══════════╪════════════╪════════════════╪════════════╡
│ 2019-06-19 ┆ 1   ┆ Avalon ┆ 2107       ┆ … ┆ 0        ┆ 0          ┆ 0              ┆ 0          │
│            ┆     ┆ Beach  ┆            ┆   ┆          ┆            ┆                ┆            │
│ 2019-06-13 ┆ 2   ┆ Avalon ┆ 2107       ┆ … ┆ 0        ┆ 0          ┆ 0              ┆ 0          │
│            ┆     ┆ Beach  ┆            ┆   ┆          ┆            ┆                ┆            │
│ 2019-06-07 ┆ 3   ┆ Whale  ┆ 2107       ┆ … ┆ 0        ┆ 0          ┆ 0    

In [14]:
df = df.join(oneshot_encoded, on='Id')

In [15]:
df.head()

Date,Id,suburb,postalCode,sellPrice,bed,bath,car,propType,suburbs_encoded,Date_right,suburb_right,postalCode_right,sellPrice_right,bed_right,bath_right,car_right,suburbs_encoded_right,pt_house,pt_townhouse,pt_duplex/semi-detached,pt_villa,pt_other,pt_terrace,pt_warehouse,pt_acreage
str,i64,str,i64,i64,f64,i64,f64,str,u32,str,str,i64,i64,f64,i64,f64,u32,i32,i32,i32,i32,i32,i32,i32,i32
"""2019-06-19""",1,"""Avalon Beach""",2107,1210000,4.0,2,2.0,"""house""",22,"""2019-06-19""","""Avalon Beach""",2107,1210000,4.0,2,2.0,22,1,0,0,0,0,0,0,0
"""2019-06-13""",2,"""Avalon Beach""",2107,2250000,4.0,3,4.0,"""house""",22,"""2019-06-13""","""Avalon Beach""",2107,2250000,4.0,3,4.0,22,1,0,0,0,0,0,0,0
"""2019-06-07""",3,"""Whale Beach""",2107,2920000,3.0,3,2.0,"""house""",654,"""2019-06-07""","""Whale Beach""",2107,2920000,3.0,3,2.0,654,1,0,0,0,0,0,0,0
"""2019-05-28""",4,"""Avalon Beach""",2107,1530000,3.0,1,2.0,"""house""",22,"""2019-05-28""","""Avalon Beach""",2107,1530000,3.0,1,2.0,22,1,0,0,0,0,0,0,0
"""2019-05-22""",5,"""Whale Beach""",2107,8000000,5.0,4,4.0,"""house""",654,"""2019-05-22""","""Whale Beach""",2107,8000000,5.0,4,4.0,654,1,0,0,0,0,0,0,0


## Set Target

In [17]:
# Create a new column 'TARGET' that's a copy of 'sellPrice'
df = df.with_columns(pl.col('sellPrice').alias('TARGET'))
print(df.head())

shape: (5, 27)
┌────────────┬─────┬────────┬────────────┬───┬────────────┬──────────────┬────────────┬─────────┐
│ Date       ┆ Id  ┆ suburb ┆ postalCode ┆ … ┆ pt_terrace ┆ pt_warehouse ┆ pt_acreage ┆ TARGET  │
│ ---        ┆ --- ┆ ---    ┆ ---        ┆   ┆ ---        ┆ ---          ┆ ---        ┆ ---     │
│ str        ┆ i64 ┆ str    ┆ i64        ┆   ┆ i32        ┆ i32          ┆ i32        ┆ i64     │
╞════════════╪═════╪════════╪════════════╪═══╪════════════╪══════════════╪════════════╪═════════╡
│ 2019-06-19 ┆ 1   ┆ Avalon ┆ 2107       ┆ … ┆ 0          ┆ 0            ┆ 0          ┆ 1210000 │
│            ┆     ┆ Beach  ┆            ┆   ┆            ┆              ┆            ┆         │
│ 2019-06-13 ┆ 2   ┆ Avalon ┆ 2107       ┆ … ┆ 0          ┆ 0            ┆ 0          ┆ 2250000 │
│            ┆     ┆ Beach  ┆            ┆   ┆            ┆              ┆            ┆         │
│ 2019-06-07 ┆ 3   ┆ Whale  ┆ 2107       ┆ … ┆ 0          ┆ 0            ┆ 0          ┆ 2920000 │
│    

In [18]:
print(df.columns)

['Date', 'Id', 'suburb', 'postalCode', 'sellPrice', 'bed', 'bath', 'car', 'propType', 'suburbs_encoded', 'Date_right', 'suburb_right', 'postalCode_right', 'sellPrice_right', 'bed_right', 'bath_right', 'car_right', 'suburbs_encoded_right', 'pt_house', 'pt_townhouse', 'pt_duplex/semi-detached', 'pt_villa', 'pt_other', 'pt_terrace', 'pt_warehouse', 'pt_acreage', 'TARGET']


## Remove Redundant Features

In [19]:
# Create a copy of the DataFrame (Polars handles this internally)
df_drop = df.clone()

# Drop the specified columns
columns_to_remove = ["Date", "Id", "suburb", "propType", "sellPrice"]
df_drop = df_drop.drop(columns=columns_to_remove)

# Display the first 5 rows after dropping the columns
print(df_drop.head())


shape: (5, 22)
┌────────────┬─────┬──────┬─────┬───┬────────────┬──────────────┬────────────┬─────────┐
│ postalCode ┆ bed ┆ bath ┆ car ┆ … ┆ pt_terrace ┆ pt_warehouse ┆ pt_acreage ┆ TARGET  │
│ ---        ┆ --- ┆ ---  ┆ --- ┆   ┆ ---        ┆ ---          ┆ ---        ┆ ---     │
│ i64        ┆ f64 ┆ i64  ┆ f64 ┆   ┆ i32        ┆ i32          ┆ i32        ┆ i64     │
╞════════════╪═════╪══════╪═════╪═══╪════════════╪══════════════╪════════════╪═════════╡
│ 2107       ┆ 4.0 ┆ 2    ┆ 2.0 ┆ … ┆ 0          ┆ 0            ┆ 0          ┆ 1210000 │
│ 2107       ┆ 4.0 ┆ 3    ┆ 4.0 ┆ … ┆ 0          ┆ 0            ┆ 0          ┆ 2250000 │
│ 2107       ┆ 3.0 ┆ 3    ┆ 2.0 ┆ … ┆ 0          ┆ 0            ┆ 0          ┆ 2920000 │
│ 2107       ┆ 3.0 ┆ 1    ┆ 2.0 ┆ … ┆ 0          ┆ 0            ┆ 0          ┆ 1530000 │
│ 2107       ┆ 5.0 ┆ 4    ┆ 4.0 ┆ … ┆ 0          ┆ 0            ┆ 0          ┆ 8000000 │
└────────────┴─────┴──────┴─────┴───┴────────────┴──────────────┴────────────┴─────────┘


  df_drop = df_drop.drop(columns=columns_to_remove)


## Check null or inf values

In [35]:
import numpy as np

# Check for Null values across the DataFrame
is_null = df.with_columns(pl.all().is_null().any()).sum(axis=1).sum() > 0
print("Is Null: ", is_null)



Is Null:  True


  is_null = df.with_columns(pl.all().is_null().any()).sum(axis=1).sum() > 0


In [32]:
import polars as pl

# Assuming 'df' is your existing Polars DataFrame
# Check for Null values
contains_null = df.select(pl.any_horizontal(pl.all().is_null().any())).item()

# Print the result for Null values
print("Is Null:", contains_null)


Is Null: True


In [33]:
import polars as pl

# Check for Infinite values, only applying it to numeric columns
#this will error if you try to run on string columns
contains_inf = df.select(
    pl.any_horizontal(
        pl.col(pl.Float64).is_infinite().any()
    )
).item()

# Print the result for Infinite values
print("Is Inf:", contains_inf)


Is Inf: False


In [40]:
# Calculate means for numeric columns
numeric_cols = df.select(pl.col(pl.NUMERIC_DTYPES)).columns
column_means = df.select([pl.col(col).mean() for col in numeric_cols])

# Fill NA values with means for numeric columns
df_filled = df.with_columns([
    pl.col(col).fill_null(column_means.get_column(col)[0])
    for col in numeric_cols
])

print(df_filled.head())

shape: (5, 27)
┌────────────┬─────┬─────────────┬────────────┬───┬────────────┬─────────────┬────────────┬────────┐
│ Date       ┆ Id  ┆ suburb      ┆ postalCode ┆ … ┆ pt_terrace ┆ pt_warehous ┆ pt_acreage ┆ TARGET │
│ ---        ┆ --- ┆ ---         ┆ ---        ┆   ┆ ---        ┆ e           ┆ ---        ┆ ---    │
│ str        ┆ f64 ┆ str         ┆ f64        ┆   ┆ f64        ┆ ---         ┆ f64        ┆ f64    │
│            ┆     ┆             ┆            ┆   ┆            ┆ f64         ┆            ┆        │
╞════════════╪═════╪═════════════╪════════════╪═══╪════════════╪═════════════╪════════════╪════════╡
│ 2019-06-19 ┆ 1.0 ┆ Avalon      ┆ 2107.0     ┆ … ┆ 0.0        ┆ 0.0         ┆ 0.0        ┆ 1.21e6 │
│            ┆     ┆ Beach       ┆            ┆   ┆            ┆             ┆            ┆        │
│ 2019-06-13 ┆ 2.0 ┆ Avalon      ┆ 2107.0     ┆ … ┆ 0.0        ┆ 0.0         ┆ 0.0        ┆ 2.25e6 │
│            ┆     ┆ Beach       ┆            ┆   ┆            ┆            

In [41]:
import polars as pl

# Assuming 'df' is your existing Polars DataFrame
# Check for Null values
contains_null = df_filled.select(pl.any_horizontal(pl.all().is_null().any())).item()

# Print the result for Null values
print("Is Null:", contains_null)


Is Null: False


## Feature Scaling - Min Max Scaling

In [44]:

# Filtering out non-numeric columns to avoid conversion errors
# In this example, 'date' column is assumed to be the non-numeric field that needs to be excluded
numeric_df = df.select(pl.col("*").exclude(["date"]))  # Replace "date" with the actual name of your non-numeric column

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Convert the numeric DataFrame to a numpy array, apply scaling, and convert back to DataFrame
# This step involves only numeric columns
scaled_data = scaler.fit_transform(numeric_df.to_numpy())
df_scaled = pl.DataFrame(scaled_data, columns=numeric_df.columns)

# Adding back the excluded column if needed
# If you need to retain the non-numeric data alongside the scaled numeric data, you can join it back
df_scaled = df_scaled.with_column(df.select("date"))  # Adjust this if you have more than one non-numeric column

# Display the scaled DataFrame
print(df_scaled.head())


ValueError: could not convert string to float: '2019-06-19'