# Import Dependencies 

In [1]:
# Data Management
import polars as pl
import numpy as np
from pandas_datareader.data import DataReader
from ta import add_all_ta_features

#Statistics
from statsmodels.tsa.stattools import adfuller

#Data Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler

# Supervised Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.ensemble import RandomForestRegressor

#Reporting
import matplotlib.pyplot as plt

# Data Ingestion 

In [2]:
# Set the file path
file_path = "/Users/okitrader/OneDrive/py_crypto_stock/SydneyHousePrices.csv"

# Read the CSV file
df = pl.read_csv(file_path)

# Display the first 5 rows
print(f'Length of Data: {len(df)}')
print(df.head())


Length of Data: 199504
shape: (5, 9)
┌────────────┬─────┬──────────────┬────────────┬───┬─────┬──────┬─────┬──────────┐
│ Date       ┆ Id  ┆ suburb       ┆ postalCode ┆ … ┆ bed ┆ bath ┆ car ┆ propType │
│ ---        ┆ --- ┆ ---          ┆ ---        ┆   ┆ --- ┆ ---  ┆ --- ┆ ---      │
│ str        ┆ i64 ┆ str          ┆ i64        ┆   ┆ f64 ┆ i64  ┆ f64 ┆ str      │
╞════════════╪═════╪══════════════╪════════════╪═══╪═════╪══════╪═════╪══════════╡
│ 2019-06-19 ┆ 1   ┆ Avalon Beach ┆ 2107       ┆ … ┆ 4.0 ┆ 2    ┆ 2.0 ┆ house    │
│ 2019-06-13 ┆ 2   ┆ Avalon Beach ┆ 2107       ┆ … ┆ 4.0 ┆ 3    ┆ 4.0 ┆ house    │
│ 2019-06-07 ┆ 3   ┆ Whale Beach  ┆ 2107       ┆ … ┆ 3.0 ┆ 3    ┆ 2.0 ┆ house    │
│ 2019-05-28 ┆ 4   ┆ Avalon Beach ┆ 2107       ┆ … ┆ 3.0 ┆ 1    ┆ 2.0 ┆ house    │
│ 2019-05-22 ┆ 5   ┆ Whale Beach  ┆ 2107       ┆ … ┆ 5.0 ┆ 4    ┆ 4.0 ┆ house    │
└────────────┴─────┴──────────────┴────────────┴───┴─────┴──────┴─────┴──────────┘


In [3]:


print(df.columns)

['Date', 'Id', 'suburb', 'postalCode', 'sellPrice', 'bed', 'bath', 'car', 'propType']


In [4]:
# Display DataFrame schema
print("Schema:")
print(df.schema)

# Display number of rows and columns
print("\nNumber of rows:", df.height)
print("Number of columns:", df.width)

# Display null counts for each column
print("\nNull counts for each column:")
null_counts = df.select([pl.col(col).is_null().sum().alias(f"{col}_null_count") for col in df.columns])
print(null_counts)


Schema:
OrderedDict({'Date': String, 'Id': Int64, 'suburb': String, 'postalCode': Int64, 'sellPrice': Int64, 'bed': Float64, 'bath': Int64, 'car': Float64, 'propType': String})

Number of rows: 199504
Number of columns: 9

Null counts for each column:
shape: (1, 9)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ Date_null ┆ Id_null_c ┆ suburb_nu ┆ postalCod ┆ … ┆ bed_null_ ┆ bath_null ┆ car_null_ ┆ propType │
│ _count    ┆ ount      ┆ ll_count  ┆ e_null_co ┆   ┆ count     ┆ _count    ┆ count     ┆ _null_co │
│ ---       ┆ ---       ┆ ---       ┆ unt       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ unt      │
│ u32       ┆ u32       ┆ u32       ┆ ---       ┆   ┆ u32       ┆ u32       ┆ u32       ┆ ---      │
│           ┆           ┆           ┆ u32       ┆   ┆           ┆           ┆           ┆ u32      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 0         ┆ 0         ┆ 0

# Feature Engineering - Common Tasks

## Handle Non-Numerical Data

In [5]:
# Count unique items for 'suburb'
suburb_text_unique = df['suburb'].unique()
suburb_text_unique_list = suburb_text_unique.to_list() # prints the full list for viewing
print('Unique Suburbs: ', len(suburb_text_unique))
print('Preform label encoding')

Unique Suburbs:  685
Preform label encoding


In [6]:
# Count unique items for propType
prop_type_text_unique = df['propType'].unique()
print('Unique Prop Types: ', len(prop_type_text_unique))
print('Preform OneHotEncoding encoding')

Unique Prop Types:  8
Preform OneHotEncoding encoding


In [7]:
# Label Encoding for 'suburb'
labelencoder = LabelEncoder()

# Perform label encoding on the 'suburb' column to convert categorical text data into numerical values
# The LabelEncoder's fit_transform method fits the encoder and returns the transformed values as a NumPy array
# This step is necessary for machine learning models which require numerical input data
encoded_suburbs = labelencoder.fit_transform(df['suburb'].to_numpy())


# Add the encoded column to the DataFrame
df = df.with_columns(pl.Series(encoded_suburbs, dtype=pl.UInt32).alias("suburbs_encoded"))

# Display the first 5 rows after encoding
print(df.head())


shape: (5, 10)
┌────────────┬─────┬──────────────┬────────────┬───┬──────┬─────┬──────────┬─────────────────┐
│ Date       ┆ Id  ┆ suburb       ┆ postalCode ┆ … ┆ bath ┆ car ┆ propType ┆ suburbs_encoded │
│ ---        ┆ --- ┆ ---          ┆ ---        ┆   ┆ ---  ┆ --- ┆ ---      ┆ ---             │
│ str        ┆ i64 ┆ str          ┆ i64        ┆   ┆ i64  ┆ f64 ┆ str      ┆ u32             │
╞════════════╪═════╪══════════════╪════════════╪═══╪══════╪═════╪══════════╪═════════════════╡
│ 2019-06-19 ┆ 1   ┆ Avalon Beach ┆ 2107       ┆ … ┆ 2    ┆ 2.0 ┆ house    ┆ 22              │
│ 2019-06-13 ┆ 2   ┆ Avalon Beach ┆ 2107       ┆ … ┆ 3    ┆ 4.0 ┆ house    ┆ 22              │
│ 2019-06-07 ┆ 3   ┆ Whale Beach  ┆ 2107       ┆ … ┆ 3    ┆ 2.0 ┆ house    ┆ 654             │
│ 2019-05-28 ┆ 4   ┆ Avalon Beach ┆ 2107       ┆ … ┆ 1    ┆ 2.0 ┆ house    ┆ 22              │
│ 2019-05-22 ┆ 5   ┆ Whale Beach  ┆ 2107       ┆ … ┆ 4    ┆ 4.0 ┆ house    ┆ 654             │
└────────────┴─────┴──────────────┴

In [8]:
# One-Hot Encoding for 'propType' using Polars
oneshot_encoded = df.with_columns([
    pl.when(pl.col('propType') == pt).then(1).otherwise(0).alias(f'pt_{pt}')
    for pt in df['propType'].cast(pl.Categorical).unique()
])

# Drop the original 'propType' column
oneshot_encoded = oneshot_encoded.drop('propType')

# Display the first 5 rows after one-hot encoding
print(oneshot_encoded.head())

# Display the list of columns to verify one-hot encoding
print("\nColumns after one-hot encoding:")
print(oneshot_encoded.columns)

# Check the unique values in one of the new columns
pt_columns = [col for col in oneshot_encoded.columns if col.startswith('pt_')]
if pt_columns:
    first_pt_column = pt_columns[0]
    print(f"\nUnique values in '{first_pt_column}' column:")
    print(oneshot_encoded[first_pt_column].unique())
else:
    print("\nNo 'pt_' columns found. One-hot encoding may have failed.")

shape: (5, 17)
┌────────────┬─────┬────────┬────────────┬───┬──────────┬────────────┬────────────────┬────────────┐
│ Date       ┆ Id  ┆ suburb ┆ postalCode ┆ … ┆ pt_other ┆ pt_terrace ┆ pt_warehouse   ┆ pt_acreage │
│ ---        ┆ --- ┆ ---    ┆ ---        ┆   ┆ ---      ┆ ---        ┆ ---            ┆ ---        │
│ str        ┆ i64 ┆ str    ┆ i64        ┆   ┆ i32      ┆ i32        ┆ i32            ┆ i32        │
╞════════════╪═════╪════════╪════════════╪═══╪══════════╪════════════╪════════════════╪════════════╡
│ 2019-06-19 ┆ 1   ┆ Avalon ┆ 2107       ┆ … ┆ 0        ┆ 0          ┆ 0              ┆ 0          │
│            ┆     ┆ Beach  ┆            ┆   ┆          ┆            ┆                ┆            │
│ 2019-06-13 ┆ 2   ┆ Avalon ┆ 2107       ┆ … ┆ 0        ┆ 0          ┆ 0              ┆ 0          │
│            ┆     ┆ Beach  ┆            ┆   ┆          ┆            ┆                ┆            │
│ 2019-06-07 ┆ 3   ┆ Whale  ┆ 2107       ┆ … ┆ 0        ┆ 0          ┆ 0    

In [9]:
df = df.join(oneshot_encoded, on='Id')

In [10]:
df.head()

Date,Id,suburb,postalCode,sellPrice,bed,bath,car,propType,suburbs_encoded,Date_right,suburb_right,postalCode_right,sellPrice_right,bed_right,bath_right,car_right,suburbs_encoded_right,pt_house,pt_townhouse,pt_duplex/semi-detached,pt_villa,pt_other,pt_terrace,pt_warehouse,pt_acreage
str,i64,str,i64,i64,f64,i64,f64,str,u32,str,str,i64,i64,f64,i64,f64,u32,i32,i32,i32,i32,i32,i32,i32,i32
"""2019-06-19""",1,"""Avalon Beach""",2107,1210000,4.0,2,2.0,"""house""",22,"""2019-06-19""","""Avalon Beach""",2107,1210000,4.0,2,2.0,22,1,0,0,0,0,0,0,0
"""2019-06-13""",2,"""Avalon Beach""",2107,2250000,4.0,3,4.0,"""house""",22,"""2019-06-13""","""Avalon Beach""",2107,2250000,4.0,3,4.0,22,1,0,0,0,0,0,0,0
"""2019-06-07""",3,"""Whale Beach""",2107,2920000,3.0,3,2.0,"""house""",654,"""2019-06-07""","""Whale Beach""",2107,2920000,3.0,3,2.0,654,1,0,0,0,0,0,0,0
"""2019-05-28""",4,"""Avalon Beach""",2107,1530000,3.0,1,2.0,"""house""",22,"""2019-05-28""","""Avalon Beach""",2107,1530000,3.0,1,2.0,22,1,0,0,0,0,0,0,0
"""2019-05-22""",5,"""Whale Beach""",2107,8000000,5.0,4,4.0,"""house""",654,"""2019-05-22""","""Whale Beach""",2107,8000000,5.0,4,4.0,654,1,0,0,0,0,0,0,0


## Set Target

In [11]:
# Create a new column 'TARGET' that's a copy of 'sellPrice'
df = df.with_columns(pl.col('sellPrice').alias('TARGET'))
print(df.head())

shape: (5, 27)
┌────────────┬─────┬────────┬────────────┬───┬────────────┬──────────────┬────────────┬─────────┐
│ Date       ┆ Id  ┆ suburb ┆ postalCode ┆ … ┆ pt_terrace ┆ pt_warehouse ┆ pt_acreage ┆ TARGET  │
│ ---        ┆ --- ┆ ---    ┆ ---        ┆   ┆ ---        ┆ ---          ┆ ---        ┆ ---     │
│ str        ┆ i64 ┆ str    ┆ i64        ┆   ┆ i32        ┆ i32          ┆ i32        ┆ i64     │
╞════════════╪═════╪════════╪════════════╪═══╪════════════╪══════════════╪════════════╪═════════╡
│ 2019-06-19 ┆ 1   ┆ Avalon ┆ 2107       ┆ … ┆ 0          ┆ 0            ┆ 0          ┆ 1210000 │
│            ┆     ┆ Beach  ┆            ┆   ┆            ┆              ┆            ┆         │
│ 2019-06-13 ┆ 2   ┆ Avalon ┆ 2107       ┆ … ┆ 0          ┆ 0            ┆ 0          ┆ 2250000 │
│            ┆     ┆ Beach  ┆            ┆   ┆            ┆              ┆            ┆         │
│ 2019-06-07 ┆ 3   ┆ Whale  ┆ 2107       ┆ … ┆ 0          ┆ 0            ┆ 0          ┆ 2920000 │
│    

In [12]:
print(df.columns)

['Date', 'Id', 'suburb', 'postalCode', 'sellPrice', 'bed', 'bath', 'car', 'propType', 'suburbs_encoded', 'Date_right', 'suburb_right', 'postalCode_right', 'sellPrice_right', 'bed_right', 'bath_right', 'car_right', 'suburbs_encoded_right', 'pt_house', 'pt_townhouse', 'pt_duplex/semi-detached', 'pt_villa', 'pt_other', 'pt_terrace', 'pt_warehouse', 'pt_acreage', 'TARGET']


## Remove Redundant Features

In [13]:
# Create a copy of the DataFrame (Polars handles this internally)
df_drop = df.clone()

# Drop the specified columns
columns_to_remove = ["Date", "Id", "suburb", "propType", "sellPrice"]
df_drop = df_drop.drop(columns=columns_to_remove)

# Display the first 5 rows after dropping the columns
print(df_drop.head())


shape: (5, 22)
┌────────────┬─────┬──────┬─────┬───┬────────────┬──────────────┬────────────┬─────────┐
│ postalCode ┆ bed ┆ bath ┆ car ┆ … ┆ pt_terrace ┆ pt_warehouse ┆ pt_acreage ┆ TARGET  │
│ ---        ┆ --- ┆ ---  ┆ --- ┆   ┆ ---        ┆ ---          ┆ ---        ┆ ---     │
│ i64        ┆ f64 ┆ i64  ┆ f64 ┆   ┆ i32        ┆ i32          ┆ i32        ┆ i64     │
╞════════════╪═════╪══════╪═════╪═══╪════════════╪══════════════╪════════════╪═════════╡
│ 2107       ┆ 4.0 ┆ 2    ┆ 2.0 ┆ … ┆ 0          ┆ 0            ┆ 0          ┆ 1210000 │
│ 2107       ┆ 4.0 ┆ 3    ┆ 4.0 ┆ … ┆ 0          ┆ 0            ┆ 0          ┆ 2250000 │
│ 2107       ┆ 3.0 ┆ 3    ┆ 2.0 ┆ … ┆ 0          ┆ 0            ┆ 0          ┆ 2920000 │
│ 2107       ┆ 3.0 ┆ 1    ┆ 2.0 ┆ … ┆ 0          ┆ 0            ┆ 0          ┆ 1530000 │
│ 2107       ┆ 5.0 ┆ 4    ┆ 4.0 ┆ … ┆ 0          ┆ 0            ┆ 0          ┆ 8000000 │
└────────────┴─────┴──────┴─────┴───┴────────────┴──────────────┴────────────┴─────────┘


  df_drop = df_drop.drop(columns=columns_to_remove)


## Check null or inf values

In [14]:
import numpy as np

# Check for Null values across the DataFrame
is_null = df.with_columns(pl.all().is_null().any()).sum(axis=1).sum() > 0
print("Is Null: ", is_null)



Is Null:  True


  is_null = df.with_columns(pl.all().is_null().any()).sum(axis=1).sum() > 0


In [15]:
import polars as pl

# Assuming 'df' is your existing Polars DataFrame
# Check for Null values
contains_null = df.select(pl.any_horizontal(pl.all().is_null().any())).item()

# Print the result for Null values
print("Is Null:", contains_null)


Is Null: True


In [16]:
import polars as pl

# Check for Infinite values, only applying it to numeric columns
#this will error if you try to run on string columns
contains_inf = df.select(
    pl.any_horizontal(
        pl.col(pl.Float64).is_infinite().any()
    )
).item()

# Print the result for Infinite values
print("Is Inf:", contains_inf)


Is Inf: False


In [17]:
# Calculate means for numeric columns
numeric_cols = df.select(pl.col(pl.NUMERIC_DTYPES)).columns
column_means = df.select([pl.col(col).mean() for col in numeric_cols])

# Fill NA values with means for numeric columns
df_filled = df.with_columns([
    pl.col(col).fill_null(column_means.get_column(col)[0])
    for col in numeric_cols
])

print(df_filled.head())

shape: (5, 27)
┌────────────┬─────┬─────────────┬────────────┬───┬────────────┬─────────────┬────────────┬────────┐
│ Date       ┆ Id  ┆ suburb      ┆ postalCode ┆ … ┆ pt_terrace ┆ pt_warehous ┆ pt_acreage ┆ TARGET │
│ ---        ┆ --- ┆ ---         ┆ ---        ┆   ┆ ---        ┆ e           ┆ ---        ┆ ---    │
│ str        ┆ f64 ┆ str         ┆ f64        ┆   ┆ f64        ┆ ---         ┆ f64        ┆ f64    │
│            ┆     ┆             ┆            ┆   ┆            ┆ f64         ┆            ┆        │
╞════════════╪═════╪═════════════╪════════════╪═══╪════════════╪═════════════╪════════════╪════════╡
│ 2019-06-19 ┆ 1.0 ┆ Avalon      ┆ 2107.0     ┆ … ┆ 0.0        ┆ 0.0         ┆ 0.0        ┆ 1.21e6 │
│            ┆     ┆ Beach       ┆            ┆   ┆            ┆             ┆            ┆        │
│ 2019-06-13 ┆ 2.0 ┆ Avalon      ┆ 2107.0     ┆ … ┆ 0.0        ┆ 0.0         ┆ 0.0        ┆ 2.25e6 │
│            ┆     ┆ Beach       ┆            ┆   ┆            ┆            

In [18]:
import polars as pl

# Assuming 'df' is your existing Polars DataFrame
# Check for Null values
contains_null = df_filled.select(pl.any_horizontal(pl.all().is_null().any())).item()

# Print the result for Null values
print("Is Null:", contains_null)



Is Null: False


## Remove Redundat Features

In [19]:
print(df_filled.columns)

['Date', 'Id', 'suburb', 'postalCode', 'sellPrice', 'bed', 'bath', 'car', 'propType', 'suburbs_encoded', 'Date_right', 'suburb_right', 'postalCode_right', 'sellPrice_right', 'bed_right', 'bath_right', 'car_right', 'suburbs_encoded_right', 'pt_house', 'pt_townhouse', 'pt_duplex/semi-detached', 'pt_villa', 'pt_other', 'pt_terrace', 'pt_warehouse', 'pt_acreage', 'TARGET']


In [20]:
# Remove features
df_drop = df_filled.drop(["Date","Date_right", "Id", "suburb", "suburb_right", "propType", "sellPrice_right", "sellPrice"])

print(df_drop.head())

shape: (5, 19)
┌────────────┬─────┬──────┬─────┬───┬────────────┬──────────────┬────────────┬────────┐
│ postalCode ┆ bed ┆ bath ┆ car ┆ … ┆ pt_terrace ┆ pt_warehouse ┆ pt_acreage ┆ TARGET │
│ ---        ┆ --- ┆ ---  ┆ --- ┆   ┆ ---        ┆ ---          ┆ ---        ┆ ---    │
│ f64        ┆ f64 ┆ f64  ┆ f64 ┆   ┆ f64        ┆ f64          ┆ f64        ┆ f64    │
╞════════════╪═════╪══════╪═════╪═══╪════════════╪══════════════╪════════════╪════════╡
│ 2107.0     ┆ 4.0 ┆ 2.0  ┆ 2.0 ┆ … ┆ 0.0        ┆ 0.0          ┆ 0.0        ┆ 1.21e6 │
│ 2107.0     ┆ 4.0 ┆ 3.0  ┆ 4.0 ┆ … ┆ 0.0        ┆ 0.0          ┆ 0.0        ┆ 2.25e6 │
│ 2107.0     ┆ 3.0 ┆ 3.0  ┆ 2.0 ┆ … ┆ 0.0        ┆ 0.0          ┆ 0.0        ┆ 2.92e6 │
│ 2107.0     ┆ 3.0 ┆ 1.0  ┆ 2.0 ┆ … ┆ 0.0        ┆ 0.0          ┆ 0.0        ┆ 1.53e6 │
│ 2107.0     ┆ 5.0 ┆ 4.0  ┆ 4.0 ┆ … ┆ 0.0        ┆ 0.0          ┆ 0.0        ┆ 8e6    │
└────────────┴─────┴──────┴─────┴───┴────────────┴──────────────┴────────────┴────────┘


In [21]:
print(df_drop.columns)

['postalCode', 'bed', 'bath', 'car', 'suburbs_encoded', 'postalCode_right', 'bed_right', 'bath_right', 'car_right', 'suburbs_encoded_right', 'pt_house', 'pt_townhouse', 'pt_duplex/semi-detached', 'pt_villa', 'pt_other', 'pt_terrace', 'pt_warehouse', 'pt_acreage', 'TARGET']


## Feature Scaling - Min Max Scaling

In [22]:
import polars as pl

# Assuming df_drop is your Polars DataFrame after dropping unnecessary columns
df_scaling = df_drop.clone()  # Create a copy of df_drop

# Function to check if a column is numeric
def is_numeric(col):
    return col.dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64, pl.Float32, pl.Float64]

# Convert date columns to timestamps, other string columns to categorical
for col in df_scaling.columns:
    if df_scaling[col].dtype == pl.Utf8:
        if "Date" in col:  # Assuming date columns have "Date" in their name
            df_scaling = df_scaling.with_columns(pl.col(col).str.to_date().cast(pl.Int64).alias(col))
        else:
            df_scaling = df_scaling.with_columns(pl.col(col).cast(pl.Categorical).cast(pl.Int32).alias(col))

# Apply min-max scaling to all numeric columns except TARGET
columns_to_scale = [col for col in df_scaling.columns if col != "TARGET" and is_numeric(df_scaling[col])]

df_scaling = df_scaling.with_columns([
    ((pl.col(col) - pl.col(col).min()) / (pl.col(col).max() - pl.col(col).min())).alias(col)
    for col in columns_to_scale
])

print(df_scaling.head())
print(df_scaling.dtypes)


shape: (5, 19)
┌────────────┬──────────┬──────────┬───────┬───┬────────────┬──────────────┬────────────┬────────┐
│ postalCode ┆ bed      ┆ bath     ┆ car   ┆ … ┆ pt_terrace ┆ pt_warehouse ┆ pt_acreage ┆ TARGET │
│ ---        ┆ ---      ┆ ---      ┆ ---   ┆   ┆ ---        ┆ ---          ┆ ---        ┆ ---    │
│ f64        ┆ f64      ┆ f64      ┆ f64   ┆   ┆ f64        ┆ f64          ┆ f64        ┆ f64    │
╞════════════╪══════════╪══════════╪═══════╪═══╪════════════╪══════════════╪════════════╪════════╡
│ 0.037179   ┆ 0.030612 ┆ 0.010204 ┆ 0.025 ┆ … ┆ 0.0        ┆ 0.0          ┆ 0.0        ┆ 1.21e6 │
│ 0.037179   ┆ 0.030612 ┆ 0.020408 ┆ 0.075 ┆ … ┆ 0.0        ┆ 0.0          ┆ 0.0        ┆ 2.25e6 │
│ 0.037179   ┆ 0.020408 ┆ 0.020408 ┆ 0.025 ┆ … ┆ 0.0        ┆ 0.0          ┆ 0.0        ┆ 2.92e6 │
│ 0.037179   ┆ 0.020408 ┆ 0.0      ┆ 0.025 ┆ … ┆ 0.0        ┆ 0.0          ┆ 0.0        ┆ 1.53e6 │
│ 0.037179   ┆ 0.040816 ┆ 0.030612 ┆ 0.075 ┆ … ┆ 0.0        ┆ 0.0          ┆ 0.0        ┆ 8e6 

## Train Test Split

In [23]:
# Use correct Dataframe
is_deep_learning = False
df_tts = df_scaling.clone() if is_deep_learning else df_drop.clone()
df_tts.head(3)

postalCode,bed,bath,car,suburbs_encoded,postalCode_right,bed_right,bath_right,car_right,suburbs_encoded_right,pt_house,pt_townhouse,pt_duplex/semi-detached,pt_villa,pt_other,pt_terrace,pt_warehouse,pt_acreage,TARGET
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2107.0,4.0,2.0,2.0,22.0,2107.0,4.0,2.0,2.0,22.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1210000.0
2107.0,4.0,3.0,4.0,22.0,2107.0,4.0,3.0,4.0,22.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2250000.0
2107.0,3.0,3.0,2.0,654.0,2107.0,3.0,3.0,2.0,654.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2920000.0


In [24]:
import polars as pl

# Assuming df_tts is already defined as a Polars DataFrame

# Split X and y data
# X will contain all columns except the last one
X = df_tts[:, :-1].to_numpy()

# y will contain only the last column
y = df_tts[:, -1].to_numpy()

# Print the first 2 rows of X
print("X Values: \n", X[:2])

# Print the first 5 rows of y
print("y Values: \n", y[:5])


X Values: 
 [[2.107e+03 4.000e+00 2.000e+00 2.000e+00 2.200e+01 2.107e+03 4.000e+00
  2.000e+00 2.000e+00 2.200e+01 1.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00]
 [2.107e+03 4.000e+00 3.000e+00 4.000e+00 2.200e+01 2.107e+03 4.000e+00
  3.000e+00 4.000e+00 2.200e+01 1.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00]]
y Values: 
 [1210000. 2250000. 2920000. 1530000. 8000000.]


In [25]:
# Train Test Split
# x_train and y_train are the training datasets
# X_test and y_test are the testing datasets
# test_size=0.1 means 10% of the data will be used for testing
# random_state=1 ensures reproducibility of the split
# shuffle=True ensures the data is shuffled before splitting; you can decide if you want recent data or not
x_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1, shuffle=True)

# Print the shapes of the resulting datasets
print("x_train shape: ", x_train.shape)
print("X_test shape: ", X_test.shape)
print("y_train shape: ", y_train.shape)
print("y_test shape: ", y_test.shape)

x_train shape:  (179553, 18)
X_test shape:  (19951, 18)
y_train shape:  (179553,)
y_test shape:  (19951,)


# Machine Learning

In [26]:
# Import the RandomForestRegressor from sklearn.ensemble
from sklearn.ensemble import RandomForestRegressor # a random forest regressor

# Train Regressor
# Initializing the RandomForestRegressor with the following parameters:
# n_estimators=100: The number of trees in the forest.
# max_depth=10: The maximum depth of each tree.
# random_state=0: Ensures reproducibility of the results by setting the seed.
regressor = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=0)

# Now, you can fit the regressor to your training data
# regressor.fit(x_train, y_train)


In [27]:
dir(RandomForestRegressor)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_compute_oob_predictions',
 '_compute_partial_dependence_recursion',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_estimator_type',
 '_get_default_requests',
 '_get_doc_link',
 '_get_estimators_indices',
 '_get_metadata_request',
 '_get_oob_predictions',
 '_get_param_names',
 '_get_tags',
 '_make_estimator',
 '_more_tags',
 '_parameter_constraints',
 '_rep

In [28]:
# Now, you can fit the regressor to your training data
regressor.fit(x_train, y_train)

In [31]:
# make Predictions on Test Set
y_pred = regressor.predict(X_test)
y_pred = [round(x, 0) for x in y_pred] # doing a for loop in a list
print("Test Predictions ", y_pred[:5]) # gets the first 5
print("Test Actuals ", y_test[:5]) # gets the first 5

Test Predictions  [593681.0, 2022675.0, 1112679.0, 1045638.0, 869990.0]
Test Actuals  [ 730000. 1350100.  860000. 1390000.  985000.]


In [32]:
from sklearn.model_selection import RepeatedKFold, cross_val_score

# Check Accuracy
# Define the cross-validation strategy
# n_splits=5: Number of folds in each round of cross-validation
# n_repeats=3: Number of times the cross-validation is repeated
# random_state=1: Seed for reproducibility
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

# Perform cross-validation
# regressor: The model to evaluate
# x_train, y_train: The training data
# scoring="neg_mean_absolute_error": Evaluation metric (negative mean absolute error)
# cv=cv: Cross-validation strategy defined above
# n_jobs=-1: Use all available processors for computation
# error_score="raise": Raise an error if one occurs
n_scores = cross_val_score(regressor, x_train, y_train, scoring="neg_mean_absolute_error", cv=cv, n_jobs=-1, error_score="raise")

# Print the cross-validation scores
print("Cross-validation scores:", n_scores)


Cross-validation scores: [-399498.62273728 -372750.85835856 -405046.15401898 -390448.23879278
 -378576.3210264  -373998.60345864 -406443.36539599 -357545.94796804
 -416258.29409479 -386629.6352343  -367704.32557829 -379682.49451279
 -363764.21540351 -417977.97095531 -421167.94399914]


In [34]:
# Reoport Performance
print("MAE Avg: ", abs(n_scores.mean()))
print("MAE Std: ", n_scores.std())

MAE Avg:  389166.1994356542
MAE Std:  20101.47639828163
