In [90]:
%pip install scikit-learn pandas numpy

Note: you may need to restart the kernel to use updated packages.


In [91]:
from pandas import DataFrame, read_csv

def Get_Listing_Data() -> DataFrame:
    df: DataFrame = read_csv(r"I:\Coding\2024-Ai4Good-Submission\Data\RentalProperties.csv")
    return df

def Clean_Listing_Data(df: DataFrame) -> DataFrame:
    # Null garage spaces is equal to 0 spaces
    df["Garage Spaces"] = df["Garage Spaces"].fillna(0)

    # Replace Y and N with 1 and 0
    df["Waterfront YN"] = df["Waterfront YN"].replace(to_replace={"Y": 1, "N": 0})

    # Get the numeric features
    df = df[[
        "Book Section",
        "List Price",
        "Postal Code", "Year Built",
        "Living Area", "Bedrooms Total", "Bathrooms Full", "Bathrooms Half", "Bathrooms Total",
        "Garage Spaces",
        "Waterfront YN",
    ]]

    # Drop rows with missing values
    df = df.dropna()

    return df

def Segment_Markets(df: DataFrame) -> dict[str, DataFrame]:
    # Create market segments
    segments: dict[str, DataFrame] = {
        segment: df[df["Book Section"] == segment].drop("Book Section", axis=1) for segment in df["Book Section"].unique()
    }

    # Central limit theorem
    delList: list[str] = [segment for segment in segments if len(segments[segment]) < 30]
    for segment in delList:
        del segments[segment]

    return segments

In [126]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression
from numpy import ndarray
from statistics import mean, stdev
from datetime import timedelta, date, datetime

ROOT_DATE: str = "2024-07-31"

def Train_Model(df: DataFrame) -> tuple[LinearRegression, list[str], float]:
    iFeatures: DataFrame = df.drop("List Price", axis=1)
    target: list[float] = list(df["List Price"])

    # Select the best features
    selector: SelectKBest = SelectKBest(f_regression, k=5)
    selector.fit(iFeatures, target)
    oFeatures: list[str] = list(selector.get_feature_names_out())

    # Train the model
    model: LinearRegression = LinearRegression().fit(iFeatures[oFeatures], target)
    r_score: float = model.score(iFeatures[oFeatures], target)

    return model, oFeatures, r_score

def Create_Distribution(df: DataFrame, model:LinearRegression, features: list[str]) -> dict[str, list[float]|float]:
    featureDF: DataFrame = df[features]

    expectedPrices: ndarray = model.predict(featureDF)
    differences: list[float] = list(df["List Price"] - expectedPrices)

    return {
        "values": differences,
        "mean": mean(differences),
        "stdev": stdev(differences)
    }

def Move_In_Distribution(df: DataFrame) -> dict[str, list[int]|float]:
    rootDate: date = date.fromisoformat(ROOT_DATE)
    moveInTimes: list[int] = []
    
    # Clean the data
    dateFrame: DataFrame = df.copy()
    dateFrame = dateFrame[[
       "Availability Date", "Days on Market"
    ]].dropna()

    for index, row in dateFrame.iterrows():
        # Construct a date
        dateElements: list[str] = [str(x).zfill(2) for x in (row["Availability Date"]).split("/")]
        constructedDate: date = date.fromisoformat(f"{dateElements[2]}-{dateElements[0]}-{dateElements[1]}")

        # Calculate days until move in
        listDate: date = rootDate - timedelta(row["Days on Market"])
        daysTilMoveIn: int = (constructedDate - listDate).days
        if daysTilMoveIn >= 0:
            moveInTimes.append(daysTilMoveIn)

    return {
        "values": moveInTimes,
        "mean": mean(moveInTimes),
        "stdev": stdev(moveInTimes)
    }


In [127]:
from pickle import dump, load
from os import path, makedirs
from json import dumps, loads

modelDir: str = "Models"
featureDir: str = "Features"
distributionDir: str = "Distributions"
moveInDir: str = "Move_In"

def Save_Model(segment: str, model: LinearRegression) -> None:
    filePath: str = f"{modelDir}/{segment}.pkl"

    if not path.exists(modelDir):
        makedirs(modelDir)

    file = open(filePath, "wb")
    dump(model, file)

def Load_Model(segment: str) -> LinearRegression | None:
    filePath: str = f"{modelDir}/{segment}.pkl"

    try:
        file = open(filePath, "rb")
        return load(file)
    except FileNotFoundError:
        return None
    
def Save_Features(segment: str, features: list[str]) -> None:
    filePath: str = f"{featureDir}/{segment}.json"

    if not path.exists(featureDir):
        makedirs(featureDir)

    with open(filePath, "w") as file:
        file.write(dumps(features))

def Load_Features(segment: str) -> list[str] | None:
    filePath: str = f"{featureDir}/{segment}.json"

    try:
        with open(filePath, "r") as file:
            return loads(file.read())
    except FileNotFoundError:
        return None
    
def Save_Dist(segment: str, distribution: dict[str, list[float]|float]) -> None:
    filePath: str = f"{distributionDir}/{segment}.json"

    if not path.exists(distributionDir):
        makedirs(distributionDir)

    with open(filePath, "w") as file:
        file.write(dumps(distribution))

def Load_Dist(segment: str) -> dict[str, list[float]|float] | None:
    filePath: str = f"{distributionDir}/{segment}.json"

    try:
        with open(filePath, "r") as file:
            return loads(file.read())
    except FileNotFoundError:
        return None
    
def Save_Move_In(segment, moveInData: dict[str, list[float] | float]) -> None:
    filePath: str = f"{moveInDir}/{segment}.json"

    if not path.exists(moveInDir):
        makedirs(moveInDir)

    with open(filePath, "w") as file:
        file.write(dumps(moveInData))

def Load_Move_In(segment: str) -> dict[str, list[float] | float] | None:
    filePath: str = f"{moveInDir}/{segment}.json"

    try:
        with open(filePath, "r") as file:
            return loads(file.read())
    except FileNotFoundError:
        return None    

In [128]:
data = Clean_Listing_Data(Get_Listing_Data())
segments = Segment_Markets(data)

for segment, data in segments.items():
    model, oFeatures, rScore = Train_Model(data)
    distribution = Create_Distribution(data, model, oFeatures)

    # Save data
    Save_Features(segment, oFeatures)
    Save_Model(segment, model)
    Save_Dist(segment, distribution)

    print(f"{segment}: {round((rScore * 100), 2)}% of Variance Explained")

for segment, data in Segment_Markets(Get_Listing_Data()).items():
    # Track move in dates
    moveInData: dict[str, list[int] | float] = Move_In_Distribution(data)

    # Save Data
    Save_Move_In(segment, moveInData)
    

  df["Waterfront YN"] = df["Waterfront YN"].replace(to_replace={"Y": 1, "N": 0})


Townhouse: 43.27% of Variance Explained
Single Family Residence: 41.47% of Variance Explained
Condominium: 49.3% of Variance Explained
Apartment: 61.1% of Variance Explained
Duplex: 53.2% of Variance Explained


In [129]:
from statistics import NormalDist
def Is_Too_Cheap(data: DataFrame, features: list[str], model: LinearRegression, mean: float, stdev: float, confidence: float = 0.05) -> bool:
    if data.shape[0] < 1:
        return False

    actualPrice: float = data["List Price"].values[0]
    expectedPrice: float = model.predict(data[features])[0]

    zScore: float = ((expectedPrice - actualPrice) - mean) / stdev
    percentile: float = NormalDist().cdf(zScore)
    if abs(percentile) < confidence:
        return True

    return False #(expectedPrice - actualPrice) < (mean - (stdev * deviation))

def Is_Too_Soon(data: DataFrame, mean: float, stdev: float, confidence: float = 0.05) -> bool:
        rootDate = date.fromisoformat("2024-07-31")

        try:
            # Construct a date
            dateElements: list[str] = [str(x).zfill(2) for x in (data["Availability Date"][0]).split("/")]
            constructedDate: date = date.fromisoformat(f"{dateElements[2]}-{dateElements[0]}-{dateElements[1]}")

            # Calculate day until move in
            listDate: date = rootDate - timedelta(data["Days on Market"][0])
            daysTilMoveIn: int = (constructedDate - listDate).days

            zScore: float = (daysTilMoveIn - mean) / stdev
            percentile: float = NormalDist().cdf(zScore)
            if abs(percentile) < confidence:
                return True
            return False
        
        except Exception:
            return False

In [134]:
data = Get_Listing_Data()
priceFruad: list[int] = []
timeFruad: list[int] = []

for index, row in data.iterrows():
    marketSegment: str = row["Book Section"]

    model: LinearRegression = Load_Model(marketSegment)
    features: list[str] = Load_Features(marketSegment)
    distribution: dict[str, list[float]|float] = Load_Dist(marketSegment)

    if model is None or features is None or distribution is None:
        continue

    # Validate Pricing
    isTooCheap: bool = Is_Too_Cheap(
        data=Clean_Listing_Data(row.to_frame().T),
        features=features,
        model=model, 
        mean=distribution["mean"], 
        stdev=distribution["stdev"],
        confidence=0.05
    )
    if isTooCheap:
        priceFruad.append(row['List Number'])

    # Validate Move in date
    moveInData: dict[str, list[int]|int] = Load_Move_In(marketSegment)
    isTooSoon: bool = Is_Too_Soon(
        data=row.to_frame().T,
        mean=moveInData["mean"],
        stdev=moveInData["stdev"],
        confidence=0.5
    )
    if isTooSoon:
        timeFruad.append(row['List Number'])

import os
os.system("cls")
print(f"Fraud Count: {len(priceFruad)} | {len(timeFruad)}")

  df["Garage Spaces"] = df["Garage Spaces"].fillna(0)
  df["Waterfront YN"] = df["Waterfront YN"].replace(to_replace={"Y": 1, "N": 0})
  df["Garage Spaces"] = df["Garage Spaces"].fillna(0)
  df["Waterfront YN"] = df["Waterfront YN"].replace(to_replace={"Y": 1, "N": 0})
  df["Garage Spaces"] = df["Garage Spaces"].fillna(0)
  df["Waterfront YN"] = df["Waterfront YN"].replace(to_replace={"Y": 1, "N": 0})
  df["Garage Spaces"] = df["Garage Spaces"].fillna(0)
  df["Waterfront YN"] = df["Waterfront YN"].replace(to_replace={"Y": 1, "N": 0})
  df["Garage Spaces"] = df["Garage Spaces"].fillna(0)
  df["Waterfront YN"] = df["Waterfront YN"].replace(to_replace={"Y": 1, "N": 0})
  df["Garage Spaces"] = df["Garage Spaces"].fillna(0)
  df["Waterfront YN"] = df["Waterfront YN"].replace(to_replace={"Y": 1, "N": 0})
  df["Garage Spaces"] = df["Garage Spaces"].fillna(0)
  df["Waterfront YN"] = df["Waterfront YN"].replace(to_replace={"Y": 1, "N": 0})
  df["Garage Spaces"] = df["Garage Spaces"].fillna(0)
 

Fraud Count: 84 | 1


  df["Garage Spaces"] = df["Garage Spaces"].fillna(0)
  df["Waterfront YN"] = df["Waterfront YN"].replace(to_replace={"Y": 1, "N": 0})
  df["Garage Spaces"] = df["Garage Spaces"].fillna(0)
  df["Waterfront YN"] = df["Waterfront YN"].replace(to_replace={"Y": 1, "N": 0})
  df["Garage Spaces"] = df["Garage Spaces"].fillna(0)
  df["Waterfront YN"] = df["Waterfront YN"].replace(to_replace={"Y": 1, "N": 0})
  df["Garage Spaces"] = df["Garage Spaces"].fillna(0)
  df["Waterfront YN"] = df["Waterfront YN"].replace(to_replace={"Y": 1, "N": 0})
  df["Garage Spaces"] = df["Garage Spaces"].fillna(0)
  df["Waterfront YN"] = df["Waterfront YN"].replace(to_replace={"Y": 1, "N": 0})
  df["Garage Spaces"] = df["Garage Spaces"].fillna(0)
  df["Waterfront YN"] = df["Waterfront YN"].replace(to_replace={"Y": 1, "N": 0})
  df["Garage Spaces"] = df["Garage Spaces"].fillna(0)
  df["Waterfront YN"] = df["Waterfront YN"].replace(to_replace={"Y": 1, "N": 0})
  df["Garage Spaces"] = df["Garage Spaces"].fillna(0)
 