# House Prices - Advanced Regression Techniques

You must have joined the competition on Kaggle before running this notebook.
https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/overview

In [None]:
import pandas as pd
import os
import zipfile
import kaggle

# Where the raw competition files live and where the preprocessed CSV is written.
DATASET_DIR = ".dataset/house_prices/"
EXPORT_DIR = ".dataset/preprocessed/house_prices/"

# Kaggle competition slug; the downloaded archive is expected at "<slug>.zip".
COMPETITION_NAME = "house-prices-advanced-regression-techniques"
# Files the later cells read from DATASET_DIR.
REQUIRED_FILES = ("train.csv", "test.csv", "sample_submission.csv")

# Download when any required CSV is missing, not merely when the directory is
# absent: the directory is created *before* the download, so an interrupted
# download or extraction would otherwise leave an empty directory behind and
# silently disable every later attempt.
if not all(
    os.path.exists(os.path.join(DATASET_DIR, file_name))
    for file_name in REQUIRED_FILES
):
    os.makedirs(DATASET_DIR, exist_ok=True)
    kaggle.api.competition_download_files(
        competition=COMPETITION_NAME,
        path=DATASET_DIR,
        quiet=False,
    )
    archive_path = os.path.join(DATASET_DIR, COMPETITION_NAME + ".zip")
    with zipfile.ZipFile(archive_path, "r") as zip_ref:
        zip_ref.extractall(DATASET_DIR)
    # The archive is no longer needed once its contents are extracted.
    os.remove(archive_path)

In [39]:
def combine_sample_submission_and_test(
    test_df: pd.DataFrame, sample_submission_df: pd.DataFrame
) -> pd.DataFrame:
    """Outer-join the test rows with the sample submission on ``Id``.

    Args:
        test_df: Frame with an ``Id`` column (left side of the join).
        sample_submission_df: Frame with an ``Id`` column (right side).

    Returns:
        A new frame holding the columns of both inputs; an ``Id`` present in
        only one input is kept, with missing values for the other side.
    """
    return test_df.merge(sample_submission_df, how="outer", on="Id")


def connect_dataframes(a: pd.DataFrame, b: pd.DataFrame) -> pd.DataFrame:
    """Stack ``b`` underneath ``a`` and renumber the rows from zero.

    Columns are kept in order of first appearance (no sorting); a column that
    exists in only one input is filled with missing values for the other.
    """
    stacked_frames = [a, b]
    combined = pd.concat(stacked_frames, sort=False, ignore_index=True)
    return combined


def drop_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` without the ``Id`` column.

    Raises:
        KeyError: If ``df`` has no ``Id`` column.
    """
    return df.drop(labels=["Id"], axis="columns")


def one_hot_encode(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode every object-dtype column of ``df``.

    The first level of each encoded column is dropped (``drop_first=True``),
    so a column with ``k`` distinct values yields ``k - 1`` indicator columns.
    Columns of any other dtype are passed through unchanged.
    """
    object_frame = df.select_dtypes(include=["object"])
    return pd.get_dummies(df, drop_first=True, columns=object_frame.columns)


def sale_price_to_last_column(df: pd.DataFrame) -> pd.DataFrame:
    """Reorder the columns so that ``SalePrice`` comes last.

    Returns:
        ``df`` itself when it has no ``SalePrice`` column; otherwise a frame
        with the same columns in their original relative order, except that
        ``SalePrice`` is moved to the end.
    """
    if "SalePrice" not in df.columns:
        return df
    ordered = list(df.columns)
    # Pull the label out of its current slot and re-append it at the end.
    ordered.append(ordered.pop(ordered.index("SalePrice")))
    return df[ordered]


def category_to_numeric(df: pd.DataFrame, col: str, ranks: list) -> pd.DataFrame:
    """Replace the labels in ``df[col]`` with their position in ``ranks``.

    The column is overwritten in place on ``df`` (which is also returned) and
    stored with the nullable ``Int8`` dtype.

    Args:
        df: Frame containing ``col``; modified in place.
        col: Name of the categorical column to encode.
        ranks: Labels in rank order; the label at index ``i`` is encoded as ``i``.
            When the list contains ``"NA"``, missing entries in the column are
            encoded with the index of ``"NA"``.

    Returns:
        The same ``df`` object, with ``col`` encoded. Non-missing labels that
        are absent from ``ranks`` become ``<NA>``.

    Raises:
        KeyError: If ``col`` is not a column of ``df``.
    """
    codes = {rank: position for position, rank in enumerate(ranks)}
    labels = df[col]
    encoded = labels.map(codes)
    if "NA" in codes:
        # pandas.read_csv parses the literal "NA" as a missing value by
        # default, so the "NA" label never reaches the mapping as a string.
        # Give originally-missing entries the "NA" rank; labels that were
        # present but unknown are left as missing.
        encoded = encoded.where(labels.notna(), codes["NA"])
    df[col] = encoded.astype("Int8")
    return df


def convert_cond_and_qual_to_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Encode the ranked categorical columns through ``category_to_numeric``.

    The quality/condition columns share one rank list and are skipped when
    absent from ``df``. The remaining columns each have their own rank list
    and are encoded unconditionally, in the order listed below.

    Args:
        df: Frame to encode; passed through ``category_to_numeric`` once per
            column.

    Returns:
        The frame returned by the last ``category_to_numeric`` call.
    """
    quality_scale = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
    optional_quality_columns = (
        "ExterQual",
        "ExterCond",
        "BsmtQual",
        "BsmtCond",
        "HeatingQC",
        "KitchenQual",
        "FireplaceQu",
        "GarageQual",
        "GarageCond",
        "PoolQC",
    )
    for name in optional_quality_columns:
        if name not in df.columns:
            continue
        df = category_to_numeric(df, name, quality_scale)

    # Column -> rank list, applied in insertion order without a presence check.
    required_scales = {
        "LotShape": ["Reg", "IR1", "IR2", "IR3"],
        "LandSlope": ["Gtl", "Mod", "Sev"],
        "LandContour": ["Lvl", "Bnk", "HLS", "Low"],
        "Utilities": ["ELO", "NoSewr", "NoSeWa", "AllPub"],
        "PavedDrive": ["Y", "P", "N"],
        "Electrical": ["SBrkr", "FuseA", "FuseF", "FuseP", "Mix"],
        "Functional": [
            "Typ",
            "Min1",
            "Min2",
            "Mod",
            "Maj1",
            "Maj2",
            "Sev",
            "Sal",
        ],
        "GarageType": [
            "2Types",
            "Attchd",
            "Basment",
            "BuiltIn",
            "CarPort",
            "Detchd",
            "NA",
        ],
        "GarageFinish": ["Fin", "RFn", "Unf", "NA"],
        "Fence": ["GdPrv", "MnPrv", "GdWo", "MnWw", "NA"],
    }
    for name, scale in required_scales.items():
        df = category_to_numeric(df, name, scale)
    return df


# Load the three competition CSVs from the dataset directory.
train_df, test_df, sample_submission_df = (
    pd.read_csv(os.path.join(DATASET_DIR, file_name))
    for file_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Attach the sample-submission columns to the test rows (joined on "Id").
# NOTE(review): the test rows get their SalePrice from sample_submission.csv;
# presumably those are placeholder values rather than real prices -- confirm
# before training on the combined frame.
test_df = combine_sample_submission_and_test(test_df, sample_submission_df)

# Stack train and test, then run the preprocessing steps in sequence:
# drop "Id", encode ranked categoricals, one-hot encode the remaining object
# columns, and move "SalePrice" to the last position.
train_df = (
    connect_dataframes(train_df, test_df)
    .pipe(drop_columns)
    .pipe(convert_cond_and_qual_to_numeric)
    .pipe(one_hot_encode)
    .pipe(sale_price_to_last_column)
)

train_df.columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'LotShape', 'LandContour',
       'Utilities', 'LandSlope', 'OverallQual', 'OverallCond', 'YearBuilt',
       ...
       'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth', 'SaleType_WD',
       'SaleCondition_AdjLand', 'SaleCondition_Alloca', 'SaleCondition_Family',
       'SaleCondition_Normal', 'SaleCondition_Partial', 'SalePrice'],
      dtype='object', length=200)

In [40]:
# Write the preprocessed frame to EXPORT_DIR/train.csv without the row index,
# creating the export directory first if it does not exist yet.
os.makedirs(EXPORT_DIR, exist_ok=True)
train_df.to_csv(
    path_or_buf=os.path.join(EXPORT_DIR, "train.csv"),
    index=False,
)