# Data import and clean

This notebook's only purpose is to read in the raw data and clean it into our standard train/test datasets for the STA 9890 Prediction Competition.

# Read in data, import libraries

In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import warnings
import src.workfile_functions as wf

# Remote folder the competition files are downloaded from, and the local
# data directory (the saved outputs below are written under the same path).
URL = "https://michael-weylandt.com/STA9890/competition_data/"
PATH = '../data/pc/'

# pandas display limits: up to 50 columns and 100 rows.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)

# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
# Download the five yearly building-detail files (2015-2019) from the
# professor's website, in chronological order.
_building_detail_files = (
    "building_details_2015.csv.gz",
    "building_details_2016.csv.gz",
    "building_details_2017.csv.gz",
    "building_details_2018.csv.gz",
    "building_details_2019.csv.gz",
)
(
    BUILDING_DETAILS_2015,
    BUILDING_DETAILS_2016,
    BUILDING_DETAILS_2017,
    BUILDING_DETAILS_2018,
    BUILDING_DETAILS_2019,
) = (wf.read_gh(URL, fname) for fname in _building_detail_files)

In [None]:
# Download the assessment-history files for the train and test accounts.
TRAINING_SAMPLES, TEST_POINTS = (
    wf.read_gh(URL, fname)
    for fname in (
        "assessment_history_train.csv.gz",
        "assessment_history_test.csv.gz",
    )
)

In [None]:
# Pass the raw 2015 building details through the project helper wf.clean_building_dfs.
df15 = wf.clean_building_dfs(BUILDING_DETAILS_2015)

In [None]:
# Pass the raw 2016 building details through the project helper wf.clean_building_dfs.
df16 = wf.clean_building_dfs(BUILDING_DETAILS_2016)

In [None]:
# Pass the raw 2017 building details through the project helper wf.clean_building_dfs.
df17 = wf.clean_building_dfs(BUILDING_DETAILS_2017)

In [None]:
# Pass the raw 2018 building details through the project helper wf.clean_building_dfs.
df18 = wf.clean_building_dfs(BUILDING_DETAILS_2018)

In [None]:
# Pass the raw 2019 building details through the project helper wf.clean_building_dfs.
df19 = wf.clean_building_dfs(BUILDING_DETAILS_2019)

# Unique Account IDs

In [None]:
# Seed the account list with every 2015 account, tagged with year 2015.
df_long = df15[['acct']].assign(year=2015)

In [None]:
# this function will add new accounts to df_long
# based on their appearances in dfs 16 through 19
def append_new_accts(df_base, df_new, year):
    """Return df_base with the accounts that appear only in df_new appended.

    Parameters
    ----------
    df_base : DataFrame with an 'acct' column (the accounts collected so far).
    df_new : DataFrame with an 'acct' column (one year's building details).
    year : value written to the 'year' column of every appended row.

    Returns
    -------
    A new DataFrame with a fresh RangeIndex: df_base followed by one row
    (acct, year) per account of df_new that is not already in df_base.
    Neither input is modified.
    """
    known_accts = set(df_base['acct'])
    is_new = ~df_new['acct'].isin(known_accts)

    # drop_duplicates: an account listed several times in df_new must still
    # be added only once, otherwise the account list stops being unique and
    # later merges on 'acct' multiply rows.
    new_entries = (
        df_new.loc[is_new, ['acct']]
        .drop_duplicates()
        .assign(year=year)
    )
    return pd.concat([df_base, new_entries], ignore_index=True)

In [None]:
# Walk the later years in order so each account is recorded with the first
# year in which it appears.
for yr, frame in ((2016, df16), (2017, df17), (2018, df18), (2019, df19)):
    df_long = append_new_accts(df_long, frame, yr)

# Data Cleaning

## Static Cols

These are columns that should remain unchanged throughout a home's assessment history.

In [None]:
# Columns expected to stay constant across a property's assessment history.
static_cols = ['year_built']
static_cols += [
    'foundation_Basement',
    'foundation_Crawl Space',
    'foundation_Slab',
    'foundation_Mixed',
]

In [None]:
# Cleaned building-detail frame for each year, keyed by year in ascending order.
year_df_map = dict(
    zip(
        (2015, 2016, 2017, 2018, 2019),
        (df15, df16, df17, df18, df19),
    )
)

In [None]:
# Add every static column to df_long as an all-NaN placeholder.
df_long = df_long.assign(**dict.fromkeys(static_cols, np.nan))

In [None]:
# Fill each static column from the year recorded for the account in df_long
# (the first year in which the account appears).
for year, df_source in year_df_map.items():
    # Rename explicitly so the temporary columns always exist, instead of
    # relying on a merge-suffix collision with columns already in df_long.
    tmp_names = {col: f'{col}_tmp' for col in static_cols}
    df_extract = df_source[['acct'] + static_cols].rename(columns=tmp_names)

    # Left merge keeps every df_long row; accounts absent from this year's
    # frame simply get NaN in the temporary columns.
    df_long = df_long.merge(df_extract, on='acct', how='left')

    # Only rows recorded for this year take the values, so rows filled from
    # another year are not overwritten. The mask is computed once per year.
    first_seen = df_long['year'] == year
    for col, tmp_col in tmp_names.items():
        df_long.loc[first_seen, col] = df_long.loc[first_seen, tmp_col]

    df_long = df_long.drop(columns=list(tmp_names.values()))


In [None]:
# Every df15 column that is neither a key nor a static column; these are the
# columns for which first_/end_/delta_ versions are built.
exclude_cols = ['acct', 'year'] + static_cols
all_columns = df15.columns.tolist()
first_cols = [name for name in all_columns if name not in exclude_cols]

In [None]:
# Build first_<col>: the value of each tracked column in the year recorded
# for the account in df_long (its first appearance).
for year, df_source in year_df_map.items():
    # Pull the tracked columns under temporary names so the merge cannot
    # collide with columns already present in df_long.
    tmp_names = {col: f'{col}_tmp' for col in first_cols}
    df_extract = df_source[['acct'] + first_cols].rename(columns=tmp_names)

    # Left merge keeps every df_long row; unmatched accounts get NaN.
    df_long = df_long.merge(df_extract, on='acct', how='left')

    # Rows whose recorded year is this one; computed once per year rather
    # than twice per column.
    first_seen = df_long['year'] == year

    for col, tmp_col in tmp_names.items():
        new_col = f'first_{col}'

        # Create the target column on first use.
        if new_col not in df_long:
            df_long[new_col] = np.nan

        df_long.loc[first_seen, new_col] = df_long.loc[first_seen, tmp_col]

    df_long = df_long.drop(columns=list(tmp_names.values()))

In [None]:
# Accounts whose end_ values have already been assigned; filled by the
# reverse-year loop that follows.
seen_accts = set()

# Copy of df_long extended with one all-NaN end_<col> placeholder per
# tracked column.
df_long_end = df_long.assign(**{f'end_{col}': np.nan for col in first_cols})


## Check for "Last Year"

In [None]:
# Build end_<col>: the value of each tracked column in the latest year in
# which the account appears. Years are visited newest first (2019 -> 2015),
# so an account is claimed by the first frame that contains it.
for year, df_source in reversed(list(year_df_map.items())):
    # Accounts of this year not already claimed by a later year.
    new_accts = df_source.loc[~df_source['acct'].isin(seen_accts), 'acct']
    seen_accts.update(new_accts)

    # Pull the tracked columns under temporary names.
    tmp_names = {col: f'{col}_tmp' for col in first_cols}
    df_extract = df_source[['acct'] + first_cols].rename(columns=tmp_names)

    # Left merge keeps every df_long_end row; unmatched accounts get NaN.
    df_long_end = df_long_end.merge(df_extract, on='acct', how='left')

    # Rows belonging to the just-claimed accounts. The mask does not depend
    # on the column, so it is built once per year instead of once per column.
    last_seen = df_long_end['acct'].isin(new_accts)

    for col, tmp_col in tmp_names.items():
        df_long_end.loc[last_seen, f'end_{col}'] = df_long_end.loc[last_seen, tmp_col]

    df_long_end = df_long_end.drop(columns=list(tmp_names.values()))

## Delta Cols

In [None]:
# delta_<col> = end_<col> - first_<col> for every tracked column.
for col in first_cols:
    df_long_end[f'delta_{col}'] = (
        df_long_end[f'end_{col}'] - df_long_end[f'first_{col}']
    )

# Combine with Train/Test datasets

In [2]:
# Load the assessment-history train and test sets that the building features
# are joined onto.
train, test = (
    wf.read_gh(URL, fname)
    for fname in (
        "assessment_history_train.csv.gz",
        "assessment_history_test.csv.gz",
    )
)

In [None]:
# Attach the per-account building features. df_long_end is the frame that
# carries the first_, end_ and delta_ columns; df_long stops at the first_
# columns, so merging it would silently drop the end_/delta_ features.
train = train.merge(df_long_end, on='acct', how='left')
test = test.merge(df_long_end, on='acct', how='left')

## Categorical Columns

In [8]:
# Names of all object-dtype columns in train.
object_cols = list(train.select_dtypes(include='object').columns)

In [21]:
# acct is the merge key, not a categorical feature, so keep it out of the
# list of columns that get frequency-encoded and dropped. Filtering (instead
# of list.remove) keeps the cell safe to re-run and does not raise if acct
# is not in the list.
object_cols = [c for c in object_cols if c != 'acct']

In [11]:
# Yearly protest-flag columns, 2015 through 2018.
prots = [
    'protested_2015',
    'protested_2016',
    'protested_2017',
    'protested_2018',
]

In [13]:
# Treat a missing protest flag as False, then store the flags as int8.
train[prots] = train[prots].fillna(False).astype('int8')

In [15]:
# Same conversion for test: missing protest flag -> False, stored as int8.
test[prots] = test[prots].fillna(False).astype('int8')

In [16]:
# The protest flags are handled as int8 columns, so exclude them from the
# list that gets frequency-encoded and dropped. Filtering (instead of
# list.remove) does not raise when a flag is absent from the list, e.g. when
# the cell is re-run.
object_cols = [c for c in object_cols if c not in prots]

In [19]:
# Frequency-encode each categorical column. The counts come from train only
# and are applied to both frames, so test contributes nothing to the encoding.
# Values with no count in train (including NaN) map to NaN in <col>_freq.
for col in object_cols:
    counts = train[col].value_counts()
    freq_col = f'{col}_freq'
    train[freq_col] = train[col].map(counts)
    test[freq_col] = test[col].map(counts)

In [23]:
# Drop the raw categorical columns from train; their *_freq encodings remain.
train.drop(columns=object_cols, inplace=True)

In [24]:
# Drop the raw categorical columns from test; their *_freq encodings remain.
test.drop(columns=object_cols, inplace=True)

## NULL Imputation

In [None]:
from sklearn.ensemble import HistGradientBoostingRegressor

# Non-object columns of train that still contain NaN. Derived from train
# here so this cell does not depend on a `missing_cols` variable defined in
# another cell (the one later in the notebook is computed from test).
has_null = train.isnull().any()
train_missing_cols = [
    c for c in train.columns[has_null] if train[c].dtype != object
]

# Impute each column with a gradient-boosting model fit on the rows where
# the column is observed.
for col in train_missing_cols:
    is_missing = train[col].isnull()
    df_not_missing = train[~is_missing]
    if df_not_missing.empty:
        # No observed values to learn from; leave the column as it is.
        continue

    # Predictors: every other non-object column. train is updated in place,
    # so columns imputed earlier in this loop are used in their filled form.
    # NOTE(review): this includes the prediction target if it is a column of
    # train - confirm that is intended.
    features = [f for f in train.columns if f != col]
    X_missing = train.loc[is_missing, features].select_dtypes(exclude='object')

    # Fit on at most 100,000 observed rows. sample() raises when asked for
    # more rows than exist, so cap n at the number available.
    sample_size = min(100000, len(df_not_missing))
    df_sample = df_not_missing.sample(n=sample_size, random_state=42)
    X_train = df_sample[features].select_dtypes(exclude='object')
    y_train = df_sample[col]

    # Seeded to match the seeded sampling above, so reruns reproduce.
    model = HistGradientBoostingRegressor(random_state=42)
    model.fit(X_train, y_train)

    # X_missing keeps train's row order, so predictions line up with the mask.
    train.loc[is_missing, col] = model.predict(X_missing)

In [39]:
# Every test column that still has at least one NaN; shown for inspection.
missing_cols = test.columns[test.isna().any()]
print(missing_cols)

Index(['building_area_2015', 'land_area_2015', 'building_area_2016',
       'land_area_2016', 'building_area_2017', 'land_area_2017',
       'building_area_2018', 'land_area_2018', 'building_value_2015',
       'land_value_2015',
       ...
       'delta_brick_masonry', 'delta_concrete_block', 'delta_vinyl',
       'delta_stucco', 'delta_stone', 'delta_other', 'region_freq',
       'zone_freq', 'subneighborhood_freq', 'neighborhood_freq'],
      dtype='object', length=117)


In [40]:
# Impute each incomplete test column with a gradient-boosting model fit on
# the test rows where that column is observed.
for col in missing_cols:
    is_missing = test[col].isnull()
    df_not_missing = test[~is_missing]
    if df_not_missing.empty:
        # No observed values to learn from; leave the column as it is.
        continue

    # Predictors: every other non-object column. test is updated in place,
    # so columns imputed earlier in this loop are used in their filled form.
    features = [f for f in test.columns if f != col]
    X_missing = test.loc[is_missing, features].select_dtypes(exclude='object')

    # Fit on at most 100,000 observed rows. sample() raises when asked for
    # more rows than exist, so cap n at the number available.
    sample_size = min(100000, len(df_not_missing))
    df_sample = df_not_missing.sample(n=sample_size, random_state=42)
    X_train = df_sample[features].select_dtypes(exclude='object')
    y_train = df_sample[col]

    # Seeded to match the seeded sampling above, so reruns reproduce.
    model = HistGradientBoostingRegressor(random_state=42)
    model.fit(X_train, y_train)

    # X_missing keeps test's row order, so predictions line up with the mask.
    test.loc[is_missing, col] = model.predict(X_missing)

# Saving

In [35]:
# Write the cleaned training set without the row index. index=False is the
# documented way to omit it; index=None only works because None is falsy.
train.to_csv('../data/pc/train.csv', index=False)

In [41]:
# Write the cleaned test set without the row index. index=False is the
# documented way to omit it; index=None only works because None is falsy.
test.to_csv('../data/pc/test.csv', index=False)