In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import warnings
import src.workfile_functions as wf

# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides pandas deprecation and dtype warnings —
# consider narrowing it to the specific categories that are known to be noise.
warnings.filterwarnings('ignore')

# Display limits for rendered DataFrames: at most 50 columns and 100 rows.
pd.set_option('display.max_columns',50)
pd.set_option('display.max_rows',100)

# Local data directory, relative to the notebook's working directory.
PATH = '../data/pc/'

# Base URL handed to wf.read_gh together with a file name when loading the raw files.
URL = "https://michael-weylandt.com/STA9890/competition_data/"

In [2]:
# Pull the five yearly building-detail extracts from the course website.
# One wf.read_gh call per file, issued in chronological order (2015 → 2019).
(
    BUILDING_DETAILS_2015,
    BUILDING_DETAILS_2016,
    BUILDING_DETAILS_2017,
    BUILDING_DETAILS_2018,
    BUILDING_DETAILS_2019,
) = (
    wf.read_gh(URL, file_name)
    for file_name in (
        "building_details_2015.csv.gz",
        "building_details_2016.csv.gz",
        "building_details_2017.csv.gz",
        "building_details_2018.csv.gz",
        "building_details_2019.csv.gz",
    )
)

In [3]:
# Assessment-history files from the same base URL: training samples first, then test points.
TRAINING_SAMPLES, TEST_POINTS = (
    wf.read_gh(URL, file_name)
    for file_name in (
        "assessment_history_train.csv.gz",
        "assessment_history_test.csv.gz",
    )
)

In [None]:
# Run the shared cleaning helper over each year's raw building-detail frame,
# in chronological order, producing one cleaned frame per year (df15 … df19).
df15, df16, df17, df18, df19 = (
    wf.clean_building_dfs(raw_details)
    for raw_details in (
        BUILDING_DETAILS_2015,
        BUILDING_DETAILS_2016,
        BUILDING_DETAILS_2017,
        BUILDING_DETAILS_2018,
        BUILDING_DETAILS_2019,
    )
)

# Combine Data

In [9]:
# Spot-check: show the cleaned 2015 row(s) for one specific account hash.
df15.loc[df15['acct'].eq('bb75f25168addc1117840b10c0fd6cd0c2a7b7c6')]

Unnamed: 0,acct,floor_area_primary,floor_area_upper,floor_area_lower,garage_area,porch_area,deck_area,mobile_home_area,floors,half_bath,full_bath,total_rooms,bedrooms,fireplaces,elevator,quality,quality_description,year_built,year_remodeled,building_condition,grade,has_cooling,has_heat,physical_condition,year,foundation_Basement,foundation_Crawl Space,foundation_Mixed,foundation_Slab,brick_veneer,brick_masonry,concrete_block,vinyl,stucco,stone,other
0,bb75f25168addc1117840b10c0fd6cd0c2a7b7c6,1658.0,879.0,0.0,0.0,266.0,0.0,0.0,2.0,1.0,2.0,8.0,3.0,0.0,0,4.0,4.0,2004.0,0.0,3.0,4.3,1,1,3.0,2015.0,0,0,0,1,0,0,0,0,1,0,0


In [None]:
# Seed the account table with every account present in the 2015 frame,
# tagging each row with 2015 as its year.
df_long = df15[['acct']].assign(year=2015)

In [11]:
# Preview the first rows of the seeded account table (columns: acct, year).
df_long.head()

Unnamed: 0,acct,year
0,bb75f25168addc1117840b10c0fd6cd0c2a7b7c6,2015
1,5dd766a35cebbcbfa063316adb679131dbb9f92c,2015
2,8def0ccceda200b673872a8a9367644767989f3b,2015
3,7592093bcff4a8463064ccc1ead39e6dbdb819c9,2015
4,ca33e57b3b13e843909f4b6cbd9a3410387bd45a,2015


In [None]:
def append_new_accts(df_base, df_new, year):
    """Append the accounts of ``df_new`` that ``df_base`` does not contain yet.

    Parameters
    ----------
    df_base : pandas.DataFrame
        Accumulated table; must have an ``acct`` column.
    df_new : pandas.DataFrame
        Frame for a later year; only its ``acct`` column is read.
    year : scalar
        Value written into the ``year`` column of the appended rows.

    Returns
    -------
    pandas.DataFrame
        ``df_base`` followed by one ``(acct, year)`` row per account of
        ``df_new`` that is absent from ``df_base``, with a fresh 0..n-1 index.
        Neither input frame is modified.
    """
    known_accts = set(df_base['acct'])
    is_unseen = ~df_new['acct'].isin(known_accts)
    additions = df_new.loc[is_unseen, ['acct']].assign(year=year)
    return pd.concat([df_base, additions], ignore_index=True)

In [None]:
# Walk the later years chronologically; each pass adds only the accounts that
# have not appeared in any earlier year, tagged with the year they first show up.
# The loop variables stay bound afterwards (df -> df19, yr -> 2019); later
# inspection cells in this notebook read `df`.
for yr, df in enumerate([df16, df17, df18, df19], start=2016):
    df_long = append_new_accts(df_long, df, yr)

In [11]:
# Number of accounts per value of `year`, i.e. per year in which the account was first added.
df_long['year'].value_counts()

year
2015    990765
2016     15355
2018     14498
2017     14037
2019     12275
Name: count, dtype: int64

In [15]:
# Distribution of construction years.
# NOTE(review): `df` has no assignment of its own in the visible cells — it is only
# bound as the loop variable of the account-appending loop, so on a fresh
# top-to-bottom run it refers to df19. Confirm that is the frame meant here.
df.year_built.value_counts()

year_built
2006.0    31296
2005.0    30642
1950.0    29883
1983.0    29408
1978.0    26539
          ...  
1843.0        1
1886.0        1
1830.0        1
1873.0        1
1849.0        1
Name: count, Length: 152, dtype: int64

In [16]:
# List the columns of the cleaned building frame.
# NOTE(review): `df` is the variable left over from the account-appending loop
# (df19 on a fresh run) — confirm, or reference the intended frame explicitly.
df.columns

Index(['acct', 'floor_area_primary', 'floor_area_upper', 'floor_area_lower',
       'garage_area', 'porch_area', 'deck_area', 'mobile_home_area', 'floors',
       'half_bath', 'full_bath', 'total_rooms', 'bedrooms', 'fireplaces',
       'elevator', 'quality', 'quality_description', 'year_built',
       'year_remodeled', 'building_condition', 'grade', 'has_cooling',
       'has_heat', 'physical_condition', 'year', 'foundation_Basement',
       'foundation_Crawl Space', 'foundation_Mixed', 'foundation_Slab',
       'brick_veneer', 'brick_masonry', 'concrete_block', 'vinyl', 'stucco',
       'stone', 'other'],
      dtype='object')

In [None]:
# Columns that are copied once per account, from the frame of the year stored in
# that account's `year` value, and kept under their original names
# (no first_/end_/delta_ variants are derived for them).
static_cols = [
    'year_built',
    'foundation_Basement',
    'foundation_Crawl Space',
    'foundation_Slab',
    'foundation_Mixed'    
]

In [None]:
# Year -> cleaned building frame, in chronological order (2015 … 2019).
year_df_map = dict(zip(range(2015, 2020), (df15, df16, df17, df18, df19)))

In [None]:
# Add one all-NaN placeholder column per static attribute; the next step fills them in.
df_long = df_long.assign(**{name: np.nan for name in static_cols})

In [None]:
# Fill the static columns: every account takes its static attributes from the
# frame of the year recorded in df_long['year'].
for year, df_source in year_df_map.items():
    # Static columns of this year's frame, renamed to '<col>_tmp' so they can sit
    # next to the placeholder columns after the merge.
    tmp_names = {col: f'{col}_tmp' for col in static_cols}
    df_extract = df_source[['acct'] + static_cols].rename(columns=tmp_names)

    # Left merge keeps every df_long row; accounts missing from this year get NaN.
    # NOTE(review): if `acct` is not unique in df_source this merge duplicates
    # rows of df_long — confirm key uniqueness upstream.
    df_long = df_long.merge(df_extract, on='acct', how='left')

    # Only rows whose year matches this frame are written, so values assigned in
    # other iterations are never overwritten. The mask is computed once per year.
    year_rows = df_long['year'] == year
    for col, tmp_col in tmp_names.items():
        df_long.loc[year_rows, col] = df_long.loc[year_rows, tmp_col]

    # Remove all helper columns in one pass.
    df_long = df_long.drop(columns=list(tmp_names.values()))


In [None]:
# Inspect the table after the static columns have been filled.
# The frame is far longer than the display.max_rows limit set in the config cell,
# so pandas is expected to render a truncated view rather than every row.
df_long

In [None]:
# Everything in the 2015 frame except the key, the year and the static columns.
# These are the attributes for which first_/end_/delta_ columns are built below.
exclude_cols = ['acct', 'year', *static_cols]
all_columns = list(df15.columns)
first_cols = [name for name in all_columns if name not in exclude_cols]

In [None]:
# Build the 'first_<col>' columns: each account's attribute values taken from the
# frame of the year recorded in df_long['year'].
for year, df_source in year_df_map.items():
    # This year's attributes, renamed to '<col>_tmp' for the duration of the merge.
    tmp_names = {col: f'{col}_tmp' for col in first_cols}
    df_extract = df_source[['acct'] + first_cols].rename(columns=tmp_names)

    # Left merge by account; accounts absent from this year's frame get NaN.
    # NOTE(review): a non-unique `acct` in df_source would duplicate df_long rows —
    # confirm key uniqueness upstream.
    df_long = df_long.merge(df_extract, on='acct', how='left')

    # Rows belonging to this year; computed once and reused for every column.
    year_rows = df_long['year'] == year
    for col, tmp_col in tmp_names.items():
        new_col = f'first_{col}'

        # Create the target column the first time it is needed.
        if new_col not in df_long:
            df_long[new_col] = np.nan

        df_long.loc[year_rows, new_col] = df_long.loc[year_rows, tmp_col]

    # Remove all helper columns in one pass.
    df_long = df_long.drop(columns=list(tmp_names.values()))

In [None]:
# Inspect the table after the first_<col> columns have been added.
df_long

In [None]:
# Accounts whose end_ values have already been assigned; empty before the reverse pass.
seen_accts = set()

# Work on a separate frame: df_long plus one all-NaN 'end_<col>' placeholder
# for every attribute listed in first_cols.
df_long_end = df_long.assign(**{f'end_{name}': np.nan for name in first_cols})


In [None]:

# Build the 'end_<col>' columns by walking the years backwards (2019 → 2015):
# an account takes its values from the latest year in which it appears.
# Relies on `seen_accts` starting empty (it is initialised in the previous cell).
for year, df_source in reversed(list(year_df_map.items())):
    # Accounts of this year that no later year has already claimed.
    new_accts = df_source.loc[~df_source['acct'].isin(seen_accts), ['acct']]

    # Remember them so earlier years do not overwrite their values.
    seen_accts.update(new_accts['acct'])

    # This year's attributes, renamed to '<col>_tmp' for the duration of the merge.
    tmp_names = {col: f'{col}_tmp' for col in first_cols}
    df_extract = df_source[['acct'] + first_cols].rename(columns=tmp_names)

    # Left merge by account; accounts absent from this year's frame get NaN.
    # NOTE(review): a non-unique `acct` in df_source would duplicate rows here —
    # confirm key uniqueness upstream.
    df_long_end = df_long_end.merge(df_extract, on='acct', how='left')

    # Rows of the just-claimed accounts. The mask does not depend on the column,
    # so it is computed once per year instead of once per column.
    mask = df_long_end['acct'].isin(new_accts['acct'])
    for col, tmp_col in tmp_names.items():
        end_col = f'end_{col}'
        df_long_end.loc[mask, end_col] = df_long_end.loc[mask, tmp_col]

    # Remove all helper columns in one pass.
    df_long_end = df_long_end.drop(columns=list(tmp_names.values()))

In [None]:
# Inspect the table after the end_<col> columns have been filled.
df_long_end

In [None]:
# Change per attribute over the observed period: delta_<col> = end_<col> - first_<col>.
for col in first_cols:
    df_long_end[f'delta_{col}'] = df_long_end[f'end_{col}'].sub(df_long_end[f'first_{col}'])

In [None]:
# Number of distinct accounts in the combined table.
df_long_end['acct'].nunique()

In [22]:
# Number of distinct accounts in `df`, for comparison with the combined table.
# NOTE(review): `df` is the variable left over from the account-appending loop
# (df19 on a fresh top-to-bottom run) — confirm this is the intended frame.
df['acct'].nunique()

1046882

In [31]:
# Column dtypes and non-null counts of the cleaned 2015 frame.
df15.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990765 entries, 0 to 990764
Data columns (total 36 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   acct                    990765 non-null  object 
 1   floor_area_primary      990764 non-null  float64
 2   floor_area_upper        990764 non-null  float64
 3   floor_area_lower        990764 non-null  float64
 4   garage_area             990764 non-null  float64
 5   porch_area              990764 non-null  float64
 6   deck_area               990764 non-null  float64
 7   mobile_home_area        990764 non-null  float64
 8   floors                  990763 non-null  float64
 9   half_bath               990763 non-null  float64
 10  full_bath               990763 non-null  float64
 11  total_rooms             990763 non-null  float64
 12  bedrooms                990763 non-null  float64
 13  fireplaces              990763 non-null  float64
 14  elevator            

In [None]:
# Persist the combined table, without the index column, into the local data
# directory defined once in the config cell (PATH) instead of repeating the path here.
df_long_end.to_csv(PATH + 'merge_data.csv', index=False)

In [None]:
train = TRAINING_SAMPLES.copy(deep=True)
test = TEST_POINTS.copy(deep=True)

In [33]:
# Attach the building features to the training samples by account.
# Merging from TRAINING_SAMPLES (merge returns a new frame) instead of from `train`
# itself keeps the cell idempotent: re-running it no longer merges the features
# a second time onto an already-merged frame.
# NOTE(review): df_long_end carries a `year` column — check for a name clash with
# the assessment-history columns.
train = TRAINING_SAMPLES.merge(df_long_end, on='acct', how='left')

: 

In [None]:
# Attach the building features to the test points by account.
# Merging from TEST_POINTS (merge returns a new frame) instead of from `test`
# itself keeps the cell idempotent: re-running it no longer merges the features
# a second time onto an already-merged frame.
test = TEST_POINTS.merge(df_long_end, on='acct', how='left')