In [78]:
import pandas as pd
import numpy as np
import regex as re
import gc

gc.collect()

0

In [79]:
original_df = pd.read_csv('newyork_housing.csv', low_memory=False)

In [80]:
original_df.shape

(75630, 1507)

In [81]:
df = original_df.copy()

In [82]:
# remove all photo url columns
df = df[df.columns.drop(list(df.filter(regex='photos')))]

# add number of photos attribute
number_of_photos = np.sum(pd.isnull(original_df.filter(like='photos', axis=1).values)==False, axis=1)
df['number_of_photos'] = number_of_photos

In [83]:
df = df.dropna(subset=['price']).reset_index(drop=True)

In [84]:
# remove all listings outside of NY
rows_to_drop = np.where(df['address/state'].values != 'NY')[0]
df = df.drop(df.index[rows_to_drop]).reset_index(drop=True)

# remove all columns that look or found unrelevant
list_of_columns_to_drop = [
    'address/state',
    'address/streetAddress',
    'currency',
    'resoFactsStats/parcelNumber',
    'schools/0/link',
    'schools/0/name',
    'schools/0/totalCount',
    'schools/1/link',
    'schools/1/name',
    'schools/1/totalCount',
    'schools/2/link',
    'schools/2/name',
    'schools/2/totalCount',
    'schools/0/isAssigned',
    #'resoFactsStats/atAGlanceFacts/0/factValue',
    #'resoFactsStats/atAGlanceFacts/1/factValue',
    #'resoFactsStats/atAGlanceFacts/2/factValue',
    #'resoFactsStats/atAGlanceFacts/3/factValue',
    'resoFactsStats/atAGlanceFacts/4/factValue',
    #'resoFactsStats/atAGlanceFacts/5/factValue',
    'schools/2/type',
    'schools/0/grades',
    'schools/1/grades',
    'schools/2/grades',
]

for column_name in list_of_columns_to_drop:
    df = df.drop(column_name, axis=1)

In [85]:
# remove all columns with more than 50% missing values
percent_missing_threshold = 100/3

percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True)

columns_to_drop = missing_value_df.loc[missing_value_df['percent_missing'] > percent_missing_threshold].values[:, 0].tolist()

# Keep price history variables
for x in ['event', 'postingIsRental', 'price', 'priceChangeRate', 'time']:
    regex = re.compile(r'priceHistory/./' + x + r"(?:$|/)")
    columns_to_drop = [i for i in columns_to_drop if not regex.search(i)]
    
columns_to_drop.remove('resoFactsStats/atAGlanceFacts/6/factValue') # Days on Zillow
#print(columns_to_drop)
df = df.drop(columns_to_drop, axis=1)

In [86]:
# Convert date variable
regex = re.compile(r'priceHistory/./time')
time = [i for i in df.columns if regex.search(i)]
for i in time:
    df.loc[:, i] = pd.to_datetime(df.loc[:, i], unit='ms')
print(df[time])

      priceHistory/0/time priceHistory/1/time priceHistory/2/time  \
0              2021-01-08          2016-04-01          2015-03-20   
1              2020-12-07          2020-10-01          2020-07-28   
2              2021-01-20          2020-06-20                 NaT   
3              2020-12-17          2020-09-22          2019-03-07   
4              2020-11-18                 NaT                 NaT   
...                   ...                 ...                 ...   
75563          2018-07-31          2018-02-27          2018-02-14   
75564          2019-07-01          2019-06-20          2019-04-28   
75565          2018-07-24          2018-04-10          2018-03-22   
75566          2018-12-26          2008-05-21                 NaT   
75567          2019-05-15          2019-03-18          2018-11-13   

      priceHistory/3/time priceHistory/4/time priceHistory/5/time  \
0              2010-12-23          2010-06-10          2010-06-04   
1              2020-03-14        

In [87]:
# Extract year
j = 0
for i in time:
    df['priceHistory/' + str(j) + '/year'] = pd.DatetimeIndex(df[i]).year
    #df['priceHistory/' + str(j) + '/year'] = df['priceHistory/' + str(j) + '/year'].fillna(0).astype(int)
    j +=1
print(df['priceHistory/0/year'])

0        2021.0
1        2020.0
2        2021.0
3        2020.0
4        2020.0
          ...  
75563    2018.0
75564    2019.0
75565    2018.0
75566    2018.0
75567    2019.0
Name: priceHistory/0/year, Length: 75568, dtype: float64


In [88]:
# Month year
j = 0
for i in time:
    df['priceHistory/' + str(j) + '/month_year'] = df[i].dt.to_period('M')
    j +=1
print(df['priceHistory/0/month_year'])

0        2021-01
1        2020-12
2        2021-01
3        2020-12
4        2020-11
          ...   
75563    2018-07
75564    2019-07
75565    2018-07
75566    2018-12
75567    2019-05
Name: priceHistory/0/month_year, Length: 75568, dtype: period[M]


In [89]:
# Collapse events
#print(df['priceHistory/0/event'].unique())
regex = re.compile(r'priceHistory/./event')
event = [i for i in df.columns if regex.search(i)]
event_name = list(df[event].apply(pd.value_counts).sum(axis = 1).index)
for x in event_name:
    df['number_of_' + x] = np.sum(df[event] == x, axis=1)
#print(np.sum(df[event] == 'Sold', axis=1), df['number_of_Sold'])

In [90]:
# add description length and exist attributes and remove description column

description_lengths = np.zeros((df['description'].shape[0],), dtype=int)

for i, description in enumerate(df['description'].values.tolist()):
    if pd.isnull(df['description'][i]) == True:
        continue
    if '\n' in description:
        description = description.replace('\n', ' ')
        df['description'][i] = description
        description_lengths[i] = len(description.split(' '))+1

description_exists = description_lengths>0

df['description_exists'] = description_exists
df['description_lengths'] = description_lengths
df.drop('description', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['description'][i] = description


In [91]:
# remove all columns with unnecessary priceHistory information and keep only priceChangeRate
#df = df[df.columns.drop(list(df.filter(regex='priceHistory')))]
#df['priceChangeRate'] = original_df['priceHistory/0/priceChangeRate']

columns_to_drop = list(df.filter(regex='priceHistory'))

# Keep price history variables
for x in ['event', 'postingIsRental', 'price', 'priceChangeRate', 'time', 'year', 'month_year']:
    regex = re.compile(r'priceHistory/./' + x)
    columns_to_drop = [i for i in columns_to_drop if not regex.search(i)]
print(columns_to_drop)
df = df.drop(columns_to_drop, axis=1)

['priceHistory/0/attributeSource/infoString2', 'priceHistory/0/showCountyLink', 'priceHistory/0/source', 'priceHistory/1/attributeSource/infoString2', 'priceHistory/1/showCountyLink', 'priceHistory/1/source', 'priceHistory/2/attributeSource/infoString2', 'priceHistory/2/showCountyLink', 'priceHistory/2/source']


In [92]:
# Numeric days on Zillow
#df['days_on_Zillow'] = df['resoFactsStats/atAGlanceFacts/6/factValue'].str.extract('(\d+)').fillna(0).astype(int)
df['days_on_Zillow'] = df['resoFactsStats/atAGlanceFacts/6/factValue'].str.extract('(\d+)')
df['days_on_Zillow'] = pd.to_numeric(df['days_on_Zillow'])
print(df['days_on_Zillow'], df['resoFactsStats/atAGlanceFacts/6/factValue'])
df = df.drop('resoFactsStats/atAGlanceFacts/6/factValue', axis=1)

0         12.0
1        176.0
2        214.0
3        120.0
4         62.0
         ...  
75563      NaN
75564      NaN
75565      NaN
75566      NaN
75567      NaN
Name: days_on_Zillow, Length: 75568, dtype: float64 0         12 Days
1        176 Days
2        214 Days
3        120 Days
4         62 Days
           ...   
75563         NaN
75564         NaN
75565         NaN
75566         NaN
75567         NaN
Name: resoFactsStats/atAGlanceFacts/6/factValue, Length: 75568, dtype: object


In [93]:
for i, column_name in enumerate(df.columns[::-1]):
    current_column_values = df[column_name].values
    values, counts = np.unique(current_column_values.astype(str), return_counts=True)
    if np.max(counts) > 0.975*df.shape[0]:
        #print(column_name, counts)
        df = df.drop(column_name, axis=1)
    #else:
        #print(column_name, np.max(counts)/df.shape[0])

In [94]:
url_exists = np.zeros((df['url'].shape[0],), dtype=int)

for i, url in enumerate(df['url'].values.tolist()):
    if pd.isnull(df['url'][i]) == True:
        continue
    else:
        url_exists[i] = 1

df['url_exists'] = url_exists
df.drop('url', axis=1, inplace=True)

In [95]:
#all columns with sqft or acres - acres to sqft multiply the acre value by 43560
for i, column_name in enumerate(df.columns[::-1]):
    if (df[column_name].astype(str).str.contains(' Acres').any() or df[column_name].astype(str).str.contains(' sqft').any()) and column_name != 'description':
        print(column_name)
        Acres_rows = np.where(df[column_name].astype(str).str.contains(' Acres'))[0]
        df[column_name][Acres_rows] = df[column_name][Acres_rows].astype(str).str.replace(',','', regex=True).replace(' Acres','', regex=True).astype(float)*43560

        sqft_rows = np.where(df[column_name].astype(str).str.contains(' sqft'))[0]
        df[column_name][sqft_rows] = df[column_name][sqft_rows].astype(str).str.replace(',', '', regex=True).replace(' sqft', '', regex=True).astype(float)

resoFactsStats/lotSize


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name][Acres_rows] = df[column_name][Acres_rows].astype(str).str.replace(',','', regex=True).replace(' Acres','', regex=True).astype(float)*43560
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name][sqft_rows] = df[column_name][sqft_rows].astype(str).str.replace(',', '', regex=True).replace(' sqft', '', regex=True).astype(float)


resoFactsStats/livingArea


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name][Acres_rows] = df[column_name][Acres_rows].astype(str).str.replace(',','', regex=True).replace(' Acres','', regex=True).astype(float)*43560
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name][sqft_rows] = df[column_name][sqft_rows].astype(str).str.replace(',', '', regex=True).replace(' sqft', '', regex=True).astype(float)


In [96]:
bool_to_categories = []
for i, column_name in enumerate(df.columns[::-1]):
    if np.sum(np.where(df[column_name]==False)[0].size + np.where(df[column_name]==True)[0].size + np.where(df[column_name].values.astype('str')=='nan')[0].size)==df.shape[0]:
        bool_to_categories.append(column_name)
        df[column_name][np.where(df[column_name]==False)[0]] = 0
        df[column_name][np.where(df[column_name]==True)[0]] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name][np.where(df[column_name]==False)[0]] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name][np.where(df[column_name]==True)[0]] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name][np.where(df[column_name]==False)[0]] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [97]:
string_to_categories = [
    'address/city',
    'homeStatus',
    'resoFactsStats/atAGlanceFacts/0/factValue',
    'resoFactsStats/atAGlanceFacts/5/factLabel',
    'resoFactsStats/cityRegion',
    'resoFactsStats/homeType',
    'resoFactsStats/parkingFeatures/0',
    'schools/0/level',
    'schools/1/level',
    'schools/2/level',
    'schools/0/grades',
    'schools/1/grades',
    'schools/2/grades'
]


# factorize all string columns
for i, column_name in enumerate(df.columns[::-1]):
    if column_name in string_to_categories:
        #print(column_name, np.unique(df[column_name].values.astype('str')))
        if '/level' in column_name:
            df[column_name][np.where(df[column_name]=='Primary')[0]] = 0
            df[column_name][np.where(df[column_name]=='Elementary')[0]] = 1
            df[column_name][np.where(df[column_name]=='Middle')[0]] = 2
            df[column_name][np.where(df[column_name]=='High')[0]] = 3
        else:
            rows_to_factorize = np.where(df[column_name].isnull()==False)[0]
            df[column_name][rows_to_factorize] = pd.factorize(df[column_name][rows_to_factorize])[0]
        #print(df[column_name][rows_to_factorize])
        #print(pd.factorize(df[column_name][rows_to_factorize])[0])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name][np.where(df[column_name]=='Primary')[0]] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name][np.where(df[column_name]=='Elementary')[0]] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name][np.where(df[column_name]=='Middle')[0]] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view

In [98]:
df = df.loc[(df['homeStatus'] <= 2)]
df = df.loc[(df['price'] > 100)]

In [99]:
#df.to_csv('processed_dataset.csv')
df.to_csv('processed_dataset_v2.csv')

## Create time series dataset

In [100]:
matching_strings = []
for x in ['event', 'postingIsRental', 'price', 'priceChangeRate', 'time', 'year', 'month_year']:
    regex = re.compile(r'priceHistory/./' + x + r"(?:$|/)")
    matching_strings += [s for s in df.columns if regex.match(s)]
print(matching_strings)

['priceHistory/0/event', 'priceHistory/1/event', 'priceHistory/2/event', 'priceHistory/3/event', 'priceHistory/4/event', 'priceHistory/5/event', 'priceHistory/6/event', 'priceHistory/7/event', 'priceHistory/8/event', 'priceHistory/9/event', 'priceHistory/0/postingIsRental', 'priceHistory/1/postingIsRental', 'priceHistory/2/postingIsRental', 'priceHistory/3/postingIsRental', 'priceHistory/4/postingIsRental', 'priceHistory/5/postingIsRental', 'priceHistory/6/postingIsRental', 'priceHistory/7/postingIsRental', 'priceHistory/8/postingIsRental', 'priceHistory/9/postingIsRental', 'priceHistory/0/price', 'priceHistory/1/price', 'priceHistory/2/price', 'priceHistory/3/price', 'priceHistory/4/price', 'priceHistory/5/price', 'priceHistory/6/price', 'priceHistory/7/price', 'priceHistory/8/price', 'priceHistory/9/price', 'priceHistory/0/priceChangeRate', 'priceHistory/1/priceChangeRate', 'priceHistory/2/priceChangeRate', 'priceHistory/3/priceChangeRate', 'priceHistory/4/priceChangeRate', 'priceHis

In [101]:
# Subset data
subset = ['zpid', 'address/zipcode', 'price', 'days_on_Zillow'] + matching_strings
ts_wide = df[subset] # reshape here 
display(ts_wide)

Unnamed: 0,zpid,address/zipcode,price,days_on_Zillow,priceHistory/0/event,priceHistory/1/event,priceHistory/2/event,priceHistory/3/event,priceHistory/4/event,priceHistory/5/event,...,priceHistory/0/month_year,priceHistory/1/month_year,priceHistory/2/month_year,priceHistory/3/month_year,priceHistory/4/month_year,priceHistory/5/month_year,priceHistory/6/month_year,priceHistory/7/month_year,priceHistory/8/month_year,priceHistory/9/month_year
0,3.155405e+07,10463.0,799999.0,12.0,Listed for sale,Listing removed,Listed for sale,Listing removed,Price change,Price change,...,2021-01,2016-04,2015-03,2010-12,2010-06,2010-06,2010-02,NaT,NaT,NaT
1,2.985412e+07,10471.0,3995000.0,176.0,Price change,Price change,Listed for sale,Listing removed,Listed for sale,Sold,...,2020-12,2020-10,2020-07,2020-03,2019-11,2015-12,2015-11,2015-08,2015-05,NaT
2,2.985186e+07,10463.0,1495000.0,214.0,Price change,Listed for sale,,,,,...,2021-01,2020-06,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
3,2.985186e+07,10463.0,3450000.0,120.0,Price change,Listed for sale,Sold,Listing removed,Pending sale,Listed for sale,...,2020-12,2020-09,2019-03,2019-02,2017-11,2017-09,2000-08,NaT,NaT,NaT
4,2.077107e+09,10463.0,1790000.0,62.0,Listed for sale,,,,,,...,2020-11,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75563,3.200295e+07,11375.0,825000.0,,Sold,Listing removed,Listed for sale,,,,...,2018-07,2018-02,2018-02,NaT,NaT,NaT,NaT,NaT,NaT,NaT
75564,3.200501e+07,11375.0,2704000.0,,Sold,Listing removed,Pending sale,Listed for sale,Pending sale,Listing removed,...,2019-07,2019-06,2019-04,2019-04,2019-03,2019-03,2019-03,2019-02,2019-01,2018-10
75565,3.200501e+07,11375.0,2750000.0,,Sold,Pending sale,Listed for sale,,,,...,2018-07,2018-04,2018-03,NaT,NaT,NaT,NaT,NaT,NaT,NaT
75566,3.202000e+07,11375.0,935000.0,,Sold,Sold,,,,,...,2018-12,2008-05,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT


In [102]:
# Clear memory
del original_df
del df
gc.collect()

0

In [103]:
pd.Series(ts_wide["zpid"]).is_unique

False

In [104]:
# Create uid to reshape
ts_wide['row_num'] = np.arange(len(ts_wide))
ts_wide.columns
original_ts_wide = ts_wide.copy()

In [105]:
# Reshape
i = 0 
for x in ['event', 'postingIsRental', 'price', 'priceChangeRate', 'time', 'year', 'month_year']:
    # Rename for unique stub
    regex = re.compile(r"^priceHistory/(?:\d+/)*" + x + r"(?:$|/)")
    matching_strings = [s for s in ts_wide.columns if regex.match(s)] + ['row_num'] 
    ts_wide_sub = ts_wide[matching_strings]
    for y in matching_strings:
        print(y)
        if y != 'row_num':
            new = re.search(r'^priceHistory/\d+',y).group(0)
            print(new)
            ts_wide_sub.rename(columns={y: new}, inplace=True)
        else:
            continue
    display(ts_wide_sub)
    
    i += 1
    if i == 1:
        ts_long = pd.wide_to_long(
            ts_wide_sub,
            stubnames = 'priceHistory',
            i = ['row_num'], 
            j = x + '_priceHistory',
            sep='/'
        )
        del ts_wide_sub
        display(ts_long)

        # Drop rows with all NaNs
        regex = re.compile(r"^priceHistory/*")
        #ts_long.dropna(subset = [s for s in ts_long.columns if regex.match(s)], axis = 0, inplace = True)

        ts_long.rename(columns={'priceHistory': 'priceHistory/' + x}, inplace=True)
        
    else:
        ts_temp = pd.wide_to_long(
            ts_wide_sub,
            stubnames = 'priceHistory',
            i = ['row_num'],
            j = x + '_priceHistory',
            sep='/'
        )
        del ts_wide_sub
        #ts_long = pd.merge(ts_long, ts_temp, on=['row_num'], validate = '1:1')
        ts_long = pd.concat([ts_long, ts_temp], axis=1)
        del ts_temp
        gc.collect()
        display(ts_long)

        # Drop rows with all NaNs
        regex = re.compile(r"^priceHistory/*")
        #ts_long.dropna(subset = [s for s in ts_long.columns if regex.match(s)], axis = 0, inplace = True)
        ts_long.rename(columns={'priceHistory': 'priceHistory/' + x}, inplace=True)
        
display(ts_long)

priceHistory/0/event
priceHistory/0
priceHistory/1/event
priceHistory/1
priceHistory/2/event
priceHistory/2
priceHistory/3/event
priceHistory/3
priceHistory/4/event
priceHistory/4
priceHistory/5/event
priceHistory/5
priceHistory/6/event
priceHistory/6
priceHistory/7/event
priceHistory/7
priceHistory/8/event
priceHistory/8
priceHistory/9/event
priceHistory/9
row_num


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.rename(columns={y: new}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.rename(columns={y: new}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.rename(columns={y: new}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.ren

Unnamed: 0,priceHistory/0,priceHistory/1,priceHistory/2,priceHistory/3,priceHistory/4,priceHistory/5,priceHistory/6,priceHistory/7,priceHistory/8,priceHistory/9,row_num
0,Listed for sale,Listing removed,Listed for sale,Listing removed,Price change,Price change,Listed for sale,,,,0
1,Price change,Price change,Listed for sale,Listing removed,Listed for sale,Sold,Listing removed,Pending sale,Listed for sale,,1
2,Price change,Listed for sale,,,,,,,,,2
3,Price change,Listed for sale,Sold,Listing removed,Pending sale,Listed for sale,Sold,,,,3
4,Listed for sale,,,,,,,,,,4
...,...,...,...,...,...,...,...,...,...,...,...
75563,Sold,Listing removed,Listed for sale,,,,,,,,72253
75564,Sold,Listing removed,Pending sale,Listed for sale,Pending sale,Listing removed,Price change,Listed for sale,Price change,Listed for rent,72254
75565,Sold,Pending sale,Listed for sale,,,,,,,,72255
75566,Sold,Sold,,,,,,,,,72256


Unnamed: 0_level_0,Unnamed: 1_level_0,priceHistory
row_num,event_priceHistory,Unnamed: 2_level_1
0,0,Listed for sale
1,0,Price change
2,0,Price change
3,0,Price change
4,0,Listed for sale
...,...,...
72253,9,
72254,9,Listed for rent
72255,9,
72256,9,


priceHistory/0/postingIsRental
priceHistory/0
priceHistory/1/postingIsRental
priceHistory/1
priceHistory/2/postingIsRental
priceHistory/2
priceHistory/3/postingIsRental
priceHistory/3
priceHistory/4/postingIsRental
priceHistory/4
priceHistory/5/postingIsRental
priceHistory/5
priceHistory/6/postingIsRental
priceHistory/6
priceHistory/7/postingIsRental
priceHistory/7
priceHistory/8/postingIsRental
priceHistory/8
priceHistory/9/postingIsRental
priceHistory/9
row_num


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.rename(columns={y: new}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.rename(columns={y: new}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.rename(columns={y: new}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.ren

Unnamed: 0,priceHistory/0,priceHistory/1,priceHistory/2,priceHistory/3,priceHistory/4,priceHistory/5,priceHistory/6,priceHistory/7,priceHistory/8,priceHistory/9,row_num
0,0,0,0,0,0,0,0,,,,0
1,0,0,0,0,0,0,0,0,0,,1
2,0,0,,,,,,,,,2
3,0,0,0,0,0,0,0,,,,3
4,0,,,,,,,,,,4
...,...,...,...,...,...,...,...,...,...,...,...
75563,0,0,0,,,,,,,,72253
75564,0,0,0,0,0,1,1,0,1,1,72254
75565,0,0,0,,,,,,,,72255
75566,0,0,,,,,,,,,72256


Unnamed: 0_level_0,Unnamed: 1_level_0,priceHistory/event,priceHistory
row_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,Listed for sale,0
1,0,Price change,0
2,0,Price change,0
3,0,Price change,0
4,0,Listed for sale,0
...,...,...,...
72253,9,,
72254,9,Listed for rent,1
72255,9,,
72256,9,,


priceHistory/0/price
priceHistory/0
priceHistory/1/price
priceHistory/1
priceHistory/2/price
priceHistory/2
priceHistory/3/price
priceHistory/3
priceHistory/4/price
priceHistory/4
priceHistory/5/price
priceHistory/5
priceHistory/6/price
priceHistory/6
priceHistory/7/price
priceHistory/7
priceHistory/8/price
priceHistory/8
priceHistory/9/price
priceHistory/9
row_num


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.rename(columns={y: new}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.rename(columns={y: new}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.rename(columns={y: new}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.ren

Unnamed: 0,priceHistory/0,priceHistory/1,priceHistory/2,priceHistory/3,priceHistory/4,priceHistory/5,priceHistory/6,priceHistory/7,priceHistory/8,priceHistory/9,row_num
0,799999.0,599000.0,599000.0,350000.0,350000.0,375000.0,350000.0,,,,0
1,3995000.0,4495000.0,4888000.0,4495000.0,4495000.0,975000.0,1000000.0,1000000.0,1000000.0,,1
2,1495000.0,1499000.0,,,,,,,,,2
3,3450000.0,3800000.0,3100000.0,3675000.0,3675000.0,3675000.0,4200000.0,,,,3
4,1790000.0,,,,,,,,,,4
...,...,...,...,...,...,...,...,...,...,...,...
75563,825000.0,868000.0,868000.0,,,,,,,,72253
75564,2704000.0,2599000.0,2599000.0,2599000.0,2599000.0,7000.0,7000.0,2599000.0,8500.0,10000.0,72254
75565,2750000.0,2975000.0,2975000.0,,,,,,,,72255
75566,935000.0,600000.0,,,,,,,,,72256


Unnamed: 0_level_0,Unnamed: 1_level_0,priceHistory/event,priceHistory/postingIsRental,priceHistory
row_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,Listed for sale,0,799999.0
1,0,Price change,0,3995000.0
2,0,Price change,0,1495000.0
3,0,Price change,0,3450000.0
4,0,Listed for sale,0,1790000.0
...,...,...,...,...
72253,9,,,
72254,9,Listed for rent,1,10000.0
72255,9,,,
72256,9,,,


priceHistory/0/priceChangeRate
priceHistory/0
priceHistory/1/priceChangeRate
priceHistory/1
priceHistory/2/priceChangeRate
priceHistory/2
priceHistory/3/priceChangeRate
priceHistory/3
priceHistory/4/priceChangeRate
priceHistory/4
priceHistory/5/priceChangeRate
priceHistory/5
priceHistory/6/priceChangeRate
priceHistory/6
priceHistory/7/priceChangeRate
priceHistory/7
priceHistory/8/priceChangeRate
priceHistory/8
priceHistory/9/priceChangeRate
priceHistory/9
row_num


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.rename(columns={y: new}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.rename(columns={y: new}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.rename(columns={y: new}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.ren

Unnamed: 0,priceHistory/0,priceHistory/1,priceHistory/2,priceHistory/3,priceHistory/4,priceHistory/5,priceHistory/6,priceHistory/7,priceHistory/8,priceHistory/9,row_num
0,0.335558,0.000000,0.711429,0.000000,-0.066667,0.071429,0.000000,,,,0
1,-0.111235,-0.080401,0.087430,0.000000,3.610256,-0.025000,0.000000,0.000000,0.00,,1
2,-0.002668,0.000000,,,,,,,,,2
3,-0.092105,0.225806,-0.156463,0.000000,0.000000,-0.125000,0.000000,,,,3
4,0.000000,,,,,,,,,,4
...,...,...,...,...,...,...,...,...,...,...,...
75563,-0.049539,0.000000,0.000000,,,,,,,,72253
75564,0.040400,0.000000,0.000000,0.000000,0.000000,0.000000,-0.176471,1.652041,-0.15,0.0,72254
75565,-0.075630,0.000000,0.000000,,,,,,,,72255
75566,0.558333,0.000000,,,,,,,,,72256


Unnamed: 0_level_0,Unnamed: 1_level_0,priceHistory/event,priceHistory/postingIsRental,priceHistory/price,priceHistory
row_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,Listed for sale,0,799999.0,0.335558
1,0,Price change,0,3995000.0,-0.111235
2,0,Price change,0,1495000.0,-0.002668
3,0,Price change,0,3450000.0,-0.092105
4,0,Listed for sale,0,1790000.0,0.000000
...,...,...,...,...,...
72253,9,,,,
72254,9,Listed for rent,1,10000.0,0.000000
72255,9,,,,
72256,9,,,,


priceHistory/0/time
priceHistory/0
priceHistory/1/time
priceHistory/1
priceHistory/2/time
priceHistory/2
priceHistory/3/time
priceHistory/3
priceHistory/4/time
priceHistory/4
priceHistory/5/time
priceHistory/5
priceHistory/6/time
priceHistory/6
priceHistory/7/time
priceHistory/7
priceHistory/8/time
priceHistory/8
priceHistory/9/time
priceHistory/9
row_num


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.rename(columns={y: new}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.rename(columns={y: new}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.rename(columns={y: new}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.ren

Unnamed: 0,priceHistory/0,priceHistory/1,priceHistory/2,priceHistory/3,priceHistory/4,priceHistory/5,priceHistory/6,priceHistory/7,priceHistory/8,priceHistory/9,row_num
0,2021-01-08,2016-04-01,2015-03-20,2010-12-23,2010-06-10,2010-06-04,2010-02-05,NaT,NaT,NaT,0
1,2020-12-07,2020-10-01,2020-07-28,2020-03-14,2019-11-01,2015-12-18,2015-11-11,2015-08-18,2015-05-01,NaT,1
2,2021-01-20,2020-06-20,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,2
3,2020-12-17,2020-09-22,2019-03-07,2019-02-21,2017-11-10,2017-09-25,2000-08-01,NaT,NaT,NaT,3
4,2020-11-18,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,4
...,...,...,...,...,...,...,...,...,...,...,...
75563,2018-07-31,2018-02-27,2018-02-14,NaT,NaT,NaT,NaT,NaT,NaT,NaT,72253
75564,2019-07-01,2019-06-20,2019-04-28,2019-04-04,2019-03-26,2019-03-21,2019-03-12,2019-02-26,2019-01-14,2018-10-22,72254
75565,2018-07-24,2018-04-10,2018-03-22,NaT,NaT,NaT,NaT,NaT,NaT,NaT,72255
75566,2018-12-26,2008-05-21,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,72256


Unnamed: 0_level_0,Unnamed: 1_level_0,priceHistory/event,priceHistory/postingIsRental,priceHistory/price,priceHistory/priceChangeRate,priceHistory
row_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,Listed for sale,0,799999.0,0.335558,2021-01-08
1,0,Price change,0,3995000.0,-0.111235,2020-12-07
2,0,Price change,0,1495000.0,-0.002668,2021-01-20
3,0,Price change,0,3450000.0,-0.092105,2020-12-17
4,0,Listed for sale,0,1790000.0,0.000000,2020-11-18
...,...,...,...,...,...,...
72253,9,,,,,NaT
72254,9,Listed for rent,1,10000.0,0.000000,2018-10-22
72255,9,,,,,NaT
72256,9,,,,,NaT


priceHistory/0/year
priceHistory/0
priceHistory/1/year
priceHistory/1
priceHistory/2/year
priceHistory/2
priceHistory/3/year
priceHistory/3
priceHistory/4/year
priceHistory/4
priceHistory/5/year
priceHistory/5
priceHistory/6/year
priceHistory/6
priceHistory/7/year
priceHistory/7
priceHistory/8/year
priceHistory/8
priceHistory/9/year
priceHistory/9
row_num


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.rename(columns={y: new}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.rename(columns={y: new}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.rename(columns={y: new}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.ren

Unnamed: 0,priceHistory/0,priceHistory/1,priceHistory/2,priceHistory/3,priceHistory/4,priceHistory/5,priceHistory/6,priceHistory/7,priceHistory/8,priceHistory/9,row_num
0,2021.0,2016.0,2015.0,2010.0,2010.0,2010.0,2010.0,,,,0
1,2020.0,2020.0,2020.0,2020.0,2019.0,2015.0,2015.0,2015.0,2015.0,,1
2,2021.0,2020.0,,,,,,,,,2
3,2020.0,2020.0,2019.0,2019.0,2017.0,2017.0,2000.0,,,,3
4,2020.0,,,,,,,,,,4
...,...,...,...,...,...,...,...,...,...,...,...
75563,2018.0,2018.0,2018.0,,,,,,,,72253
75564,2019.0,2019.0,2019.0,2019.0,2019.0,2019.0,2019.0,2019.0,2019.0,2018.0,72254
75565,2018.0,2018.0,2018.0,,,,,,,,72255
75566,2018.0,2008.0,,,,,,,,,72256


Unnamed: 0_level_0,Unnamed: 1_level_0,priceHistory/event,priceHistory/postingIsRental,priceHistory/price,priceHistory/priceChangeRate,priceHistory/time,priceHistory
row_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,Listed for sale,0,799999.0,0.335558,2021-01-08,2021.0
1,0,Price change,0,3995000.0,-0.111235,2020-12-07,2020.0
2,0,Price change,0,1495000.0,-0.002668,2021-01-20,2021.0
3,0,Price change,0,3450000.0,-0.092105,2020-12-17,2020.0
4,0,Listed for sale,0,1790000.0,0.000000,2020-11-18,2020.0
...,...,...,...,...,...,...,...
72253,9,,,,,NaT,
72254,9,Listed for rent,1,10000.0,0.000000,2018-10-22,2018.0
72255,9,,,,,NaT,
72256,9,,,,,NaT,


priceHistory/0/month_year
priceHistory/0
priceHistory/1/month_year
priceHistory/1
priceHistory/2/month_year
priceHistory/2
priceHistory/3/month_year
priceHistory/3
priceHistory/4/month_year
priceHistory/4
priceHistory/5/month_year
priceHistory/5
priceHistory/6/month_year
priceHistory/6
priceHistory/7/month_year
priceHistory/7
priceHistory/8/month_year
priceHistory/8
priceHistory/9/month_year
priceHistory/9
row_num


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.rename(columns={y: new}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.rename(columns={y: new}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.rename(columns={y: new}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts_wide_sub.ren

Unnamed: 0,priceHistory/0,priceHistory/1,priceHistory/2,priceHistory/3,priceHistory/4,priceHistory/5,priceHistory/6,priceHistory/7,priceHistory/8,priceHistory/9,row_num
0,2021-01,2016-04,2015-03,2010-12,2010-06,2010-06,2010-02,NaT,NaT,NaT,0
1,2020-12,2020-10,2020-07,2020-03,2019-11,2015-12,2015-11,2015-08,2015-05,NaT,1
2,2021-01,2020-06,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,2
3,2020-12,2020-09,2019-03,2019-02,2017-11,2017-09,2000-08,NaT,NaT,NaT,3
4,2020-11,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,4
...,...,...,...,...,...,...,...,...,...,...,...
75563,2018-07,2018-02,2018-02,NaT,NaT,NaT,NaT,NaT,NaT,NaT,72253
75564,2019-07,2019-06,2019-04,2019-04,2019-03,2019-03,2019-03,2019-02,2019-01,2018-10,72254
75565,2018-07,2018-04,2018-03,NaT,NaT,NaT,NaT,NaT,NaT,NaT,72255
75566,2018-12,2008-05,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,72256


Unnamed: 0_level_0,Unnamed: 1_level_0,priceHistory/event,priceHistory/postingIsRental,priceHistory/price,priceHistory/priceChangeRate,priceHistory/time,priceHistory/year,priceHistory
row_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,Listed for sale,0,799999.0,0.335558,2021-01-08,2021.0,2021-01
1,0,Price change,0,3995000.0,-0.111235,2020-12-07,2020.0,2020-12
2,0,Price change,0,1495000.0,-0.002668,2021-01-20,2021.0,2021-01
3,0,Price change,0,3450000.0,-0.092105,2020-12-17,2020.0,2020-12
4,0,Listed for sale,0,1790000.0,0.000000,2020-11-18,2020.0,2020-11
...,...,...,...,...,...,...,...,...
72253,9,,,,,NaT,,NaT
72254,9,Listed for rent,1,10000.0,0.000000,2018-10-22,2018.0,2018-10
72255,9,,,,,NaT,,NaT
72256,9,,,,,NaT,,NaT


Unnamed: 0_level_0,Unnamed: 1_level_0,priceHistory/event,priceHistory/postingIsRental,priceHistory/price,priceHistory/priceChangeRate,priceHistory/time,priceHistory/year,priceHistory/month_year
row_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,Listed for sale,0,799999.0,0.335558,2021-01-08,2021.0,2021-01
1,0,Price change,0,3995000.0,-0.111235,2020-12-07,2020.0,2020-12
2,0,Price change,0,1495000.0,-0.002668,2021-01-20,2021.0,2021-01
3,0,Price change,0,3450000.0,-0.092105,2020-12-17,2020.0,2020-12
4,0,Listed for sale,0,1790000.0,0.000000,2020-11-18,2020.0,2020-11
...,...,...,...,...,...,...,...,...
72253,9,,,,,NaT,,NaT
72254,9,Listed for rent,1,10000.0,0.000000,2018-10-22,2018.0,2018-10
72255,9,,,,,NaT,,NaT
72256,9,,,,,NaT,,NaT


In [106]:
ts_wide_merge = ts_wide[['zpid', 'address/zipcode', 'price', 'days_on_Zillow', 'row_num']]
ts_long = pd.merge(ts_long, ts_wide_merge, on=['row_num'], validate = 'm:1')

In [107]:
ts_wide.to_csv('processed_dataset_ts_wide.csv')
ts_long.to_csv('processed_dataset_ts_long.csv')

In [108]:
display(ts_long)

Unnamed: 0,row_num,priceHistory/event,priceHistory/postingIsRental,priceHistory/price,priceHistory/priceChangeRate,priceHistory/time,priceHistory/year,priceHistory/month_year,zpid,address/zipcode,price,days_on_Zillow
0,0,Listed for sale,0,799999.0,0.335558,2021-01-08,2021.0,2021-01,31554048.0,10463.0,799999.0,12.0
1,0,Listing removed,0,599000.0,0.000000,2016-04-01,2016.0,2016-04,31554048.0,10463.0,799999.0,12.0
2,0,Listed for sale,0,599000.0,0.711429,2015-03-20,2015.0,2015-03,31554048.0,10463.0,799999.0,12.0
3,0,Listing removed,0,350000.0,0.000000,2010-12-23,2010.0,2010-12,31554048.0,10463.0,799999.0,12.0
4,0,Price change,0,350000.0,-0.066667,2010-06-10,2010.0,2010-06,31554048.0,10463.0,799999.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...
722575,72257,Listed for sale,0,925000.0,3.512195,2018-07-28,2018.0,2018-07,32004001.0,11375.0,820000.0,
722576,72257,Sold,0,205000.0,0.000000,1998-04-02,1998.0,1998-04,32004001.0,11375.0,820000.0,
722577,72257,,,,,NaT,,NaT,32004001.0,11375.0,820000.0,
722578,72257,,,,,NaT,,NaT,32004001.0,11375.0,820000.0,
