# 2.2 Pre-Prep
Do some basic feature engineering that uses only the values within each row, never values from other rows.
Then split into train and test sets. After this script, we will start exploring the variables, doing feature engineering that depends on values in other rows, and modelling, so the split has to happen first to stop any bias or data leakage from the test set making its way into the training set.

In [1]:
from datetime import date
import os
import math
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)

In [2]:
# Folder the input CSV is read from, and folder the train/test CSVs are written to.
_DATA_ROOT = 'data'
DATA_FOLDER = os.path.join(_DATA_ROOT, 'processed')
SAVE_FOLDER = os.path.join(_DATA_ROOT, 'processed_2')

Read in our dataset

In [3]:
# Read every column as a string so nothing is coerced on load;
# the date columns are parsed explicitly afterwards.
input_csv = os.path.join(DATA_FOLDER, 'zoopla_properties_with_postcode_epc_dep_flood.csv')
zoopla_df = pd.read_csv(input_csv, dtype=str)

### Get a diff in months between last_published_date and first_published_date
The longer the diff, the lower I expect the price to be, as the original estimate was likely too high.

In [4]:
# first convert columns from strings into datetimes
DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
for date_col in ('last_published_date', 'first_published_date'):
    zoopla_df[date_col] = pd.to_datetime(zoopla_df[date_col], format=DATE_FORMAT)

# calculate months diff
# A "month" here is the average Gregorian month: 365.2425 / 12 days = 2,629,746 seconds.
# Spelling it out as an explicit Timedelta avoids the ambiguous np.timedelta64(1, 'M') unit
# (rejected by recent pandas releases) while producing the same whole-month counts.
AVERAGE_MONTH = pd.Timedelta(seconds=2_629_746)
time_on_market = zoopla_df['last_published_date'] - zoopla_df['first_published_date']

# Vectorised floor-divide instead of a row-wise apply with math.floor.
# A missing date yields <NA> (nullable Int64) rather than raising.
zoopla_df['diff_published_date'] = np.floor(time_on_market / AVERAGE_MONTH).astype('Int64')

zoopla_df[['last_published_date', 'first_published_date', 'diff_published_date']].tail(10)

Unnamed: 0,last_published_date,first_published_date,diff_published_date
875,2023-01-10 16:58:02,2022-02-02 15:21:03,11
876,2022-10-31 10:14:33,2022-01-10 17:35:52,9
877,2023-01-19 16:54:36,2022-01-07 09:08:00,12
878,2023-01-05 02:33:10,2021-12-26 06:08:04,12
879,2023-01-05 02:33:10,2021-12-26 06:07:38,12
880,2022-11-05 09:36:39,2021-11-10 08:27:04,11
881,2022-10-17 15:04:20,2021-10-27 16:10:50,11
882,2023-01-27 16:21:47,2021-07-09 11:01:13,18
883,2022-09-23 09:10:05,2021-06-30 13:06:57,14
884,2022-12-26 10:49:34,2020-08-27 13:09:17,27


### Split up last_published_date and first_published_date into year and month components
This will reflect change in house prices with year and seasonal variation.

In [5]:
# For each publication timestamp, derive year and month columns and
# preview the last few rows next to the source timestamp.
date_parts = (
    ('last_published_date', 'last_published_year', 'last_published_month'),
    ('first_published_date', 'first_published_year', 'first_published_month'),
)
for date_col, year_col, month_col in date_parts:
    published = zoopla_df[date_col]
    zoopla_df[year_col] = published.dt.year
    zoopla_df[month_col] = published.dt.month
    display(zoopla_df[[date_col, year_col, month_col]].tail())

# the original timestamps are dropped once their components are extracted
zoopla_df = zoopla_df.drop(columns=['last_published_date', 'first_published_date'])

Unnamed: 0,last_published_date,last_published_year,last_published_month
880,2022-11-05 09:36:39,2022,11
881,2022-10-17 15:04:20,2022,10
882,2023-01-27 16:21:47,2023,1
883,2022-09-23 09:10:05,2022,9
884,2022-12-26 10:49:34,2022,12


Unnamed: 0,first_published_date,first_published_year,first_published_month
880,2021-11-10 08:27:04,2021,11
881,2021-10-27 16:10:50,2021,10
882,2021-07-09 11:01:13,2021,7
883,2021-06-30 13:06:57,2021,6
884,2020-08-27 13:09:17,2020,8


### Aggregate postcodes
The number of distinct postcodes is too high compared with the total number of rows so will have limited predictive power. Instead keep the first part of the postcode before the space and the first number after the space.

In [6]:
# Compare two candidate postcode groupings by looking at their value counts.
candidate_patterns = (
    r'(^[A-Z]+[0-9]+)',      # all characters before the space
    r'(^[A-Z]+[0-9]+\s\d)',  # all characters before the space and first character after it
)
for pattern in candidate_patterns:
    # expand=False returns the single capture group as a Series
    zoopla_df['postcode_tmp'] = zoopla_df['postcode'].str.extract(pattern, expand=False)
    display(zoopla_df['postcode_tmp'].value_counts())

CV10    310
LE10    295
CV11    249
CV13     18
CV12     10
CV9       2
LE9       1
Name: postcode_tmp, dtype: int64

LE10 0    172
CV11 6    124
LE10 1    103
CV10 9     95
CV10 0     85
CV10 7     68
CV11 4     66
CV10 8     62
CV11 5     41
CV11 7     18
CV13 6     15
LE10 2     12
CV12 9      9
LE10 3      8
CV13 0      3
CV9 3       2
LE9 7       1
CV12 8      1
Name: postcode_tmp, dtype: int64

Use the second approach - it gives a good mix of distinct values, most of them with reasonable value counts

In [7]:
# Overwrite the full postcode with the part before the space plus the first digit after it
# (postcodes that do not match the pattern become NaN), then discard the scratch column.
postcode_group = zoopla_df['postcode'].str.extract(r'(^[A-Z]+[0-9]+\s\d)', expand=False)
zoopla_df = zoopla_df.assign(postcode=postcode_group).drop(columns='postcode_tmp')

Remove towns that are not Nuneaton or Hinckley

In [8]:
# dropna=False so rows with a missing town are counted as well
post_town_counts = zoopla_df['post_town'].value_counts(dropna=False)
display(post_town_counts)

Nuneaton      580
Hinckley      292
Bedworth        7
Atherstone      5
Derby           1
Name: post_town, dtype: int64

In [9]:
# keep only the listings whose post town is Nuneaton or Hinckley
in_target_town = zoopla_df['post_town'].isin(['Nuneaton', 'Hinckley'])
zoopla_df = zoopla_df[in_target_town]

Aggregate parishes

In [10]:
# dropna=False so rows with a missing parish are counted as well
parish_counts = zoopla_df['parish'].value_counts(dropna=False)
display(parish_counts)

Nuneaton and Bedworth, unparished area    558
Hinckley and Bosworth, unparished area    280
Burbage                                    14
Stoke Golding                               9
Higham on the Hill                          4
Witherley                                   2
Nailstone                                   1
Earl Shilton                                1
Market Bosworth                             1
Barlestone                                  1
Sheepy                                      1
Name: parish, dtype: int64

In [11]:
def agg_parish(parish):
    """Return ``parish`` unchanged if it is one of the four retained parishes, else ``'Other'``.

    Any value not in the retained list (including a missing value) maps to ``'Other'``.
    """
    retained = (
        'Nuneaton and Bedworth, unparished area',
        'Hinckley and Bosworth, unparished area',
        'Burbage',
        'Stoke Golding',
    )
    return parish if parish in retained else 'Other'
                  
# collapse every parish outside the retained list into the single 'Other' category
zoopla_df = zoopla_df.assign(parish=zoopla_df['parish'].apply(agg_parish))

### Split into train and test set and save

In [12]:
# Shuffle all rows with a fixed seed, then cut the shuffled frame 70:30 into train and test.
# NOTE: zoopla_df is rebound to the shuffled frame, so re-running this cell on its own
# shuffles the already-shuffled rows and produces a different split.
TRAIN_FRACTION = 0.7
RANDOM_STATE = 42

zoopla_df = zoopla_df.sample(frac=1.0, replace=False, random_state=RANDOM_STATE)
split_index = int(TRAIN_FRACTION * len(zoopla_df))

zoopla_df_train, zoopla_df_test = zoopla_df.iloc[:split_index], zoopla_df.iloc[split_index:]

for label, frame in (('full', zoopla_df), ('train', zoopla_df_train), ('test', zoopla_df_test)):
    print(f'Length {label} dataset = {len(frame)}')

Length full dataset = 872
Length train dataset = 610
Length test dataset = 262


In [13]:
# Create the output folder, including any missing parent folders. An already-existing
# folder is fine; any other failure (e.g. a permission error) is raised here instead of
# being swallowed and resurfacing later as a confusing to_csv error.
os.makedirs(SAVE_FOLDER, exist_ok=True)

save_file_train = os.path.join(SAVE_FOLDER, 'zoopla_train.csv')
save_file_test = os.path.join(SAVE_FOLDER, 'zoopla_test.csv')

# index=False: the shuffled row labels are not written to the files
zoopla_df_train.to_csv(save_file_train, index=False)
zoopla_df_test.to_csv(save_file_test, index=False)