# Processor

- Remove all rows containing a missing value in a mandatory column
- Remove columns ‘image’, ‘Sold As Vacant’ and ‘Multiple Parcels Involved in Sale’
- Add columns:
   - Price per square foot
   - Age of property
   - Sale year and sale month
   - Land-to-building value ratio
   - Sale price category: Low (< 100 000), Medium (100 000- 300 000), High (>300 000)
   - Family Name and First name of owner (first person listed)

In [None]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
import os
from reader import Reader


In [18]:
class Processor:
    
    def __init__(self):
        reader = Reader()
        self.data = reader.load_data()
        non_mandatory = ['Suite/Condo #', 'Owner Name', 'Address', 'City', 'State', 'Tax District', 'image', 'Foundation Type', 'Exterior Wall', 'Grade']
        self.mandatory_columns = [col for col in self.data.columns if col not in non_mandatory]
    
    # Remove all rows containing a missing value in a mandatory column.    
    def remove_rows_with_missing_mandatory_values(self):
        self.data = self.data.dropna(subset=self.mandatory_columns)
        return self
        
    # Remove columns ‘image’, ‘Sold As Vacant’ and ‘Multiple Parcels Involved in Sale’.
    def remove_columns(self):
        columns_to_remove = ['image', 'Sold As Vacant', 'Multiple Parcels Involved in Sale']
        existing_columns = [col for col in columns_to_remove if col in self.data.columns]
        self.data = self.data.drop(columns=existing_columns)
        return self
        
    # Price per square foot.
    def add_price_per_sqft(self):
        self.data['Price per Square Foot'] = self.data['Sale Price'] / self.data['Finished Area']
        return self
    
    # Age of property.
    def add_property_age(self):
        if not pd.api.types.is_datetime64_any_dtype(self.data['Sale Date']):
            self.data['Sale Date'] = pd.to_datetime(self.data['Sale Date'])
            
        sale_years = self.data['Sale Date'].dt.year
        
        self.data['Property Age'] = sale_years - self.data['Year Built']
        return self
        
    # Sale year and sale month.
    def add_sale_year_month(self):
        if not pd.api.types.is_datetime64_any_dtype(self.data['Sale Date']):
            self.data['Sale Date'] = pd.to_datetime(self.data['Sale Date'])
            
        self.data['Sale Year'] = self.data['Sale Date'].dt.year
        self.data['Sale Month'] = self.data['Sale Date'].dt.month
        return self
        
    # Land-to-building value ratio.
    def add_land_building_ratio(self):
        self.data['Land-to-Building Ratio'] = self.data['Land Value'] / self.data['Building Value'].replace(0, np.nan)
        return self
     
    # Sale price category: Low (< 100 000), Medium (100 000 - 300 000), High (>300 000).   
    def add_price_category(self):
        conditions = [
            self.data['Sale Price'] < 100000,
            (self.data['Sale Price'] >= 100000) & (self.data['Sale Price'] <= 300000),
            self.data['Sale Price'] > 300000
        ]
        choices = ['Low', 'Medium', 'High']
        self.data['Sale Price Category'] = np.select(conditions, choices, default='Unknown')
        return self
    
    
    # Family Name and First name of owner (first person listed)
    def extract_owner_names(self):
        def extract_names(owner_string):
            if pd.isna(owner_string):
                return pd.Series([np.nan, np.nan])
                
            owner_string = str(owner_string).strip()
            
            if '&' in owner_string:
                owner_string = owner_string.split('&')[0].strip()
            if ',' in owner_string:
                parts = owner_string.split(',', 1)
                family_name = parts[0].strip()
                first_name = parts[1].strip() if len(parts) > 1 else np.nan
            else:
                parts = owner_string.split()
                if len(parts) > 1:
                    first_name = parts[0].strip()
                    family_name = ' '.join(parts[1:]).strip()
                else:
                    family_name = owner_string
                    first_name = np.nan
                    
            return pd.Series([family_name, first_name])
            
        self.data[['Family Name', 'First Name']] = self.data['Owner Name'].apply(extract_names)
        return self
    
    # Process all the clean up functions.
    def process(self):
        self.remove_rows_with_missing_mandatory_values()
        self.remove_columns()
        self.add_price_per_sqft()
        self.add_property_age()
        self.add_sale_year_month()
        self.add_land_building_ratio()
        self.add_price_category()
        self.extract_owner_names()
        self.data = self.data.reset_index(drop=True)
        return self
    
    #Save the data to the /output folder.
    def save_data(self):
        output_path = 'output/processed_nashville_housing.csv'
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        self.data.to_csv(output_path, index=False)
        return self
    
    # To show the processed data.   
    def get_processed_data(self):
        return self.data

In [19]:
processor = Processor()
processor.process()
processor.save_data()
processed_data = processor.get_processed_data()
processed_data

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Parcel ID,Land Use,Property Address,Suite/ Condo #,Property City,Sale Date,Sale Price,Legal Reference,Owner Name,Address,City,State,Acreage,Tax District,Neighborhood,Land Value,Building Value,Total Value,Finished Area,Foundation Type,Year Built,Exterior Wall,Grade,Bedrooms,Full Bath,Half Bath,Price per Square Foot,Property Age,Sale Year,Sale Month,Land-to-Building Ratio,Sale Price Category,Family Name,First Name
0,7822,7822,118 09 0 252.00,ZERO LOT LINE,1129 CLIFTON LN,3-B,NASHVILLE,2013-09-04,201000,20130909-0094904,"CRUTCHFIELD, BILL & LEAH",1129 3-B CLIFTON LN,NASHVILLE,TN,0.14,URBAN SERVICES DISTRICT,3726.0,96000.0,113700.0,209700.0,1332.0,CRAWL,1984.0,BRICK,C,2.0,3.0,0.0,150.900901,29.0,2013,9,0.844327,Medium,CRUTCHFIELD,BILL
1,24933,24933,159 00 0 205.00,ZERO LOT LINE,1245 OLD HICKORY BLVD,2-A,BRENTWOOD,2014-12-12,275000,20141217-0115474,"RONVEAUX, KAY B. & RICHARD C.",1245 2-A OLD HICKORY BLVD,BRENTWOOD,TN,0.52,GENERAL SERVICES DISTRICT,6528.0,146900.0,145100.0,292000.0,2333.0,CRAWL,1985.0,FRAME,B,2.0,2.0,0.0,117.873982,29.0,2014,12,1.012405,Medium,RONVEAUX,KAY B.
