In [48]:
# Import necessary libraries
import pandas as pd
from reader import Reader

# Load the data using the Reader class
reader = Reader('input/Nashville_housing_data_2013_2016.csv')
data = reader.load_data()

# Summarise the dataset: column names, non-null counts and dtypes.
# DataFrame.info() writes the report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56636 entries, 0 to 56635
Data columns (total 31 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Unnamed: 0.1                       56636 non-null  int64  
 1   Unnamed: 0                         56636 non-null  int64  
 2   Parcel ID                          56636 non-null  object 
 3   Land Use                           56636 non-null  object 
 4   Property Address                   56477 non-null  object 
 5   Suite/ Condo   #                   6109 non-null   object 
 6   Property City                      56477 non-null  object 
 7   Sale Date                          56636 non-null  object 
 8   Sale Price                         56636 non-null  int64  
 9   Legal Reference                    56636 non-null  object 
 10  Sold As Vacant                     56636 non-null  object 
 11  Multiple Parcels Involved in Sale  56636 non-null  obj

In [49]:
# Tally the missing entries column by column
null_counts = data.isna().sum()

# Emit the heading and the per-column tally, one per line
print("Null values in each column:", null_counts, sep="\n")

Null values in each column:
Unnamed: 0.1                             0
Unnamed: 0                               0
Parcel ID                                0
Land Use                                 0
Property Address                       159
Suite/ Condo   #                     50527
Property City                          159
Sale Date                                0
Sale Price                               0
Legal Reference                          0
Sold As Vacant                           0
Multiple Parcels Involved in Sale        0
Owner Name                           31375
Address                              30619
City                                 30619
State                                30619
Acreage                              30619
Tax District                         30619
Neighborhood                         30619
image                                31301
Land Value                           30619
Building Value                       30619
Total Value               

In [50]:
# Preview the first five rows of the dataset as plain text
print(data.head(n=5))

   Unnamed: 0.1  Unnamed: 0         Parcel ID           Land Use  \
0             0           0  105 03 0D 008.00  RESIDENTIAL CONDO   
1             1           1   105 11 0 080.00      SINGLE FAMILY   
2             2           2   118 03 0 130.00      SINGLE FAMILY   
3             3           3   119 01 0 479.00      SINGLE FAMILY   
4             4           4   119 05 0 186.00      SINGLE FAMILY   

    Property Address Suite/ Condo   # Property City   Sale Date  Sale Price  \
0    1208  3RD AVE S                8     NASHVILLE  2013-01-24      132000   
1   1802  STEWART PL              NaN     NASHVILLE  2013-01-11      191500   
2  2761  ROSEDALE PL              NaN     NASHVILLE  2013-01-18      202000   
3  224  PEACHTREE ST              NaN     NASHVILLE  2013-01-18       32000   
4      316  LUTIE ST              NaN     NASHVILLE  2013-01-23      102000   

    Legal Reference Sold As Vacant Multiple Parcels Involved in Sale  \
0  20130128-0008725             No          

In [51]:
# Per-column count of missing (NaN) entries across the whole dataset
missing_values = data.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")

# Every row that has at least one missing field
missing_rows = data.loc[data.isna().any(axis="columns")]
print("\nRows with missing values:", missing_rows, sep="\n")

Missing values in each column:
Unnamed: 0.1                             0
Unnamed: 0                               0
Parcel ID                                0
Land Use                                 0
Property Address                       159
Suite/ Condo   #                     50527
Property City                          159
Sale Date                                0
Sale Price                               0
Legal Reference                          0
Sold As Vacant                           0
Multiple Parcels Involved in Sale        0
Owner Name                           31375
Address                              30619
City                                 30619
State                                30619
Acreage                              30619
Tax District                         30619
Neighborhood                         30619
image                                31301
Land Value                           30619
Building Value                       30619
Total Value            

In [52]:
# List every column label as a plain Python list
print("All column names in the dataset:", data.columns.tolist(), sep="\n")

All column names in the dataset:
['Unnamed: 0.1', 'Unnamed: 0', 'Parcel ID', 'Land Use', 'Property Address', 'Suite/ Condo   #', 'Property City', 'Sale Date', 'Sale Price', 'Legal Reference', 'Sold As Vacant', 'Multiple Parcels Involved in Sale', 'Owner Name', 'Address', 'City', 'State', 'Acreage', 'Tax District', 'Neighborhood', 'image', 'Land Value', 'Building Value', 'Total Value', 'Finished Area', 'Foundation Type', 'Year Built', 'Exterior Wall', 'Grade', 'Bedrooms', 'Full Bath', 'Half Bath']


In [53]:
# Show the dtype of every column in the loaded dataset
print(data.dtypes)

Unnamed: 0.1                           int64
Unnamed: 0                             int64
Parcel ID                             object
Land Use                              object
Property Address                      object
Suite/ Condo   #                      object
Property City                         object
Sale Date                             object
Sale Price                             int64
Legal Reference                       object
Sold As Vacant                        object
Multiple Parcels Involved in Sale     object
Owner Name                            object
Address                               object
City                                  object
State                                 object
Acreage                              float64
Tax District                          object
Neighborhood                         float64
image                                 object
Land Value                           float64
Building Value                       float64
Total Valu

In [54]:

import pandas as pd
import numpy as np
from datetime import datetime
from reader import Reader
import re


class Validator:
    """Record-by-record validator for a housing-sales DataFrame.

    Typical use::

        validator = Validator().load_data(frame).validate_dataset()
        summary = validator.get_validation_summary()

    Missing (null) values and absent columns never produce errors; a check only
    runs when the field it inspects is present and non-null in the record.
    """

    def __init__(self, data=None):
        """Store the optional DataFrame and start with empty validation results.

        Args:
            data: DataFrame to validate, or None if it will be supplied later
                through ``load_data``.
        """
        self.data = data
        self.validation_results = self._empty_results()

        # Define non-mandatory columns that won't trigger errors when missing.
        # The checks in this class do not read this list; it is kept as a
        # public attribute for callers.
        self.non_mandatory_columns = [
            'Suite/ Condo   #', 'Owner Name', 'Address', 'City', 'State',
            'Tax District', 'Foundation Type', 'Exterior Wall', 'Grade'
        ]

    @staticmethod
    def _empty_results():
        """Return a fresh, zeroed validation-results dictionary."""
        return {
            'valid_records': 0,
            'invalid_records': 0,
            'validation_errors': []
        }

    def load_data(self, data):
        """Attach the DataFrame to validate and return ``self`` for chaining."""
        self.data = data
        return self

    def validate_dataset(self):
        """Validate every row of the loaded data and tally the outcome.

        Results from any previous run are discarded first.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: if no data has been loaded.
        """
        if self.data is None:
            raise ValueError("No data loaded. Call load_data() first.")

        self.validation_results = self._empty_results()

        # Validate each record
        for index, record in self.data.iterrows():
            record_errors = self._validate_record(record, index)

            if record_errors:
                self.validation_results['invalid_records'] += 1
                self.validation_results['validation_errors'].extend(record_errors)
            else:
                self.validation_results['valid_records'] += 1

        return self

    @staticmethod
    def _present(record, column):
        """Return ``record[column]``, or None when the column is absent or null."""
        value = record.get(column)
        return value if pd.notna(value) else None

    def _string_or_error(self, record, index, column, errors):
        """Return the field if it is a present string, else None.

        A present, non-string value appends "<column> should be a string" to
        ``errors``; a missing value is silently accepted.
        """
        value = self._present(record, column)
        if value is None:
            return None
        if not isinstance(value, str):
            errors.append(f"Record {index}: {column} should be a string")
            return None
        return value

    def _validate_record(self, record, index):
        """Run every field check on one row.

        Args:
            record: the row, as yielded by ``DataFrame.iterrows()``.
            index: the row label, used to prefix each error message.

        Returns:
            A list of human-readable error strings (empty when the row is valid).
        """
        errors = []

        # --- Non-mandatory field validations ---

        # Suite/Condo: string or any numeric scalar. NumPy scalar types are
        # listed explicitly because np.int64 is not a subclass of Python int,
        # and iterrows() yields NumPy scalars for single-dtype frames.
        suite = self._present(record, 'Suite/ Condo   #')
        if suite is not None and not isinstance(
            suite, (str, int, float, np.integer, np.floating)
        ):
            errors.append(f"Record {index}: Suite/Condo should be string or numeric")

        # Free-text fields: only the type is checked
        for column in ('Owner Name', 'Address', 'City'):
            self._string_or_error(record, index, column, errors)

        # State: must be a string of exactly two characters
        state = self._string_or_error(record, index, 'State', errors)
        if state is not None and len(state) != 2:  # 2-letter state codes
            errors.append(f"Record {index}: State should be a 2-letter code")

        self._string_or_error(record, index, 'Tax District', errors)

        # Foundation Type: uppercase letters and spaces only (case-sensitive),
        # ignoring leading/trailing whitespace
        foundation = self._string_or_error(record, index, 'Foundation Type', errors)
        if foundation is not None and not re.fullmatch(r'[A-Z ]+', foundation.strip()):
            errors.append(f"Record {index}: Foundation Type '{foundation}' is invalid (must contain only uppercase letters and spaces)")

        # Exterior Wall: uppercase letters, spaces or slashes only (case-sensitive),
        # ignoring leading/trailing whitespace
        exterior = self._string_or_error(record, index, 'Exterior Wall', errors)
        if exterior is not None and not re.fullmatch(r'[A-Z/ ]+', exterior.strip()):
            errors.append(f"Record {index}: Exterior Wall '{exterior}' is invalid (must contain only uppercase letters, spaces, or slashes)")

        # NOTE: Grade is not validated. A check requiring a single uppercase
        # letter optionally followed by + or - existed here but was disabled.

        # --- Fields used in calculations (if present) ---

        # Sale Price and Finished Area feed the price-per-square-foot
        # calculation, so they are only checked when both are available.
        sale_price = self._present(record, 'Sale Price')
        finished_area = self._present(record, 'Finished Area')
        if sale_price is not None and finished_area is not None:
            try:
                float(sale_price)
            except (ValueError, TypeError):
                errors.append(f"Record {index}: Sale Price should be numeric")

            try:
                if float(finished_area) == 0:
                    errors.append(f"Record {index}: Finished Area cannot be zero (division by zero in price per sqft)")
            except (ValueError, TypeError):
                errors.append(f"Record {index}: Finished Area should be numeric")

        # Year Built for property age calculation
        year_value = self._present(record, 'Year Built')
        if year_value is not None:
            try:
                year_built = float(year_value)
                if year_built < 1700 or year_built > datetime.now().year:
                    errors.append(f"Record {index}: Year Built ({year_built}) is outside reasonable range")
            except (ValueError, TypeError):
                errors.append(f"Record {index}: Year Built should be numeric")

        # Sale Date: only string values are checked against YYYY-MM-DD;
        # values of any other type are accepted as they are.
        sale_date = self._present(record, 'Sale Date')
        if isinstance(sale_date, str):
            try:
                datetime.strptime(sale_date, '%Y-%m-%d')
            except ValueError:
                errors.append(f"Record {index}: Sale Date should be in YYYY-MM-DD format")

        # NOTE: Building Value is not validated. A zero / non-numeric check
        # (guarding a land-to-building ratio) existed here but was disabled.

        return errors

    def get_validation_results(self):
        """Return the raw results dict: valid/invalid counts and the error list."""
        return self.validation_results

    def get_validation_summary(self):
        """Return total, valid and invalid record counts plus the number of errors."""
        results = self.validation_results
        return {
            'total_records': results['valid_records'] + results['invalid_records'],
            'valid_records': results['valid_records'],
            'invalid_records': results['invalid_records'],
            'error_count': len(results['validation_errors']),
        }

    def get_validated_data(self):
        """Return the loaded data as stored; invalid records are not filtered out."""
        return self.data


# Example usage
if __name__ == "__main__":

    # Load the housing data through the project's Reader
    reader = Reader('input/Nashville_housing_data_2013_2016.csv')
    data = reader.load_data()

    # Initialize validator and validate data
    validator = Validator()
    validator.load_data(data).validate_dataset()

    # Print validation summary
    print("Validation Summary:")
    print(validator.get_validation_summary())

    # Print a sample of the validation errors, if any; the sample size is
    # defined once so the heading, the slice and the remainder stay in sync.
    max_errors_shown = 10
    errors = validator.get_validation_results()['validation_errors']
    if errors:
        print(f"\nSample validation errors (first {max_errors_shown}):")
        for error in errors[:max_errors_shown]:
            print(error)
        hidden_count = len(errors) - max_errors_shown
        if hidden_count > 0:
            print(f"...and {hidden_count} more errors")

    # Get validated data for processing
    validated_data = validator.get_validated_data()

Validation Summary:
{'total_records': 56636, 'valid_records': 56634, 'invalid_records': 2, 'error_count': 2}

Sample validation errors (first 10):
Record 5846: Finished Area cannot be zero (division by zero in price per sqft)
Record 14568: Finished Area cannot be zero (division by zero in price per sqft)
