READER

In [118]:
import os
import pandas as pd
import regex as re
from datetime import datetime
import numpy as np

In [119]:
class Reader():
    """Load the CSV files that sit directly inside one folder.

    The folder is scanned once, at construction time. Dataframes are read
    lazily on the first call to ``DfList`` (or ``getDfByIndex``) and cached
    in ``self.dataFrames`` afterwards.
    """

    def __init__(self, file_path):
        """Record the names of the regular files found in ``file_path``.

        Sub-directories are ignored. Names are kept in the order
        ``os.listdir`` returned them.
        """
        self.__file_path = file_path
        self.__files = [
            f for f in os.listdir(file_path)
            if os.path.isfile(os.path.join(file_path, f))
        ]
        self.dataFrames = []

    # Returns a list with all the file names in a specific folder
    def listFile(self):
        """Return the file names collected when the reader was created."""
        return self.__files

    # Returns a list of all the dataframes in the folder from csv files
    def DfList(self):
        """Return one dataframe per file in the folder, loading on first use.

        Raises:
            ValueError: if any file in the folder does not end in ``.csv``.
        """
        if not self.dataFrames:
            # Check every name before reading anything, so a stray non-CSV
            # file cannot leave a half-filled cache that later calls would
            # return as if it were complete.
            if any(not name.endswith(".csv") for name in self.__files):
                raise ValueError("File must be a CSV.")
            loaded = [
                pd.read_csv(os.path.join(self.__file_path, name))
                for name in self.__files
            ]
            # Publish only once every file was read successfully.
            self.dataFrames.extend(loaded)
        return self.dataFrames

    # Returns a specific dataframe by index
    def getDfByIndex(self, index):
        """Return the dataframe at the given 1-based position.

        Raises:
            ValueError: if ``index`` is below 1 or above the number of
                loaded dataframes, or if the folder holds a non-CSV file.
        """
        self.DfList()
        index -= 1  # callers count from 1, the list counts from 0
        if index < 0 or index >= len(self.dataFrames):
            raise ValueError("Index out of range.")
        return self.dataFrames[index]

    # Returns the amount of files in the folder
    def getLength(self):
        """Return how many files were found in the folder."""
        return len(self.__files)
    

WRITER

In [120]:
class Writer():
    """Save a dataframe as ``<folder_path>/<fileName>.csv``.

    The file is written as soon as the object is created; ``writeCsv`` can
    be called again to rewrite it.
    """

    def __init__(self, dataFrame, fileName, folder_path):
        """Store the arguments and write the CSV file immediately.

        Args:
            dataFrame: the pandas DataFrame to save.
            fileName: file name without the ``.csv`` extension.
            folder_path: folder the file is written into.

        Raises:
            ValueError: if ``dataFrame`` is not a pandas DataFrame.
        """
        self.df = dataFrame
        self.fn = fileName
        self.folder_path = folder_path
        self.writeCsv()

    # Writes a dataframe to a CSV file
    def writeCsv(self):
        """Write the stored dataframe to disk without the index column.

        Raises:
            ValueError: if the stored object is not a pandas DataFrame.
        """
        if not isinstance(self.df, pd.DataFrame):
            raise ValueError("Dataframe must be a pandas DataFrame")

        # Create the target folder when it is missing instead of failing in
        # to_csv. An empty folder_path means the current directory, which
        # os.makedirs cannot be asked to create.
        if self.folder_path:
            os.makedirs(self.folder_path, exist_ok=True)

        self.df.to_csv(os.path.join(self.folder_path, self.fn + '.csv'), index=False)
        print("File saved successfully")


In [121]:
# Scan the local 'files' folder; the CSVs themselves are read on first access.
reader = Reader('files')
# Kept for reference: writes every loaded dataframe back out under a new name.
# filenames = ['test1', 'test2']

# for filename, file in zip(filenames, reader.DfList()):
#     Writer(file, filename, 'output')

# getDfByIndex counts from 1, so this is the first dataframe in the folder.
df = reader.getDfByIndex(1)



VALIDATOR

In [122]:
# Column names treated as optional by the row validator.
# NOTE(review): the identifier is misspelled ("manadatory"); it is kept as-is
# because other code refers to it by this exact name.
non_manadatory_columns = {
    'Suite/ Condo   #',
    'Owner Name',
    'Address',
    'City',
    'State',
    'Tax District',
    'image',
    'Foundation Type',
    'Exterior Wall',
    'Grade',
}

def validate_parcel_id(value):
    """Return True when ``value`` looks like ``ddd dd XX ddd.dd``.

    ``XX`` is one or two digits or capital letters. Non-string input is
    converted with ``str`` and surrounding whitespace is ignored.
    """
    text = value if isinstance(value, str) else str(value)
    parcel_format = r'^\d{3} \d{2} [0-9A-Z]{1,2} \d{3}\.\d{2}$'
    if re.match(parcel_format, text.strip()):
        return True
    return False

def validate_land_use(value):
    """Return True for a non-blank string that contains no lowercase letters."""
    if isinstance(value, str):
        text = value.strip()
        # A blank string is rejected; otherwise upper-casing must be a no-op.
        return text != "" and text.upper() == text
    return False

def validate_date(value):
    """Return True when ``value`` is a ``YYYY-MM-DD`` date that is not in the future.

    Anything that cannot be parsed with that exact format, or cannot be
    compared with the current time, yields False.
    """
    try:
        parsed = pd.to_datetime(value, format='%Y-%m-%d', errors='raise')
        # bool() keeps the result a plain True/False for every input.
        return bool(parsed <= datetime.today())
    except Exception:
        # Parsing or comparison failed. Unlike a bare ``except``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        return False

def validate_price(value):
    """Return True when ``value`` is a non-negative number.

    Accepts Python ``int``/``float`` and NumPy integer/floating scalars.
    ``np.int64`` is not a subclass of ``int``, so it needs its own entry.
    Strings, ``None`` and NaN are rejected.
    """
    if not isinstance(value, (int, float, np.integer, np.floating)):
        return False
    return bool(value >= 0)

def validate_legal_reference(value):
    """Return True when ``value`` is ``<7-8 digits>-<6-8 digits>``.

    A single leading ``-`` is tolerated. Non-string input is converted with
    ``str`` and every space is removed before matching.
    """
    text = value if isinstance(value, str) else str(value)
    compact = text.strip().replace(" ", "")
    return re.fullmatch(r'-?(\d{7,8})-(\d{6,8})', compact) is not None

def validate_sold_as_vacant(value):
    """Return True only for the exact strings 'Yes' and 'No'."""
    if isinstance(value, str):
        return value == 'Yes' or value == 'No'
    return False

def validate_acreage(value):
    """Return True when ``value`` converts to a non-negative float.

    Numeric strings are accepted. NaN and anything ``float`` cannot convert
    yield False.
    """
    try:
        return float(value) >= 0
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "invalid". A bare ``except`` would
        # also hide KeyboardInterrupt and SystemExit.
        return False
    
def validate_neighborhood(value):
    """Return True when ``value`` is a float that is zero or greater.

    NOTE(review): only floats pass, so a plain int code such as 4226 is
    rejected. That looks accidental (the original also listed ``int`` in a
    check it could never reach) — confirm the intended behaviour.
    """
    return isinstance(value, float) and value >= 0

def validate_year(value):
    """Return True when ``value`` is a year between 100 and the current year.

    The value is converted with ``int(float(value))``, so numeric strings
    and floats such as ``1986.0`` are accepted and any fraction is dropped.
    """
    try:
        year = int(float(value))
    except (TypeError, ValueError, OverflowError):
        # Not convertible (text, None, NaN, infinity). A bare ``except``
        # would also hide KeyboardInterrupt and SystemExit.
        return False
    return 100 <= year <= pd.Timestamp.now().year

def validate_numeric(value):
    """Return True when ``value`` is a number greater than or equal to zero.

    Python ``int``/``float`` and NumPy integer/floating scalars qualify.
    NumPy integers are listed explicitly because they do not inherit from
    ``int``. Text, ``None`` and NaN are rejected.
    """
    numeric_types = (int, float, np.integer, np.floating)
    return isinstance(value, numeric_types) and bool(value >= 0)

def validate_bed_bath(value):
    """Return True when a room count is a non-negative number.

    Besides Python ``int``/``float`` this accepts NumPy integer and floating
    scalars; ``np.int64`` would otherwise fail the ``int`` check. Text,
    ``None`` and NaN are rejected.
    """
    if isinstance(value, (int, float, np.integer, np.floating)):
        return bool(value >= 0)
    return False

def validate_row(row):
    """Check one record and return a list of error messages (empty if clean).

    ``row`` must offer ``.get(column)``. Each listed column is checked with
    its validator.

    NOTE(review): a missing (NaN) value is never reported, whether or not
    the column appears in ``non_manadatory_columns``. Confirm whether
    mandatory columns should be flagged when empty.
    """
    # Column name -> function that returns True for an acceptable value.
    checks = {
        'Parcel ID': validate_parcel_id,
        'Land Use': validate_land_use,
        'Sale Date': validate_date,
        'Sale Price': validate_price,
        'Legal Reference': validate_legal_reference,
        'Sold As Vacant': validate_sold_as_vacant,
        'Multiple Parcels Involved in Sale': validate_sold_as_vacant,
        'Acreage': validate_acreage,
        'Neighborhood': validate_neighborhood,
        'Land Value': validate_price,
        'Building Value': validate_price,
        'Total Value': validate_price,
        'Finished Area': validate_numeric,
        'Year Built': validate_year,
        'Bedrooms': validate_bed_bath,
        'Full Bath': validate_bed_bath,
        'Half Bath': validate_bed_bath,
    }

    problems = []
    for column, check in checks.items():
        value = row.get(column)
        is_missing = pd.isna(value)
        # Optional column left empty: nothing to check.
        if is_missing and column in non_manadatory_columns:
            continue
        # Missing values are skipped; present values must pass their check.
        if is_missing or check(value):
            continue
        problems.append(f"Invalid value in column '{column}': {value}")

    return problems

def validate_dataset(df):
    """Validate every row of ``df``.

    Returns a dict mapping the index label of each failing row to its list
    of error messages. Rows without errors are left out, so an empty dict
    means the whole dataframe passed.
    """
    report = {}
    for label, record in df.iterrows():
        problems = validate_row(record)
        if not problems:
            continue
        report[label] = problems
    return report

# Validate the dataframe loaded earlier and report the outcome on stdout.
validation_errors = validate_dataset(df)

if not validation_errors:
    print("Validation passed: No errors found.")
else:
    # One header line per failing row, followed by its messages.
    for row_idx, errors in validation_errors.items():
        print(f"Row {row_idx}:")
        for error in errors:
            print(f"  - {error}")

Validation passed: No errors found.


In [124]:
def process_data(df):
    """Clean the sales dataframe and add derived columns.

    Steps:
      * drop rows that lack any of the key fields;
      * drop the 'image', 'Sold As Vacant' and
        'Multiple Parcels Involved in Sale' columns when present;
      * add 'Price per square foot', 'Age of property', 'Sale Year',
        'Sale Month', 'Land-to-Building Value Ratio',
        'Sale Price Category', 'Family Name' and 'First Name'.

    The input dataframe is not modified; a new one is returned. An input
    with no rows yields an empty result that still has the new columns.
    """
    # NOTE(review): the original author was unsure which columns are truly
    # mandatory — confirm this list with the data owner.
    mandatory_columns = ['Parcel ID', 'Land Use', 'Sale Date', 'Sale Price', 'Legal Reference']
    columns_to_remove = ['image', 'Sold As Vacant', 'Multiple Parcels Involved in Sale']

    # errors='ignore' tolerates inputs that lack an optional column.
    # .copy() guarantees the assignments below target an independent frame.
    df = (
        df.dropna(subset=mandatory_columns)
        .drop(columns=columns_to_remove, errors='ignore')
        .copy()
    )

    def ratio(numerator, denominator):
        # A zero denominator gives NaN rather than inf in the output.
        return numerator / denominator.where(denominator != 0)

    # Price per square foot
    # NOTE(review): the original author was unsure whether 'Sale Price' or
    # 'Total Value' is the right numerator — confirm.
    df['Price per square foot'] = ratio(df['Sale Price'], df['Finished Area'])

    # Age of property
    df['Age of property'] = datetime.today().year - df['Year Built']

    # Sale year and sale month (parse the dates once and reuse them)
    sale_dates = pd.to_datetime(df['Sale Date'])
    df['Sale Year'] = sale_dates.dt.year
    df['Sale Month'] = sale_dates.dt.month

    # Land-to-building value ratio
    df['Land-to-Building Value Ratio'] = ratio(df['Land Value'], df['Building Value'])

    # Sale price category
    def categorize_sale_price(price):
        if price < 100000:
            return 'Low'
        elif price <= 300000:
            return 'Medium'
        else:
            return 'High'

    df['Sale Price Category'] = df['Sale Price'].apply(categorize_sale_price)

    # Family Name and First Name of owner
    def extract_name(owner_name):
        # Expected form is "FAMILY, FIRST". Anything else is kept whole as
        # the family name, with no first name.
        if pd.isna(owner_name):
            return np.nan, np.nan
        name_parts = owner_name.split(', ')
        if len(name_parts) == 2:
            return name_parts[0], name_parts[1]
        return owner_name, np.nan

    # Build both columns from one list of pairs. Unlike unpacking
    # ``zip(*pairs)``, this also works when no rows are left.
    name_pairs = [extract_name(owner) for owner in df['Owner Name']]
    df['Family Name'] = [family for family, _ in name_pairs]
    df['First Name'] = [first for _, first in name_pairs]

    return df


# Clean the loaded dataframe and add the derived columns.
df_processed = process_data(df)





In [125]:
# Writer saves the file as soon as it is constructed, so this line alone
# writes processed_data.csv into the 'output' folder.
Writer(df_processed, 'processed_data', 'output')

File saved successfully


<__main__.Writer at 0x2a08ba46990>