# Test notebook

In [1]:
import pandas as pd
import numpy as np
import os
import zipfile
from zipfile import ZipFile as zf
import sqlite3

In [2]:
def unzip_files(src, dst, file_list):
    for root, dirs, files in os.walk(src):
        for name in files:
            # file_name = name.split('.')[0]
            file_type = name.split(".")[-1]
            if file_type == "zip":
                file = str(os.path.join(root, name))

                with zf(file, "r") as zFile:
                    all_files = zFile.namelist()
                    files_to_unzip = [f for f in all_files if f in file_list]

                    if not files_to_unzip:
                        print(f"No files in {file}")

                    for file in files_to_unzip:
                        print(f"Unzipping {file}")
                        zFile.extract(file, f"{dst}/")

In [3]:
def load_tables_to_dataframe(file_name) -> pd.DataFrame | pd.Series:
    encoder_dict = {
        "building_res.txt": "Windows - 1252",
        "exterior.txt": "ascii",
        "extra_features.txt": "ascii",
        "fixtures.txt": "ascii",
        "land.txt": "ascii",
        "real_neighborhood_code.txt": "ascii",
        "real_acct.txt": "Windows - 1252",
        "parcels.csv": "utf-8",
        "extra_features_detail1.txt": "utf-8",
        "kaggle_dataset.csv": "utf-8",
    }

    encoder = encoder_dict[file_name]

    try:
        print(f"Reading {file_name} into dataframe...")
        if file_name == "parcels.csv":
            # Exception for the parcel file exported from QGIS
            df = pd.read_csv(f"Data/{file_name}", low_memory=False)
            df = df[["HCAD_NUM", "latitude", "longitude"]]
        else:
            df = pd.read_csv(
                f"Data/{file_name}", sep="\t", encoding=encoder, low_memory=False
            )

        # Strip extra spaces on all object column types
        df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

    except Exception as e:
        print(f"{file_name} was not read by pandas. See exception:\n {e}")
    return df

In [4]:
file_list = [
    "real_neighborhood_code.txt",
    "building_res.txt",
    "real_acct.txt",
    "land.txt",
    "fixtures.txt",
    "extra_features.txt",
    "exterior.txt",
    "extra_features_detail1.txt",
    "parcels.csv",
]
# Extract files
unzip_files(src="Zips", dst="Data", file_list=file_list)

Unzipping building_res.txt
Unzipping exterior.txt
Unzipping extra_features.txt
Unzipping extra_features_detail1.txt
Unzipping fixtures.txt
Unzipping land.txt
No files in Zips/Code_description_real.zip
Unzipping real_acct.txt
Unzipping real_neighborhood_code.txt
No files in Zips/Parcels.zip


## Building Res file
There is a row for each building on the property. Most have one building, but some have 2, 3, 4, etc. so to make the account number unique so it can be 

In [5]:
building_res = load_tables_to_dataframe("building_res.txt")
desc_mapping = {
    "Poor": 0,
    "Very Low": 1,
    "Low": 2,
    "Average": 3,
    "Good": 4,
    "Excellent": 5,
    "Superior": 6,
}
building_res["dscr"] = building_res["dscr"].replace(desc_mapping)
building_res.head()

Reading building_res.txt into dataframe...


  building_res["dscr"] = building_res["dscr"].replace(desc_mapping)


Unnamed: 0,acct,property_use_cd,bld_num,impr_tp,impr_mdl_cd,structure,structure_dscr,dpr_val,cama_replacement_cost,accrued_depr_pct,...,heat_ar,gross_ar,eff_ar,base_ar,perimeter,pct,bld_adj,rcnld,size_index,lump_sum_adj
0,20720000014,A1,1,1001,101,R,Residential,483261,508696,0.95,...,2534,4264,2845,2534,452,1.0,1.53,315857.0,0.82,24449
1,21440000001,A1,1,1001,101,R,Residential,307658,415754,0.74,...,2537,2803,2523,2537,314,1.0,1.8,170921.0,0.82,15606
2,21440000003,B2,1,1002,102,R,Residential,405310,686966,0.59,...,3660,3800,3514,3660,372,1.0,1.8,225172.0,0.76,28337
3,21440000008,B2,1,1002,102,R,Residential,251911,381684,1.0,...,3056,3696,3104,3056,346,0.66,1.8,139951.0,0.79,18671
4,21480000001,B2,1,1002,102,R,Residential,214936,275559,0.78,...,1962,1962,1962,1962,238,1.0,1.8,119409.0,0.86,14672


In [6]:
building_res.query(
    "property_use_cd == 'A1' and impr_tp == 1001 and date_erected > 1900",
    inplace=True,
)

In [7]:
building_res.groupby("acct").agg(
    bld_num=("bld_num", "max"),
    date_erected=("date_erected", "min"),
    im_sq_ft=("im_sq_ft", "sum"),
    perimeter=("perimeter", "sum"),
    dpr_val=("dpr_val", "sum"),
    dscr=("dscr", "mean"),
)

Unnamed: 0_level_0,bld_num,date_erected,im_sq_ft,perimeter,dpr_val,dscr
acct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20720000014,1,2019,2534,452,483261,4.0
21440000001,1,2004,2537,314,307658,4.0
21480000002,1,1917,1496,186,9621,2.0
21650000007,1,2015,3387,578,587261,4.0
21650000011,1,2003,1508,210,172034,3.0
...,...,...,...,...,...,...
1938003804013,1,2003,2117,318,235142,3.0
1938003804014,1,2003,1611,184,205392,3.0
1953050320690,1,1979,2168,232,201010,3.0
1953080320060,1,1983,1723,216,181811,3.0


# Real Acct file

In [8]:
real_acct = load_tables_to_dataframe("real_acct.txt")
real_acct = real_acct[
    [
        "acct",
        "land_ar",
        "land_val",
        "bld_val",
        "assessed_val",
        "mail_addr_1",
        "mail_addr_2",
        "mail_city",
        "mail_state",
        "mail_zip",
    ]
]
real_acct.head()

Reading real_acct.txt into dataframe...


Unnamed: 0,acct,land_ar,land_val,bld_val,assessed_val,mail_addr_1,mail_addr_2,mail_city,mail_state,mail_zip
0,10010000013,44431,0.0,0.0,0.0,PO BOX 1562,,HOUSTON,TX,77251-1562
1,10020000001,5001,300060.0,10712.0,310772.0,1717 SAINT JAMES PLACE STE 112,,HOUSTON,TX,77056-3412
2,10020000003,18121,860406.0,34279.0,894685.0,2612 TODVILLE RD,,SEABROOK,TX,77586-3008
3,10020000004,9061,430203.0,17139.0,447342.0,3302 SUFFOLK DR,,HOUSTON,TX,77027-6326
4,10020000013,3001,0.0,0.0,0.0,1019 COMMERCE ST STE 200,,HOUSTON,TX,77002-1701


# Fixtures
There are other features on the 

In [9]:
fixtures = load_tables_to_dataframe("fixtures.txt")

Reading fixtures.txt into dataframe...


In [10]:
fixtures.query('acct==1074380000028')

Unnamed: 0,acct,bld_num,type,type_dscr,units
2931290,1074380000028,1,RMH,Room: Half Bath,1.0
2931291,1074380000028,1,RMB,Room: Bedroom,4.0
2931292,1074380000028,1,FXA,Fixtures: Addl,1.0
2931293,1074380000028,1,RMR,Room: Rec,1.0
2931294,1074380000028,1,RMF,Room: Full Bath,2.0
2931295,1074380000028,1,RMT,Room: Total,7.0
2931296,1074380000028,1,FPW,Fireplace: Masonry Firebrick,1.0
2931297,1074380000028,1,STY,Story Height Index,2.0
