# Harris County Home Price Estimations

In [None]:
import sqlite3

import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV


In [None]:
# Open the SQLite database file; the same connection is reused by later queries.
con = sqlite3.connect('HouseProtestValues.db')

# Building rows from building_res, filtered on impr_tp = 1001,
# property_use_cd = 'A1' and date_erected > 10, left-joined to real_acct on
# acct to pull in the land area, school district and value columns.
sql_query = '''SELECT  br.acct,
                       br.bld_num,
                       br.dscr,
                       br.date_erected,
                       br.yr_remodel,
                       br.im_sq_ft,
                       br.gross_ar,
                       br.base_ar,
                       ra.land_ar,
                       br.perimeter,
                       br.size_index,
                       ra.school_dist,
                       ra.land_val,
                       ra.bld_val,
                       ra.assessed_val,
                       ra.tot_appr_val,
                       ra.tot_mkt_val
            FROM building_res as br
            LEFT JOIN real_acct as ra ON br.acct = ra.acct
            WHERE br.impr_tp = 1001 AND br.property_use_cd = 'A1' AND br.date_erected > 10;'''

building_data = pd.read_sql_query(sql=sql_query, con=con)

# Convert acct, dscr and school_dist to the pandas 'category' dtype.
category_columns = ['acct', 'dscr', 'school_dist']
building_data[category_columns] = building_data[category_columns].astype('category')

In [None]:
# Fixture type codes selected from the "fixtures" table:
#   STY - Story Height Index
#   RMB - Room: Bedroom
#   RMF - Room: Full Bath
#   RMH - Room: Half Bath
#   RMT - Room: Total
fixtures_sql = """SELECT *
                  FROM "fixtures"
                  WHERE type IN ('STY', 'RMB','RMF','RMH','RMT')
                """
fixtures = pd.read_sql_query(sql=fixtures_sql, con=con)

# Pivot to one row per (acct, bld_num) with one column per fixture type,
# summing `units`; missing cells are then filled with 0.
fix_pt = (
    fixtures
    .pivot_table(index=['acct', 'bld_num'], columns='type', values='units', aggfunc='sum')
    .reset_index()
    .fillna(0)
)

In [None]:
# Left-join the pivoted fixture counts onto the building rows, then discard
# every row that still contains a missing value.
all_df = building_data.merge(fix_pt, how='left', on=['acct', 'bld_num']).dropna()

In [None]:
# Report the (rows, columns) shape of all_df.
print(f"All Data{all_df.shape}")

In [None]:
# Target series: the assessed_val column of all_df.
y = all_df['assessed_val']

# Columns used as numeric features without further encoding.
x_continuous = [
    'date_erected',
    'yr_remodel',
    'im_sq_ft',
    'gross_ar',
    'base_ar',
    'land_ar',
    'perimeter',
    'size_index',
    'RMB',
    'RMF',
    'RMH',
    'RMT',
    'STY',
]

# Columns that are one-hot encoded before modelling.
x_categorical = ['dscr', 'school_dist']

# Show the shapes of the categorical slice, the continuous slice and the target.
print(f"x_categorical{all_df[x_categorical].shape} | x_continuous {all_df[x_continuous].shape} | y {y.shape}")

In [None]:
# Preview the first rows of the categorical feature columns.
all_df[x_categorical].head()

In [None]:
# initialize the OneHotEncoder
encoder = OneHotEncoder(drop='first')

#Fit and transform the categorical columns
encoded = encoder.fit_transform(all_df[x_categorical])

df_results = pd.DataFrame.sparse.from_spmatrix(encoded)
df_results.columns = encoder.get_feature_names_out(x_categorical)
df_results.shape

In [None]:
# Print the column/dtype/memory summary of the one-hot encoded frame.
df_results.info()

In [None]:
# Free up memory
building_data = None
fix_pt = None
fixtures = None


In [None]:
# Give both frames a fresh 0..n-1 index so the column-wise concat below
# pairs rows by position.
all_df = all_df.reset_index(drop=True)
df_results = df_results.reset_index(drop=True)

# Place the continuous columns and the one-hot columns side by side.
continuous_part = all_df[x_continuous]
encoded_features = pd.concat([continuous_part, df_results], axis=1, join='inner')

In [None]:
# Display the (rows, columns) shape of the combined feature matrix.
encoded_features.shape

In [None]:
# sns.histplot(data=all_df, x="assessed_val")

In [None]:
# sns.histplot(data=all_df, x="assessed_val", log_scale=True)

In [None]:
# sns.pairplot(data=all_df, vars=['assessed_val', 'date_erected', 'im_sq_ft', 'school_dist', 'Neighborhood_Code'])

In [None]:
# sns.pairplot(data=all_df, vars=['assessed_val', 'land_ar', 'RMB', 'RMF', 'RMH', 'STY'])

In [None]:
# Sanity check: print the shapes of the feature matrix, the source frame,
# the one-hot frame and the target side by side.
print(f"Encoded features{encoded_features.shape} | All Data{all_df.shape} | Results {df_results.shape} | y {y.shape} ")

In [None]:
# Split features and target into train and test sets: test_size=0.4 holds out
# 40% for testing, and random_state=42 fixes the shuffle seed.
x_train, x_test, y_train, y_test = train_test_split(encoded_features, y, test_size=0.4, random_state=42)

In [None]:
# Extra-trees regressor with 100 estimators and a fixed random seed.
model = ExtraTreesRegressor(n_estimators=100, random_state=42)

In [None]:
# Fit the regressor on the training split and keep the return value of fit().
model_fit = model.fit(x_train, y_train)

In [None]:
# Mean 5-fold cross-validated R^2 on the training split.
# 'accuracy' is a classification metric and cannot score the continuous
# assessed_val target, so a regression metric is requested instead.
cross_val_score(model_fit, x_train, y_train, scoring='r2', cv=5, n_jobs=-1).mean()

In [None]:
# Predict on the held-out test split and report the mean squared error
# between the true and predicted targets.
y_pred = model.predict(x_test)
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred)}")

In [None]:
# Hyper-parameter grid searched for the ExtraTreesRegressor.
# 'gini' and 'entropy' are classifier split criteria and are rejected by the
# regressor, so the grid lists regression criteria instead.
param_grid = {
    'n_estimators': [100, 250, 500],
    'min_samples_leaf': [5, 15, 25],
    'max_features': [10, 20, 30],
    'criterion': ['squared_error', 'friedman_mse'],
}

In [None]:
# Exhaustive search over param_grid with 3-fold cross-validation, using all
# available cores. No `scoring` argument is passed here.
etc2 = GridSearchCV(model_fit, param_grid,cv=3, n_jobs=-1)

In [None]:
# Run the grid search on the training split.
etc2.fit(x_train,y_train)

In [None]:
# Display the best parameter combination found by the search.
etc2.best_params_

In [None]:
# Display the search's best_score_ for that combination.
etc2.best_score_