In [26]:
#Import Libraries
import os

import arcpy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [27]:
# Geodatabase that holds the raw housing feature class; every bare dataset
# name used with arcpy below is resolved against this workspace.
SOURCE_GDB = r'G:\Project2_dataset.gdb\Project2_dataset.gdb'
arcpy.env.workspace = SOURCE_GDB

# Show which feature classes the workspace exposes.
arcpy.ListFeatureClasses()

['kc_house_data']

## EDA

In [3]:
# Column names available on the housing feature class.
field_names = [field.name for field in arcpy.ListFields('kc_house_data')]
field_names

['OBJECTID', 'Shape', 'id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15']

In [28]:
# Pair every column with its geodatabase field type.
field_types = [(field.name, field.type) for field in arcpy.ListFields('kc_house_data')]
field_types

[('OBJECTID', 'OID'), ('Shape', 'Geometry'), ('id', 'Double'), ('date', 'String'), ('price', 'Double'), ('bedrooms', 'Integer'), ('bathrooms', 'Double'), ('sqft_living', 'Integer'), ('sqft_lot', 'Integer'), ('floors', 'Double'), ('waterfront', 'Integer'), ('view', 'Integer'), ('condition', 'Integer'), ('grade', 'Integer'), ('sqft_above', 'Integer'), ('sqft_basement', 'Integer'), ('yr_built', 'Integer'), ('yr_renovated', 'Integer'), ('zipcode', 'Integer'), ('lat', 'Double'), ('long', 'Double'), ('sqft_living15', 'Integer'), ('sqft_lot15', 'Integer')]

## Spatial Autocorrelation Metrics

In [4]:
# Global Moran's I test for spatial autocorrelation of sale price.
# Neighbour influence is weighted by inverse Euclidean distance and the
# spatial weights are left unstandardized (Standardization = 'NONE').
arcpy.stats.SpatialAutocorrelation(Input_Feature_Class = 'kc_house_data', Input_Field = 'price',  
                                   Conceptualization_of_Spatial_Relationships = 'INVERSE_DISTANCE',
                                   Distance_Method = 'EUCLIDEAN_DISTANCE', Standardization = 'NONE') # Global Moran's I (this tool is not k-means)

## Association and Relationship Analysis

In [28]:
# Load every attribute row of the housing feature class into a DataFrame.
# The cursor is opened in a `with` block so it is closed, and its lock on the
# feature class released, as soon as the rows have been read.
with arcpy.da.SearchCursor('kc_house_data', '*') as cursor:
    df = pd.DataFrame(data = list(cursor),
                      columns = [f.name for f in arcpy.ListFields('kc_house_data')])

In [30]:
# R^2 (squared Pearson correlation) of each numeric attribute against price.
# - Only numeric columns are correlated: `Shape` holds coordinate tuples and
#   `date` holds strings, which makes a bare df.corr() raise on pandas >= 2.0.
# - Correlations are squared BEFORE sorting so that a strong negative
#   relationship ranks as high as an equally strong positive one.
price_r2 = df.select_dtypes(include='number').corr()[['price']] ** 2

# `price` against itself has R^2 == 1 and therefore sorts first; drop that row.
q2 = price_r2.sort_values(by='price', ascending=False).iloc[1:]
q2

Unnamed: 0,price
sqft_living,0.492853
grade,0.445468
sqft_above,0.366712
sqft_living15,0.342668
bathrooms,0.275769
view,0.157842
sqft_basement,0.104857
bedrooms,0.095079
lat,0.094251
waterfront,0.070953


In [None]:
# Heatmap of pairwise Pearson correlations between the numeric attributes.
# Non-numeric columns (`Shape`, `date`) are excluded explicitly because a bare
# df.corr() raises on them with pandas >= 2.0.
# seaborn / matplotlib are imported in the notebook's top import cell.
plt.figure(figsize=(12, 9))
sns.heatmap(df.select_dtypes(include='number').corr(), cmap='coolwarm', annot=True)
plt.title('Pairwise correlation of numeric housing attributes');

## Multicollinearity Assessment

In [6]:
#q3
# Pairwise correlations among the eight attributes that rank highest in `q2`.
# Large off-diagonal values flag predictors that carry overlapping information.
# Only numeric columns are correlated so the cell also runs on pandas >= 2.0.
top_predictors = list(q2.index[:8])
df.select_dtypes(include='number').corr().loc[top_predictors, top_predictors]

Unnamed: 0,sqft_living,grade,sqft_above,sqft_living15,bathrooms,view,sqft_basement,bedrooms
sqft_living,1.0,0.762704,0.876597,0.75642,0.754665,0.284611,0.435043,0.576671
grade,0.762704,1.0,0.755923,0.713202,0.664983,0.251321,0.168392,0.356967
sqft_above,0.876597,0.755923,1.0,0.73187,0.685342,0.167649,-0.051943,0.4776
sqft_living15,0.75642,0.713202,0.73187,1.0,0.568634,0.280439,0.200355,0.391638
bathrooms,0.754665,0.664983,0.685342,0.568634,1.0,0.187737,0.28377,0.515884
view,0.284611,0.251321,0.167649,0.280439,0.187737,1.0,0.276947,0.079532
sqft_basement,0.435043,0.168392,-0.051943,0.200355,0.28377,0.276947,1.0,0.303093
bedrooms,0.576671,0.356967,0.4776,0.391638,0.515884,0.079532,0.303093,1.0


In [10]:
# df.to_csv('G:\df.csv')

In [None]:
fig = plt.figure(figsize=(7,9))
ax = plt.axes(projection = '3d') #3D scatter plot

# Plot a random subset for readability, but keep `df` itself untouched:
# reassigning `df` here would silently shrink the dataset for every cell that
# runs afterwards. The seed makes the figure reproducible, and min() keeps the
# call valid if the frame has fewer than 2000 rows.
sampled_df = df.sample(n = min(2000, len(df)), random_state = 42)
ax.scatter(sampled_df['sqft_living'], sampled_df['sqft_above'], sampled_df['sqft_living15'])

ax.set_xlabel('sqft_living')
ax.set_ylabel('sqft_above')
ax.set_zlabel('sqft_living15')
ax.set_title('3D Scatter Housing Properties SQFT')
plt.show()

In [None]:
fig = plt.figure(figsize=(7,9))
ax = plt.axes(projection = '3d') #3D scatter plot

# Plot a random subset for readability, but keep `df` itself untouched:
# reassigning `df` here would silently shrink the dataset for every cell that
# runs afterwards. The seed makes the figure reproducible, and min() keeps the
# call valid if the frame has fewer than 2000 rows.
sampled_df = df.sample(n = min(2000, len(df)), random_state = 42)
ax.scatter(sampled_df['grade'], sampled_df['sqft_above'], sampled_df['sqft_living15'])

ax.set_xlabel('grade')
ax.set_ylabel('sqft_above')
ax.set_zlabel('sqft_living15')
ax.set_title('3D Scatter Housing Properties SQFT V. Grade')
plt.show()

## Q4 - Strategic Regional Analysis by high price cluster

## Generalized Linear Regression

In [8]:
# Fit a GLR of price on the eight highest-ranked attributes in `q2`, minus the
# ones listed in `glr_excluded` (presumably set aside because of the
# multicollinearity seen above -- confirm with the author).
glr_excluded = ['sqft_above', 'sqft_living15', 'grade', 'sqft_basement']
glr_predictors = [name for name in list(q2.index[:8]) if name not in glr_excluded]

arcpy.stats.GeneralizedLinearRegression(
    in_features="kc_house_data",
    dependent_variable="price",
    model_type="CONTINUOUS",
    output_features=r"G:\Proj_2\Proj_2.gdb\y_pred_glr",
    explanatory_variables=';'.join(glr_predictors)
)

## Geographically Weighted Regression

In [89]:
# Fit a fixed-kernel GWR of price on the same reduced predictor set used for
# the GLR: the eight highest-ranked attributes in `q2` minus `gwr_excluded`.
gwr_excluded = ['sqft_above', 'sqft_living15', 'grade', 'sqft_basement']
gwr_predictors = [name for name in list(q2.index[:8]) if name not in gwr_excluded]

arcpy.stats.GeographicallyWeightedRegression(
    in_features = 'kc_house_data',
    dependent_field = 'price',
    explanatory_field = ';'.join(gwr_predictors),
    out_featureclass = r"G:\Proj_2\Proj_2.gdb\y_pred_gwr",
    kernel_type = 'FIXED'
)

In [10]:
def r2_score(actual, predicted):
    """Return the coefficient of determination (R^2) of `predicted` vs `actual`.

    R^2 = 1 - RSS / TSS, where RSS is the sum of squared residuals and TSS is
    the sum of squared deviations of `actual` from its own mean.

    Parameters
    ----------
    actual, predicted : array-like of numbers
        Observed and predicted values (lists, NumPy arrays, pandas Series, ...).
        Values are compared by position, not by pandas index label.

    Returns
    -------
    float
        1.0 for a perfect fit, 0.0 when the prediction is no better than the
        mean of `actual`, negative when it is worse. NaN when R^2 is undefined
        (empty input, or `actual` has zero variance).
    """
    # Convert up front so plain Python lists work and Series are not
    # re-aligned on their index labels during the subtractions below.
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    if actual.size == 0:
        return float('nan')

    tss = np.sum((actual - actual.mean()) ** 2)
    if tss == 0:
        # Constant `actual`: the ratio RSS / TSS is undefined.
        return float('nan')

    rss = np.sum((actual - predicted) ** 2)
    return float(1 - (rss / tss))

In [17]:
# Read the GWR output feature class back into a DataFrame.
# The full path is used because the output lives in Proj_2.gdb, which is not
# the workspace configured at the top of the notebook; a bare name would only
# resolve if a layer of that name happens to be loaded in the session.
gwr_output = r"G:\Proj_2\Proj_2.gdb\y_pred_gwr"

# `with` closes the cursor (and releases its lock) once the rows are read.
with arcpy.da.SearchCursor(gwr_output, '*') as cursor:
    df_gwr = pd.DataFrame(data = list(cursor),
                          columns = [f.name for f in arcpy.ListFields(gwr_output)])

#R2 Score
print('R2 Score GWR:', round(r2_score(df_gwr['Observed'], df_gwr['Predicted']),3))


R2 Score GWR: 0.815


In [8]:
#Get Feature Class Data
# The full path is used because the GWR output lives in Proj_2.gdb, which is
# not the workspace configured at the top of the notebook.
gwr_output = r"G:\Proj_2\Proj_2.gdb\y_pred_gwr"

# `with` closes the cursor (and releases its lock) once the rows are read.
with arcpy.da.SearchCursor(gwr_output, '*') as cursor:
    result_table = pd.DataFrame(list(cursor),
                                columns = [f.name for f in arcpy.ListFields(gwr_output)])

# Count missing values per output column.
result_table.isnull().sum()

OBJECTID                0
Shape                   0
Observed                0
Cond                    0
LocalR2                 0
Predicted               0
Intercept               0
C1_sqft_living          0
C2_bathrooms            0
C3_view                 0
C4_bedrooms             0
Residual                0
StdError                0
StdErr_Intercept        0
StdErrC1_sqft_living    0
StdErrC2_bathrooms      0
StdErrC3_view           0
StdErrC4_bedrooms       0
StdResid                0
Source_ID               0
dtype: int64

In [9]:
# Preview the first three GWR result rows.
result_table.head(n=3)

Unnamed: 0,OBJECTID,Shape,Observed,Cond,LocalR2,Predicted,Intercept,C1_sqft_living,C2_bathrooms,C3_view,C4_bedrooms,Residual,StdError,StdErr_Intercept,StdErrC1_sqft_living,StdErrC2_bathrooms,StdErrC3_view,StdErrC4_bedrooms,StdResid,Source_ID
0,1,"(-122.25699999999995, 47.51120000000003)",221900.0,10.979895,0.611787,217250.86451,47356.384562,311.03327,-652.182177,82925.620228,-65490.865478,4649.13549,158302.19273,16998.367457,9.997236,9705.977684,5957.793017,6565.169753,0.029369,1
1,2,"(-122.31899999999996, 47.72100000000006)",538000.0,12.095258,0.437392,555445.441888,202377.905696,142.992397,612.765208,28443.068654,-5267.21545,-17445.441888,157926.417289,17624.188627,13.928461,10031.344803,14820.377358,7571.151716,-0.110466,2
2,3,"(-122.23299999999995, 47.73790000000008)",180000.0,14.905618,0.635184,230065.617481,127971.943352,218.663092,39534.327726,163117.419142,-52905.617244,-50065.617481,158017.221207,24360.184559,10.952633,11549.502041,6180.181093,7513.748494,-0.316836,3


## Random Forest Regression

In [37]:
#df.corr()[['price']].sort_values(by='price', ascending=False)

In [36]:
# Random-forest regression of price, predicting back onto the input features.
# (arcpy is already imported in the notebook's top import cell.)

# (field name, treat-as-categorical) for every explanatory variable. Both tool
# arguments below are generated from this single list so the variable list and
# the training/prediction field matching cannot drift apart.
rf_predictors = [
    ("bedrooms", False),
    ("bathrooms", False),
    ("sqft_living", False),
    ("view", False),
    ("grade", True),
    ("sqft_above", False),
    ("sqft_living15", False),
    ("sqft_basement", False),
    ("waterfront", True),
    ("floors", False),
]
# -> "bedrooms false;bathrooms false;...;floors false;"
rf_explanatory_variables = "".join(
    f"{name} {str(categorical).lower()};" for name, categorical in rf_predictors
)
# -> "bedrooms bedrooms;bathrooms bathrooms;...;floors floors;"
rf_variable_matching = "".join(f"{name} {name};" for name, _ in rf_predictors)

arcpy.stats.Forest(
    prediction_type="PREDICT_FEATURES",
    in_features="kc_house_data",
    variable_predict="price",
    treat_variable_as_categorical=None,
    explanatory_variables=rf_explanatory_variables,
    distance_features=None,
    explanatory_rasters=None,
    features_to_predict="kc_house_data",
    output_features=r"G:\Proj_2\Proj_2.gdb\y_pred_rf",
    output_raster=None,
    explanatory_variable_matching=rf_variable_matching,
    explanatory_distance_matching=None,
    explanatory_rasters_matching=None,
    output_trained_features=None,
    output_importance_table=r"G:\Proj_2\Proj_2.gdb\rf_var_importance",
    use_raster_values="TRUE",
    number_of_trees=100,
    minimum_leaf_size=None,
    maximum_depth=None,
    sample_size=100,
    random_variables=None,
    # NOTE(review): despite its name, the tool documentation appears to describe
    # this as the share of features held out for validation -- confirm.
    percentage_for_training=20,
    output_classification_table=None,
    output_validation_table=None,
    compensate_sparse_categories="FALSE",
    number_validation_runs=1,
    calculate_uncertainty="FALSE",
    output_trained_model=None
)

In [19]:
# Read the random-forest prediction output back into a DataFrame.
# The full path is used because the output lives in Proj_2.gdb, which is not
# the workspace configured at the top of the notebook.
rf_output = r"G:\Proj_2\Proj_2.gdb\y_pred_rf"

# `with` closes the cursor (and releases its lock) once the rows are read.
with arcpy.da.SearchCursor(rf_output, '*') as cursor:
    df_rf = pd.DataFrame(data = list(cursor),
                         columns = [f.name for f in arcpy.ListFields(rf_output)])
df_rf.head(2)

Unnamed: 0,OBJECTID,Shape,SOURCE_ID,BEDROOMS,BATHROOMS,SQFT_LIVING,VIEW,PREDICTED
0,1,"(-122.25699999999995, 47.51120000000003)",1,3,1.0,1180,0,374207.8191
1,2,"(-122.31899999999996, 47.72100000000006)",2,3,2.25,2570,0,568534.043096


## Adjust Test Size

In [35]:
# Same random-forest regression as the baseline run, with
# percentage_for_training lowered from 20 to 10.
# (arcpy is already imported in the notebook's top import cell.)

# (field name, treat-as-categorical) for every explanatory variable. Both tool
# arguments below are generated from this single list so the variable list and
# the training/prediction field matching cannot drift apart.
rf_predictors = [
    ("bedrooms", False),
    ("bathrooms", False),
    ("sqft_living", False),
    ("view", False),
    ("grade", True),
    ("sqft_above", False),
    ("sqft_living15", False),
    ("sqft_basement", False),
    ("waterfront", True),
    ("floors", False),
]
# -> "bedrooms false;bathrooms false;...;floors false;"
rf_explanatory_variables = "".join(
    f"{name} {str(categorical).lower()};" for name, categorical in rf_predictors
)
# -> "bedrooms bedrooms;bathrooms bathrooms;...;floors floors;"
rf_variable_matching = "".join(f"{name} {name};" for name, _ in rf_predictors)

arcpy.stats.Forest(
    prediction_type="PREDICT_FEATURES",
    in_features="kc_house_data",
    variable_predict="price",
    treat_variable_as_categorical=None,
    explanatory_variables=rf_explanatory_variables,
    distance_features=None,
    explanatory_rasters=None,
    features_to_predict="kc_house_data",
    output_features=r"G:\Proj_2\Proj_2.gdb\y_pred_rf_10_TEST",
    output_raster=None,
    explanatory_variable_matching=rf_variable_matching,
    explanatory_distance_matching=None,
    explanatory_rasters_matching=None,
    output_trained_features=None,
    # Own importance table per run: reusing the baseline run's table path would
    # either fail because it already exists or overwrite the baseline result.
    output_importance_table=r"G:\Proj_2\Proj_2.gdb\rf_var_importance_10_TEST",
    use_raster_values="TRUE",
    number_of_trees=100,
    minimum_leaf_size=None,
    maximum_depth=None,
    sample_size=100,
    random_variables=None,
    # NOTE(review): despite its name, the tool documentation appears to describe
    # this as the share of features held out for validation -- confirm.
    percentage_for_training=10,
    output_classification_table=None,
    output_validation_table=None,
    compensate_sparse_categories="FALSE",
    number_validation_runs=1,
    calculate_uncertainty="FALSE",
    output_trained_model=None
)

## Generalization Via Pruning

In [11]:
# Random-forest regression constrained by a maximum tree depth and a minimum
# leaf size; both limits are set here and passed to the tool below.
max_depth = 20
min_leaf = 40

# (field name, treat-as-categorical) for every explanatory variable. Both tool
# arguments below are generated from this single list so the variable list and
# the training/prediction field matching cannot drift apart.
rf_predictors = [
    ("bedrooms", False),
    ("bathrooms", False),
    ("sqft_living", False),
    ("view", False),
    ("grade", True),
    ("sqft_above", False),
    ("sqft_living15", False),
    ("sqft_basement", False),
    ("waterfront", True),
    ("floors", False),
]
# -> "bedrooms false;bathrooms false;...;floors false;"
rf_explanatory_variables = "".join(
    f"{name} {str(categorical).lower()};" for name, categorical in rf_predictors
)
# -> "bedrooms bedrooms;bathrooms bathrooms;...;floors floors;"
rf_variable_matching = "".join(f"{name} {name};" for name, _ in rf_predictors)

arcpy.stats.Forest(
    prediction_type="PREDICT_FEATURES",
    in_features="kc_house_data",
    variable_predict="price",
    treat_variable_as_categorical=None,
    explanatory_variables=rf_explanatory_variables,
    distance_features=None,
    explanatory_rasters=None,
    features_to_predict="kc_house_data",
    # Raw f-string: the Windows path contains backslash sequences (\P, \y) that
    # are invalid escapes in a normal string literal.
    output_features=rf"G:\Proj_2\Proj_2.gdb\y_pred_rf_depth_{max_depth}",
    output_raster=None,
    explanatory_variable_matching=rf_variable_matching,
    explanatory_distance_matching=None,
    explanatory_rasters_matching=None,
    output_trained_features=None,
    # Own importance table per run: reusing the baseline run's table path would
    # either fail because it already exists or overwrite the baseline result.
    output_importance_table=rf"G:\Proj_2\Proj_2.gdb\rf_var_importance_depth_{max_depth}",
    use_raster_values="TRUE",
    number_of_trees=100,
    minimum_leaf_size=min_leaf,
    maximum_depth=max_depth,
    sample_size=100,
    random_variables=None,
    # NOTE(review): despite its name, the tool documentation appears to describe
    # this as the share of features held out for validation -- confirm.
    percentage_for_training=20,
    output_classification_table=None,
    output_validation_table=None,
    compensate_sparse_categories="FALSE",
    number_validation_runs=1,
    calculate_uncertainty="FALSE",
    output_trained_model=None
)

In [None]:
# Read random-forest predictions back into a DataFrame.
# NOTE(review): this re-reads the baseline output `y_pred_rf`, not the
# depth-limited output written by the cell above -- confirm which was intended.
# The full path is used because the output lives in Proj_2.gdb, which is not
# the workspace configured at the top of the notebook.
rf_output = r"G:\Proj_2\Proj_2.gdb\y_pred_rf"

# `with` closes the cursor (and releases its lock) once the rows are read.
with arcpy.da.SearchCursor(rf_output, '*') as cursor:
    df_rf = pd.DataFrame(data = list(cursor),
                         columns = [f.name for f in arcpy.ListFields(rf_output)])
df_rf.head(2)

### Cluster Association Analysis (High-Price)

In [1]:
# Cluster and outlier analysis of sale price (Anselin Local Moran's I), using
# inverse-distance weights with row standardization and 499 permutations.
lisa_settings = dict(
    Input_Feature_Class="kc_house_data",
    Input_Field="price",
    Output_Feature_Class=r"G:\Proj_2\Proj_2.gdb\kc_house_data_ClustersOutliers",
    Conceptualization_of_Spatial_Relationships="INVERSE_DISTANCE",
    Distance_Method="EUCLIDEAN_DISTANCE",
    Standardization="ROW",
    Distance_Band_or_Threshold_Distance=None,
    Weights_Matrix_File=None,
    Apply_False_Discovery_Rate__FDR__Correction="NO_FDR",
    Number_of_Permutations=499,
    number_of_neighbors=None,
)
arcpy.stats.ClustersOutliers(**lisa_settings)

In [None]:
# Export features from the `kc_house_data` layer into a new feature class
# named `high_price_clusters`, carrying over every attribute field listed in
# `field_mapping` with its original name and type.
# NOTE(review): `where_clause` is empty and the input is the full
# `kc_house_data` layer, so nothing in this call restricts the export to
# high-price clusters. Which rows are written presumably depends on a selection
# that was active on the layer when the cell ran -- confirm, and consider
# encoding that selection in code so the cell is reproducible.
arcpy.conversion.ExportFeatures(
    in_features="kc_house_data",
    out_features=r"G:\Proj_2\Proj_2.gdb\high_price_clusters",
    where_clause="",
    use_field_alias_as_name="NOT_USE_ALIAS",
    field_mapping='id "id" true true false 8 Double 0 0,First,#,kc_house_data,id,-1,-1;date "date" true true false 8000 Text 0 0,First,#,kc_house_data,date,0,8000;price "price" true true false 8 Double 0 0,First,#,kc_house_data,price,-1,-1;bedrooms "bedrooms" true true false 4 Long 0 0,First,#,kc_house_data,bedrooms,-1,-1;bathrooms "bathrooms" true true false 8 Double 0 0,First,#,kc_house_data,bathrooms,-1,-1;sqft_living "sqft_living" true true false 4 Long 0 0,First,#,kc_house_data,sqft_living,-1,-1;sqft_lot "sqft_lot" true true false 4 Long 0 0,First,#,kc_house_data,sqft_lot,-1,-1;floors "floors" true true false 8 Double 0 0,First,#,kc_house_data,floors,-1,-1;waterfront "waterfront" true true false 4 Long 0 0,First,#,kc_house_data,waterfront,-1,-1;view "view" true true false 4 Long 0 0,First,#,kc_house_data,view,-1,-1;condition "condition" true true false 4 Long 0 0,First,#,kc_house_data,condition,-1,-1;grade "grade" true true false 4 Long 0 0,First,#,kc_house_data,grade,-1,-1;sqft_above "sqft_above" true true false 4 Long 0 0,First,#,kc_house_data,sqft_above,-1,-1;sqft_basement "sqft_basement" true true false 4 Long 0 0,First,#,kc_house_data,sqft_basement,-1,-1;yr_built "yr_built" true true false 4 Long 0 0,First,#,kc_house_data,yr_built,-1,-1;yr_renovated "yr_renovated" true true false 4 Long 0 0,First,#,kc_house_data,yr_renovated,-1,-1;zipcode "zipcode" true true false 4 Long 0 0,First,#,kc_house_data,zipcode,-1,-1;lat "lat" true true false 8 Double 0 0,First,#,kc_house_data,lat,-1,-1;long "long" true true false 8 Double 0 0,First,#,kc_house_data,long,-1,-1;sqft_living15 "sqft_living15" true true false 4 Long 0 0,First,#,kc_house_data,sqft_living15,-1,-1;sqft_lot15 "sqft_lot15" true true false 4 Long 0 0,First,#,kc_house_data,sqft_lot15,-1,-1',
    sort_field=None
)

In [31]:
#Initialize DF
# Switch the workspace to the project geodatabase that holds the exported
# `high_price_clusters` feature class.
arcpy.env.workspace = r'G:\Proj_2\Proj_2.gdb'

# Load every row into a DataFrame; `with` closes the cursor (and releases its
# lock on the feature class) once the rows are read.
with arcpy.da.SearchCursor('high_price_clusters', '*') as cursor:
    df_high = pd.DataFrame(data = list(cursor),
                           columns = [f.name for f in arcpy.ListFields('high_price_clusters')])
df_high.head(2)

Unnamed: 0,OBJECTID,Shape,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,1,"(-122.23299999999995, 47.531600000000026)",2524049000.0,20140826T000000,2000000.0,3,2.75,3050,44867,1.0,0,4,3,9,2330,720,1968,0,98040,47.5316,-122.233,4110,20336
1,2,"(-122.19199999999995, 47.58150000000006)",3394100000.0,20140909T000000,975000.0,4,2.5,2720,11049,2.0,0,0,3,10,2720,0,1989,0,98004,47.5815,-122.192,2750,11049


In [32]:
#Coefficient Determination
# R^2 (squared Pearson correlation) of each numeric attribute against price,
# computed on the exported `high_price_clusters` rows only.
# - Only numeric columns are correlated: `Shape` holds coordinate tuples and
#   `date` holds strings, which makes a bare DataFrame.corr() raise on
#   pandas >= 2.0.
# - Correlations are squared BEFORE sorting so that a strong negative
#   relationship ranks as high as an equally strong positive one.
price_r2_high = df_high.select_dtypes(include='number').corr()[['price']] ** 2

# `price` against itself has R^2 == 1 and therefore sorts first; drop that row.
r2_high = price_r2_high.sort_values(by='price', ascending=False).iloc[1:]
r2_high

Unnamed: 0,price
sqft_living,0.659822
sqft_above,0.555179
grade,0.46304
bathrooms,0.452633
sqft_living15,0.30105
view,0.184625
bedrooms,0.141871
waterfront,0.140096
sqft_basement,0.104526
yr_built,0.098913
