In [13]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [14]:
# data preparation, enter in our csv file, or json if we go that route
real_estate_df = pd.read_csv(Path('/Volumes/EXTRA/DU_Projects/Final Project/Real_Estate_Final_Project/Resources/Resources/Clean_Real_Estate.csv'))
real_estate_df.head()

Unnamed: 0,price,bed,bath,acre_lot,full_address,city,state,zip_code,house_size,TJs_store
0,180000.0,2.0,1.0,0.34,"23 Moore St, Agawam, MA, 01001",Agawam,Massachusetts,1001,676.0,Yes
1,239900.0,3.0,1.0,0.46,"270 South St, Agawam, MA, 01001",Agawam,Massachusetts,1001,1196.0,Yes
2,525000.0,3.0,3.0,0.45,"955 River Rd, Agawam, MA, 01001",Agawam,Massachusetts,1001,2314.0,Yes
3,289900.0,3.0,2.0,0.36,"82 Harvey Johnson Dr, Agawam, MA, 01001",Agawam,Massachusetts,1001,1276.0,Yes
4,275000.0,4.0,2.0,0.11,"6-8 King Ave, Agawam, MA, 01001",Agawam,Massachusetts,1001,1732.0,Yes


In [17]:
#Add Walmart and Trader Joes Data as dataframe
trader_joes_df = pd.read_csv(Path('/Volumes/EXTRA/DU_Projects/Final Project/Real_Estate_Final_Project/Resources/Resources/Clean_TJs.csv'))
walmart_df = pd.read_csv(Path('/Volumes/EXTRA/DU_Projects/Final Project/Real_Estate_Final_Project/Resources/Resources/Clean_Walmart.csv'))

In [18]:
# Find whether the zip code of the address has a Trader Joe's 

# Insert TJs binary column in df
real_estate_df["TJs_store"] = ""

# Create empty list for TJ stores
zip_code_matching = []

# For loop to check matching TJs zip code
for zip_code in range(len(real_estate_df)):
    
    if zip_code in trader_joes_df['zip']:
        zip_code_matching.append('Yes')
    else:
        zip_code_matching.append('No')
       
real_estate_df["TJs_store"] = zip_code_matching

In [19]:
real_estate_df["TJs_store"] = real_estate_df["TJs_store"].map({'Yes': 1, 'No': 0})
real_estate_df.head()

Unnamed: 0,price,bed,bath,acre_lot,full_address,city,state,zip_code,house_size,TJs_store
0,180000.0,2.0,1.0,0.34,"23 Moore St, Agawam, MA, 01001",Agawam,Massachusetts,1001,676.0,1
1,239900.0,3.0,1.0,0.46,"270 South St, Agawam, MA, 01001",Agawam,Massachusetts,1001,1196.0,1
2,525000.0,3.0,3.0,0.45,"955 River Rd, Agawam, MA, 01001",Agawam,Massachusetts,1001,2314.0,1
3,289900.0,3.0,2.0,0.36,"82 Harvey Johnson Dr, Agawam, MA, 01001",Agawam,Massachusetts,1001,1276.0,1
4,275000.0,4.0,2.0,0.11,"6-8 King Ave, Agawam, MA, 01001",Agawam,Massachusetts,1001,1732.0,1


In [20]:
# Find whether the zip code of the address has a Walmart

# Insert TJs binary column in df
real_estate_df["Walmart_store"] = ""

# Create empty list for TJ stores
walmart_zip_code_matching = []

# For loop to check matching TJs zip code
for zip_code in range(len(real_estate_df)):
    
    if zip_code in walmart_df['zip_code']:
        walmart_zip_code_matching.append('Yes')
    else:
        walmart_zip_code_matching.append('No')
       
real_estate_df["Walmart_store"] = walmart_zip_code_matching

real_estate_df["Walmart_store"].value_counts()

No     363147
Yes        98
Name: Walmart_store, dtype: int64

In [21]:
real_estate_df["Walmart_store"] = real_estate_df["Walmart_store"].map({'Yes': 1, 'No': 0})
real_estate_df.head()

Unnamed: 0,price,bed,bath,acre_lot,full_address,city,state,zip_code,house_size,TJs_store,Walmart_store
0,180000.0,2.0,1.0,0.34,"23 Moore St, Agawam, MA, 01001",Agawam,Massachusetts,1001,676.0,1,1
1,239900.0,3.0,1.0,0.46,"270 South St, Agawam, MA, 01001",Agawam,Massachusetts,1001,1196.0,1,1
2,525000.0,3.0,3.0,0.45,"955 River Rd, Agawam, MA, 01001",Agawam,Massachusetts,1001,2314.0,1,1
3,289900.0,3.0,2.0,0.36,"82 Harvey Johnson Dr, Agawam, MA, 01001",Agawam,Massachusetts,1001,1276.0,1,1
4,275000.0,4.0,2.0,0.11,"6-8 King Ave, Agawam, MA, 01001",Agawam,Massachusetts,1001,1732.0,1,1


In [22]:
real_estate_df.corr()

Unnamed: 0,price,bed,bath,acre_lot,zip_code,house_size,TJs_store,Walmart_store
price,1.0,0.129162,0.363124,-0.004872,-0.006481,0.222116,-0.002364,-0.004444
bed,0.129162,1.0,0.46375,-0.007774,-0.03753,0.40893,0.000304,-0.001116
bath,0.363124,0.46375,1.0,-0.005246,0.007152,0.470918,-0.000374,-5.5e-05
acre_lot,-0.004872,-0.007774,-0.005246,1.0,0.006531,-0.003931,-0.000119,-0.000193
zip_code,-0.006481,-0.03753,0.007152,0.006531,1.0,-0.052215,-0.01159,-0.020318
house_size,0.222116,0.40893,0.470918,-0.003931,-0.052215,1.0,0.000123,-5.6e-05
TJs_store,-0.002364,0.000304,-0.000374,-0.000119,-0.01159,0.000123,1.0,0.571377
Walmart_store,-0.004444,-0.001116,-5.5e-05,-0.000193,-0.020318,-5.6e-05,0.571377,1.0


Begin the machine learning model with Extra Trees Regressor--Using the binary values for the Trader Joes and Walmart as features

In [23]:
#Define the values to be used in the model
real_estate_model=real_estate_df[["price","bed","bath","house_size","TJs_store","Walmart_store"]]

In [24]:
#define X
import numpy as np
X=real_estate_df[["bed","bath","acre_lot","house_size","TJs_store","Walmart_store"]]
X


Unnamed: 0,bed,bath,acre_lot,house_size,TJs_store,Walmart_store
0,2.0,1.0,0.34,676.0,1,1
1,3.0,1.0,0.46,1196.0,1,1
2,3.0,3.0,0.45,2314.0,1,1
3,3.0,2.0,0.36,1276.0,1,1
4,4.0,2.0,0.11,1732.0,1,1
...,...,...,...,...,...,...
363240,5.0,5.0,1.60,4522.0,0,0
363241,1.0,2.0,0.99,1052.0,0,0
363242,4.0,2.0,0.40,1650.0,0,0
363243,4.0,2.0,0.18,2123.0,0,0


In [25]:
#Define y predictor variable
y=real_estate_df['price']

In [26]:
#split data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [27]:
#define the model
real_estate_model=ExtraTreesRegressor(n_estimators=128, random_state=78)

In [28]:
#Fit the training data to the model
real_estate_model=real_estate_model.fit(X_train, y_train)

In [29]:
#get the accuracy score of the model
real_estate_model.score(X_test, y_test)

0.9512652958754401

In [30]:
# Calculate predictions

y_pred = real_estate_model.predict(X_test)
y_pred

array([799000., 679900., 305960., ..., 204900., 705700., 699900.])

In [34]:
import joblib

joblib.dump(real_estate_model, 'real_estate.pkl')

['real_estate.pkl']

In [None]:
#print price pediction based on first set of features
print(y_pred[0])

369000.0


In [None]:
pd.DataFrame({"features":X_train.columns,"importances":real_estate_model.feature_importances_}).sort_values(by="importances",ascending=False)

Unnamed: 0,features,importances
3,house_size,0.5357059
2,acre_lot,0.2963613
1,bath,0.1265145
0,bed,0.04140142
5,Walmart_store,1.610049e-05
4,TJs_store,7.262041e-07


Prediction Function to be used on HTML

In [None]:

# prediction function
def ValuePredictor(to_predict_list):
    to_predict = np.array(to_predict_list).reshape(1, 12)
    loaded_model = pickle.load(open("model.pkl", "rb"))
    result = loaded_model.predict(to_predict)
    return result[0]
 
@app.route('/result', methods = ['POST'])
def result():
    if request.method == 'POST':
        to_predict_list = request.form.to_dict()
        to_predict_list = list(to_predict_list.values())
        to_predict_list = list(map(int, to_predict_list))
        result = ValuePredictor(to_predict_list)       
        if int(result)== 1:
            prediction ='Income more than 50K'
        else:
            prediction ='Income less that 50K'           
        return render_template("result.html", prediction = prediction)