## This is a simple example of a Machine Learning program

In [114]:
import pandas as pd
import folium as folium
from folium.plugins import HeatMap
from sklearn.tree import DecisionTreeRegressor

## Read in the data file

In [115]:
# Location of the Melbourne housing CSV that the rest of the notebook works from.
# NOTE(review): this is an absolute, machine-specific path; point it at your own
# copy of melb_data.csv before running the notebook elsewhere.
MELB_DATA_PATH = r'C:\Users\emack\Documents\Sandbox\melb_data.csv'
df = pd.read_csv(MELB_DATA_PATH)

## Print out the first 5 rows to get a quick look at the data format

In [116]:
# Show the first five rows (the default for head) to check the column layout.
df.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


# Initial thoughts about building the model
From the rows above, it looks like we have a 'Price' column that will be a useful target for our prediction. The features that might be interesting to model are the number of rooms, as shown in the 'Rooms' column, and the 'Suburb'. The 'Week' column, indicating the number of weeks on the market, might also be a useful feature. There is no data manual indicating what the Distance and Distance2 columns are, but I would guess they are related to schools. I am not sure, so I don't think I will use them in building my model at this time.

# Get a count of rows and columns

In [32]:
# df.shape is a (rows, columns) tuple; unpack it so the message reads clearly.
shape = df.shape
n_rows, n_cols = shape
print('The number of rows is ', n_rows, 'and the number of columns is ', n_cols)

The number of rows is  13580 and the number of columns is  21


In [68]:
# List the dtype pandas assigned to each column, to see which are numeric
# (int64 / float64) and which are generic object columns.
df.dtypes

Suburb            object
Address           object
Rooms              int64
Type              object
Price            float64
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object

## Check for NaN's

In [122]:
# Count the missing values in every column (isna is the same check as isnull).
df.isna().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

#### It looks like Car, BuildingArea, YearBuilt and CouncilArea are string data types which pandas reads as NaNs, so I will figure out whether any of those columns would be worth converting to float for use in my model. At first glance I think the BuildingArea column would be worth using in my model, so I will convert it.

# Select my Prediction Target
### By convention the prediction target is called y. Since I already decided I want the price of the houses to be the prediction target I will make that my 'y'

In [123]:
# The sale price is the prediction target; pull that single column out as y.
y = df.loc[:, 'Price']

# Now let's select my features for building the prediction model.
### By convention, this data is called X. I build it by passing a list of column names to the dataframe.

In [124]:
# Columns used as model inputs (spelled exactly as they appear in the CSV header).
model_features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'Longtitude',
]

# Feature matrix: every row of df, restricted to the columns listed above.
X = df.loc[:, model_features]

# Let's get a quick look at some summary statistics on the features for my new model

In [125]:
# Summary statistics (count, mean, std, min, quartiles, max) for each selected feature.
X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,13580.0,13580.0,13580.0,13580.0,13580.0
mean,2.937997,1.534242,558.416127,-37.809203,144.995216
std,0.955748,0.691712,3990.669241,0.07926,0.103916
min,1.0,0.0,0.0,-38.18255,144.43181
25%,2.0,1.0,177.0,-37.856822,144.9296
50%,3.0,1.0,440.0,-37.802355,145.0001
75%,3.0,2.0,651.0,-37.7564,145.058305
max,10.0,8.0,433014.0,-37.40853,145.52635


In [126]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
0,2,1.0,202.0,-37.7996,144.9984
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
3,3,2.0,94.0,-37.7969,144.9969
4,4,1.0,120.0,-37.8072,144.9941


# Visualize the locations of the houses we are modeling 

In [127]:
# Coordinates the base map is centred on.
base_coords = (-37.8136, 144.9631)

m = folium.Map(location=base_coords, zoom_start=9)

# Build the [latitude, longitude] pairs for the heatmap layer.
# `data` is created here rather than appended to, so the cell works on a fresh
# kernel and re-running it does not accumulate duplicate points. Selecting the
# two columns at once also avoids a slow row-by-row iterrows() loop.
data = df[['Lattitude', 'Longtitude']].values.tolist()

HeatMap(data, radius=12).add_to(m)

# Leave the map as the last expression so the notebook renders it.
m

# Now let's actually build my prediction model

In [128]:
# Define model. Specify a number for random_state to ensure same results each run
housing_price_model = DecisionTreeRegressor(random_state=1)

# Fit model using the X and y criteria I defined earlier
housing_price_model.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

# Print the top 5 predictions and see what they look like.

In [130]:
# Take the first five rows of the feature matrix once and reuse them below.
# These rows were part of the data the model was fitted on.
sample_houses = X.head()

print("Making predictions for the following 5 houses:")
print(sample_houses)
print("The predictions are")
print(housing_price_model.predict(sample_houses))



Making predictions for the following 5 houses:
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
0      2       1.0     202.0   -37.7996    144.9984
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
3      3       2.0      94.0   -37.7969    144.9969
4      4       1.0     120.0   -37.8072    144.9941
The predictions are
[1480000. 1035000. 1465000.  850000. 1600000.]


# Let's map where those predictions are.