In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

%matplotlib inline


In [3]:
# Load house_prices.csv into a DataFrame, then display its (rows, columns)
csv_path = 'house_prices.csv'
df = pd.read_csv(csv_path)
df.shape

(10424, 255)

In [9]:
#Let's get the median listing price columns
# Everything from column position 6 onward is treated as a price column
# (assumes the first six columns are region metadata -- TODO confirm against
# the CSV header).
# A later cell appends derived columns to `df`. Without the filter below,
# re-running this cell after that one would sweep those derived columns into
# `prices` and silently corrupt the mean/variance computed from it.
derived_columns = ['average_price', 'price_variance', 'size']
trailing_columns = df.columns[6:]
# Boolean-mask the Index so the original column order is preserved.
prices = trailing_columns[~trailing_columns.isin(derived_columns)]

In [10]:
# Per-region summary statistics over the price columns. The nan-aware numpy
# reducers are applied row-wise (axis=1), so missing values in a row are
# skipped rather than propagated.
for column, reducer in (('average_price', np.nanmean),
                        ('price_variance', np.nanvar)):
    df[column] = reducer(df.loc[:, prices], axis=1)

# 'size' is the row count minus SizeRank (presumably SizeRank 1 is the
# largest region, so a bigger value means a bigger region -- TODO confirm).
df['size'] = len(df) - df['SizeRank']

### Take a look at the data

In [15]:
# Ten regions with the highest average price, highest first
df.loc[:, ['RegionName', 'State', 'average_price']].nlargest(n=10, columns='average_price')

Unnamed: 0,RegionName,State,average_price
4430,Atherton,CA,4998383000.0
3349,Hillsborough,CA,4668615000.0
9805,Jupiter Island,FL,4508647000.0
5941,Portola Valley,CA,3574151000.0
4242,Los Altos Hills,CA,2504035000.0
9364,Water Mill,NY,2491381000.0
2915,Malibu,CA,2051730000.0
7837,Belvedere,CA,1890704000.0
5243,Montecito,CA,1755322000.0
3770,Woodside,CA,1538833000.0


In [16]:
# Ten regions with the lowest average price, lowest first
df.loc[:, ['RegionName', 'State', 'average_price']].nsmallest(n=10, columns='average_price')

Unnamed: 0,RegionName,State,average_price
2452,Raymondville,TX,58205.414851
6313,Hearne,TX,64285.341778
5292,Falfurrias,TX,67352.561228
2186,Clarksdale,MS,69211.266032
6652,Haskell,TX,71621.563852
10133,Coffeen,IL,71948.214223
6218,Lyford,TX,72802.452007
4166,Marlin,TX,74128.859691
5235,Belzoni,MS,75390.49771
10017,Oakes,ND,75990.016349


#### Let's see how the average price over the time period is related to the size of the region and the volatility of the price.

In [None]:
# Bubble chart: region size (x) against average price (y); the area of each
# bubble is proportional to that region's price variance.
MAX_MARKER_AREA = 2000  # marker area (points^2) of the highest-variance region
COLOR_SEED = 0          # fixed seed so the colours are identical on every run

# One random colour value per region, drawn from a seeded generator so that
# Restart & Run All reproduces the same figure.
colors = np.random.RandomState(COLOR_SEED).rand(df.shape[0])
# Variance is in squared price units, so dividing by a fixed factor ties the
# marker area to the price scale. Normalise by the largest variance instead,
# which keeps every marker within MAX_MARKER_AREA.
sizes = MAX_MARKER_AREA * df['price_variance'] / df['price_variance'].max()

plt.style.use('fivethirtyeight')
fig, ax = plt.subplots()
#Scatter plot of the data
ax.scatter(df['size'], df['average_price'], s=sizes, c=colors, alpha=0.7)
# No hard-coded axis limits: the previous y-range of [0, 2500] lay below the
# smallest average price in the data, so no point was visible. Let matplotlib
# autoscale, and use a log y-axis because the averages span several orders of
# magnitude (assumes all prices are positive).
ax.set_yscale('log')
ax.set_ylabel('Median Listing Price')
ax.set_xlabel('Size')
labels = df['RegionName']
# Caption is placed in axes coordinates (just above the top-left corner) so
# it stays put whatever the data range is.
ax.text(0.0, 1.02,
        'The size of the dot represents variance in price for the region',
        transform=ax.transAxes, fontsize=10, color='red')
#Let's label the dots of the five regions with the highest variance
top_five_variance = df[['RegionName', 'size', 'average_price', 'price_variance']].nlargest(5, 'price_variance')
for name, x, y, _variance in top_five_variance.itertuples(index=False):
    # Offset the label in points rather than data units so the arrow has a
    # sensible length on both the linear x-axis and the log y-axis.
    ax.annotate(name, xy=(x, y), xycoords='data',
                xytext=(10, 20), textcoords='offset points', size=10,
                arrowprops=dict(arrowstyle='->', color='black'))
plt.show()

