In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read the Redfin listings export into a DataFrame and preview its first rows.
redfin_csv = '../data/Redfin/redfin_2023-03-19-11-44-03.csv'
alldata = pd.read_csv(redfin_csv)
alldata.head()

In [None]:
# Display the dtype pandas inferred for every column of the loaded frame.
alldata.dtypes

In [None]:
# Columns removed from the frame before any analysis; the dtypes of the
# remaining columns are displayed afterwards.
dropped_columns = [
    'SOLD DATE',
    'STATE OR PROVINCE',
    'URL (SEE https://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING)',
]
alldata.drop(columns=dropped_columns, inplace=True)
alldata.dtypes

In [None]:
# Display the (rows, columns) size of the frame at this point in the notebook.
alldata.shape

In [None]:
# Coordinates of Lake Forest College, stored as (latitude, longitude):
# the plotting cells use home[1] on the longitude axis and home[0] on the
# latitude axis, and the distance cell pairs it with (LATITUDE, LONGITUDE) tuples.
home = (42.248803, -87.825757)


In [None]:
# Three side-by-side maps of the listings (longitude on x, latitude on y).
# Each panel colours the points by a different column and marks the
# reference location `home` with a blue star.
panels = [
    ('PRICE', "Home Prices by Location"),
    ('SQUARE FEET', "Home Areas by Location"),
    ('$/SQUARE FEET', "Home Price/Sq Feet by Location"),
]
home_lat, home_lon = home

fig = plt.figure(figsize=[16,8])
ax1, ax2, ax3 = (fig.add_subplot(1, 3, slot) for slot in (1, 2, 3))

for axis, (column, heading) in zip((ax1, ax2, ax3), panels):
    axis.scatter(alldata.LONGITUDE, alldata.LATITUDE, c=alldata[column], cmap='hot', alpha=0.8)
    axis.scatter(home_lon, home_lat, marker='*', color='blue', s=200)
    axis.title.set_text(heading)

plt.show()

In [None]:
# Single map of the listings: marker colour encodes PRICE and marker area is
# SQUARE FEET scaled by 0.3. The reference location `home` is drawn in blue.
fig = plt.figure(figsize=[8,16])

plt.scatter(alldata.LONGITUDE, alldata.LATITUDE, marker = '^', c = alldata['PRICE'], s = (alldata['SQUARE FEET']*0.3), alpha = 0.4, cmap='hot')
# The home marker uses the same 0.3 scale applied to a fixed value of 1100.
# NOTE(review): 1100 looks like a nominal reference area in sq ft — confirm.
plt.scatter(home[1], home[0], marker = '^', color='blue', alpha = 0.4, s=(1100*0.3))
# With the pyplot interface the title is set by calling plt.title(...);
# `plt.title` is a function and has no `.set_text` (that form only exists on
# an Axes' `.title` text object), which is why the old call could not work.
plt.title("Home Prices and Areas by Location")
plt.show()


In [None]:
import seaborn as sb

# Annotated heatmap of the pairwise correlations between numeric columns.
# Since pandas 2.0, DataFrame.corr() no longer skips non-numeric columns by
# default and raises when it meets one, so the frame is restricted to numeric
# dtypes explicitly. select_dtypes also works on older pandas releases.
# NOTE(review): the frame appears to still hold text columns at this point
# (e.g. PROPERTY TYPE) — confirm against the dtypes output.
numeric_columns = alldata.select_dtypes(include='number')

fig = plt.figure(figsize=[10,8])

sb.heatmap(numeric_columns.corr(), annot=True, fmt='0.1f', cmap = 'jet')

plt.show()



In [None]:
# geopy provides the geographic distance between two (lat, lon) points
from geopy import distance

# Distance in miles from `home` to every listing, in row order.
dists = np.array([
    distance.distance(home, (lat, lon)).miles
    for lat, lon in zip(alldata.LATITUDE, alldata.LONGITUDE)
])

# Row positions ordered from the nearest listing to the farthest.
idx = np.argsort(dists)

# Plot the distances in ascending order.
plt.plot(dists[idx])
plt.show()

In [None]:
# Store the per-listing distance (miles) computed from `home` as a new column.
alldata = alldata.assign(DISTANCE=dists)

In [None]:
# Price against distance from `home`, coloured by the raw ZIP code value;
# the colorbar gets one tick per distinct ZIP code.
zip_codes = alldata['ZIP OR POSTAL CODE']

fig = plt.figure(figsize=[16,8])
plt.scatter(alldata['DISTANCE'], alldata['PRICE'], c=zip_codes, cmap='nipy_spectral')
plt.colorbar(ticks=zip_codes.unique())
plt.show()

In [None]:
# Recode each ZIP code as a small consecutive integer (0, 1, 2, ...) in order
# of first appearance, so the colormap is spread evenly over the ZIP codes
# rather than over their raw numeric values.
zipdata = alldata['ZIP OR POSTAL CODE']

uniquelabels = zipdata.unique()
cleanup_nums = dict(zip(uniquelabels,range(len(uniquelabels))))
zipdata = zipdata.replace(cleanup_nums)

fig = plt.figure(figsize=[16,8])

# Price against distance from `home`, coloured by the recoded ZIP index.
plt.scatter(alldata['DISTANCE'], alldata['PRICE'], c = zipdata, cmap = 'nipy_spectral' )

# One tick per distinct code (not one per listing), labelled with the ZIP code
# that the code stands for so the colorbar can be read directly.
cbar = plt.colorbar(ticks=list(cleanup_nums.values()))
cbar.set_ticklabels([str(label) for label in uniquelabels])

plt.show()

In [None]:
# Display the distinct values found in the PROPERTY TYPE column.
alldata['PROPERTY TYPE'].unique()

In [None]:
# Histogram of the PRICE column using matplotlib's default binning.
prices = alldata['PRICE']
plt.hist(prices)
plt.show()

In [None]:
# Histogram of the SQUARE FEET column using matplotlib's default binning.
square_feet = alldata['SQUARE FEET']
plt.hist(square_feet)
plt.show()

In [None]:
# Histogram of the $/SQUARE FEET column using matplotlib's default binning.
price_per_sqft = alldata['$/SQUARE FEET']
plt.hist(price_per_sqft)
plt.show()

In [None]:
# Display the number of missing values in each column.
alldata.isna().sum()

In [None]:
# Replace missing HOA/MONTH values with 0, then discard four columns.
# NOTE(review): presumably a missing HOA fee means "no HOA" — confirm.
hoa_filled = alldata['HOA/MONTH'].fillna(0)
alldata['HOA/MONTH'] = hoa_filled
alldata.drop(columns=['LOT SIZE','MLS#','FAVORITE','INTERESTED'], inplace=True)


In [None]:
# Keep only the rows where both SQUARE FEET and YEAR BUILT are present,
# then display the remaining missing-value count per column.
required_columns = ['SQUARE FEET', 'YEAR BUILT']
alldata = alldata.dropna(subset=required_columns)

alldata.isna().sum()

In [None]:
# Renumber the rows from 0 after the row drops, discarding the old index
# instead of keeping it as a column.
alldata = alldata.reset_index(drop=True)
alldata.head()

In [None]:
# Feature matrix (four columns, in this order) and target vector (PRICE),
# both converted to NumPy arrays for scikit-learn.
feature_columns = ['LONGITUDE','LATITUDE','SQUARE FEET','YEAR BUILT']
X = np.array(alldata[feature_columns])
y = np.array(alldata['PRICE'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split


In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split; fit() returns
# the estimator itself, which is then displayed as the cell output.
model = LinearRegression().fit(X_train, y_train)
model


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Make predictions using the testing set
y_pred = model.predict(X_test)

# The fitted coefficients, one per column of X
print('Coefficients: \n', model.coef_)

# Root mean squared error. The square root is taken explicitly because the
# `squared=False` shortcut of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root mean squared error: %.2f" % rmse)

# Coefficient of determination (R^2): 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))

# Plot actual (green) and predicted (red) prices against test-sample position
fig=plt.figure(figsize=(16, 8))
plt.scatter(range(len(y_test)), y_test, color='green', label='actual')
plt.scatter(range(len(y_pred)), y_pred, color='red', label='predicted')
plt.legend()
plt.show()