# California Housing Price Prediction .

In [None]:
import pandas as pd
import numpy as np
from sklearn import datasets, linear_model, metrics
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
%config IPCompleter.greedy=True

In [None]:
ca = pd.read_excel("1553768847_housing.xlsx")

In [None]:
ca.describe()

In [None]:
ca.head()

In [None]:
ca.info()

In [None]:
ca[ca["total_bedrooms"].isnull()].describe()

In [None]:
ca["total_bedrooms"].std()

In [37]:
ca["total_bedrooms"].min()

1.0

In [38]:
ca["total_bedrooms"].max()

6445.0

In [39]:
ca["total_bedrooms"].mean()

537.6163178294573

In [40]:
ca["total_bedrooms"].median()

435.0

In [41]:
ca["total_rooms"].std()

2181.615251582795

In [42]:
ca["total_rooms"].min()

2

In [43]:
ca["total_rooms"].max()

39320

In [44]:
ca["total_rooms"].median()

2127.0

In [45]:
ca["total_rooms"].median()/ca["total_bedrooms"].median()

4.889655172413793

In [46]:
ca["total_bedrooms"] = ca["total_bedrooms"].fillna(0)

In [47]:
ca["total_bedrooms"].count()

20640

In [48]:
total_bedrooms_null = ca.loc[ca["total_bedrooms"].isnull(), "total_rooms"]

In [49]:
print(total_bedrooms_null)

Series([], Name: total_rooms, dtype: int64)


In [50]:
ca["total_bedrooms"].isnull()

0        False
1        False
2        False
3        False
4        False
         ...  
20635    False
20636    False
20637    False
20638    False
20639    False
Name: total_bedrooms, Length: 20640, dtype: bool

In [51]:
for i in ca.index:
    if ca['total_bedrooms'][i]==0:
        ca['total_bedrooms'][i] = ca['total_rooms'][i]/5

In [52]:
ca.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null int64
total_rooms           20640 non-null int64
total_bedrooms        20640 non-null float64
population            20640 non-null int64
households            20640 non-null int64
median_income         20640 non-null float64
ocean_proximity       20640 non-null object
median_house_value    20640 non-null int64
dtypes: float64(4), int64(5), object(1)
memory usage: 1.6+ MB


In [53]:
ca.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.616318,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,420.792318,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,295.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [54]:
y = ca.median_house_value

In [55]:
y

0        452600
1        358500
2        352100
3        341300
4        342200
          ...  
20635     78100
20636     77100
20637     92300
20638     84700
20639     89400
Name: median_house_value, Length: 20640, dtype: int64

In [56]:
X = ca.drop("median_house_value",1)

In [57]:
X.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.616318,1425.476744,499.53968,3.870671
std,2.003532,2.135952,12.585558,2181.615252,420.792318,1132.462122,382.329753,1.899822
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999
25%,-121.8,33.93,18.0,1447.75,295.0,787.0,280.0,2.5634
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001


In [58]:
X.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY


In [59]:
X.ocean_proximity.unique() 

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [60]:
X['near_bay'] = X.apply(lambda row: 1 if row.ocean_proximity=="NEAR BAY" else (0), axis = 1)
X['1h_ocean'] = X.apply(lambda row: 1 if row.ocean_proximity=="<1H OCEAN" else (0), axis = 1)
X['inland'] = X.apply(lambda row: 1 if row.ocean_proximity=="INLAND" else (0), axis = 1)
X['island'] = X.apply(lambda row: 1 if row.ocean_proximity=="ISLAND" else (0), axis = 1)
X['near_ocean'] = X.apply(lambda row: 1 if row.ocean_proximity=="NEAR OCEAN" else (0), axis = 1)

In [61]:
X = X.drop("ocean_proximity",1)

In [62]:
X

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,near_bay,1h_ocean,inland,island,near_ocean
0,-122.23,37.88,41,880,129.0,322,126,8.3252,1,0,0,0,0
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,1,0,0,0,0
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,1,0,0,0,0
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,1,0,0,0,0
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,0,0,1,0,0
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,0,0,1,0,0
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,0,0,1,0,0
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,0,0,1,0,0


In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [65]:
# Calculate and show correlation matrix
cm = np.corrcoef(X_train.values)
hm = sns.heatmap(cm,
                cbar=True,
                annot=True,
                square=True,
                fmt='.2f',
                annot_kws={'size': 15},
                yticklabels=cols,
                xticklabels=cols)

NameError: name 'cols' is not defined

In [None]:
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

In [None]:
# fit a model
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
predictions = lm.predict(X_test)

In [None]:
predictions

In [None]:
## The line / model
plt.scatter(y_test, predictions)
plt.xlabel("median_house_value")
plt.ylabel("Predictions")

In [None]:
print ("Score:", model.score(X_test, y_test))

In [None]:
print ("Score:", model.score(X_train, y_train))

In [None]:
y_pred = lm.predict(X_test)

In [None]:
print(metrics.mean_absolute_error(y_test, y_pred)) #MAE

In [None]:
print(metrics.mean_squared_error(y_test, y_pred)) #MSE

In [None]:
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred))) #RMSE

In [None]:
median_income_train = X_train.median_income
median_income_test = X_test.median_income

In [None]:
#X_train_ME = X_train.drop("median_income",1)
#X_test_ME = X_test.drop("median_income",1)

In [None]:
median_income_train

In [None]:
median_income_test

In [None]:
MEmodel = lm.create(X_train, target='median_house_value', features=['median_income'], validation_set=None)

In [None]:
MEpredictions

In [None]:
## The line / model
plt.bar(y_test, predictions)
plt.xlabel("Median Income")
plt.ylabel("Predictions")