In [57]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

In [65]:
df = pd.read_csv('housing.csv')
print(df.head())
print(df.describe())

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  
          longitude      latitude  housing_median_age   total_rooms  \
coun

In [66]:
df = df.dropna()
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0
mean,-119.570689,35.633221,28.633094,2636.504233,537.870553,1424.946949,499.433465,3.871162,206864.413155
std,2.003578,2.136348,12.591805,2185.269567,421.38507,1133.20849,382.299226,1.899291,115435.667099
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1450.0,296.0,787.0,280.0,2.5637,119500.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5365,179700.0
75%,-118.01,37.72,37.0,3143.0,647.0,1722.0,604.0,4.744,264700.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [67]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [68]:
# Predict median house pricing 

# Changing the Lat long features to eztablish that LAT LONG are depicting a 3D space
# x = cos(lat) * cos(lon) y = cos(lat) * sin(lon),  z = sin(lat) 
x = []
y = []
z = []
for lat,lon in zip (df["latitude"], df["longitude"]):
    x.append(np.cos(lat))
    y.append(np.cos(lat) * np.sin(lon))
    z.append(np.sin(lat))
df = df.drop(labels=["latitude","longitude"], axis=1)
df["x"]=x
df["y"]=y
df["z"]=z


In [69]:
df = pd.get_dummies(df, prefix=["ocean_proximity"],  drop_first=True)

In [None]:
plt.scatter(df["total_rooms"],df["median_house_value"] )

In [None]:
sns.pairplot(df)

In [None]:
plt.scatter(df["total_rooms"], df["total_bedrooms"])

In [70]:
# Implies that total rooms and total_bedrooms are correlated 
df = df.drop(labels=["total_rooms"], axis=1)

In [None]:
plt.scatter(df["households"], df["total_bedrooms"], color="red")

In [None]:
plt.scatter(df["households"], df["population"])

In [71]:
# Drop households
df = df.drop(labels = ["households"], axis=1)

In [72]:
df.head()

Unnamed: 0,housing_median_age,total_bedrooms,population,median_income,median_house_value,x,y,z,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,41.0,129.0,322.0,8.3252,452600.0,0.983684,-0.283278,0.179903,0,0,1,0
1,21.0,1106.0,2401.0,8.3014,358500.0,0.987085,-0.293696,0.160195,0,0,1,0
2,52.0,190.0,496.0,7.2574,352100.0,0.988638,-0.275223,0.150316,0,0,1,0
3,52.0,235.0,558.0,5.6431,341300.0,0.988638,-0.265714,0.150316,0,0,1,0
4,52.0,280.0,565.0,3.8462,342200.0,0.988638,-0.265714,0.150316,0,0,1,0


In [73]:
def normalize(df):
    result = df.copy()
    for feature_name in df.columns:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result

df_ = normalize(df[["housing_median_age","total_bedrooms", "population","median_income", "median_house_value"]])
df = df.drop(labels= ["housing_median_age","total_bedrooms", "population","median_income", "median_house_value"], axis=1)
df_.head()


Unnamed: 0,housing_median_age,total_bedrooms,population,median_income,median_house_value
0,0.784314,0.019863,0.008941,0.539668,0.902266
1,0.392157,0.171477,0.06721,0.538027,0.708247
2,1.0,0.02933,0.013818,0.466028,0.695051
3,1.0,0.036313,0.015555,0.354699,0.672783
4,1.0,0.043296,0.015752,0.230776,0.674638


In [74]:
df = pd.concat([df,df_], axis=1)
df.head()

Unnamed: 0,x,y,z,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,housing_median_age,total_bedrooms,population,median_income,median_house_value
0,0.983684,-0.283278,0.179903,0,0,1,0,0.784314,0.019863,0.008941,0.539668,0.902266
1,0.987085,-0.293696,0.160195,0,0,1,0,0.392157,0.171477,0.06721,0.538027,0.708247
2,0.988638,-0.275223,0.150316,0,0,1,0,1.0,0.02933,0.013818,0.466028,0.695051
3,0.988638,-0.265714,0.150316,0,0,1,0,1.0,0.036313,0.015555,0.354699,0.672783
4,0.988638,-0.265714,0.150316,0,0,1,0,1.0,0.043296,0.015752,0.230776,0.674638


In [76]:
train_data_label= df["median_house_value"][:15000]
test_data_label =df["median_house_value"][15000:]

In [77]:
train_features = df.drop(labels= "median_house_value", axis=1)[:15000]
test_features = df.drop(labels = "median_house_value", axis=1)[15000:]

In [78]:
reg = LinearRegression().fit(train_features, train_data_label)

In [79]:
reg.coef_

array([-0.00993942, -0.05527379,  0.01308987, -0.13256515,  0.36469048,
       -0.00733717,  0.02391314,  0.09198081,  1.46390922, -2.79569914,
        1.14805331])

In [80]:
reg.intercept_ 

0.06040735000428321

In [81]:
from sklearn.metrics import mean_squared_error
pred = reg.predict(test_features)
err = mean_squared_error(test_data_label, pred)
print(err)

0.028850874797135297


In [103]:
clf = Ridge(alpha=0.0001)
clf.fit(train_features,train_data_label)

Ridge(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [104]:
pred_ridge = clf.predict(test_features)
err_ridge = mean_squared_error(test_data_label, pred_ridge)
print(err_ridge)

0.028850853489246445
