In [13]:
import numpy as np
import pandas as pd

# Problem Understanding

Your Real Estate partner in California needs your help with pricing homes at the optimal level<br>

Help them to predict the expected sale value of properties in their State and you will get slice of their additional sales commission 💸

# Data Understanding

In [14]:
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing()
print(data['DESCR'])

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [15]:
X = pd.DataFrame(data['data'],columns=data['feature_names'])
y = data['target']
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


# Data preparation

### Split your X data in train and test datasets
Here is the documentation: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [24]:
from sklearn.model_selection import train_test_split,KFold,cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
#
# 
# 

In [37]:
poly = PolynomialFeatures(2)
poly.fit(X);
X_poly = pd.DataFrame(poly.transform(X))
X_poly.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,42,43,44
0,1.0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,69.308955,...,103684.0,822.888889,12197.36,-39358.06,6.530864,96.804444,-312.365556,1434.8944,-4630.0724,14940.1729
1,1.0,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,68.913242,...,5764801.0,5065.730228,90901.86,-293450.22,4.451433,79.878612,-257.864868,1433.3796,-4627.2492,14937.7284
2,1.0,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,52.669855,...,246016.0,1389.920904,18773.6,-60631.04,7.85266,106.065537,-342.548249,1432.6225,-4626.784,14942.6176
3,1.0,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,31.844578,...,311364.0,1421.753425,21120.3,-68215.5,6.492025,96.439726,-311.486301,1432.6225,-4627.1625,14945.0625
4,1.0,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,14.793254,...,319225.0,1232.528958,21385.25,-69071.25,4.758799,82.568533,-266.684363,1432.6225,-4627.1625,14945.0625


In [38]:
X_poly.shape

(20640, 45)

In [40]:
cols = poly.get_feature_names(X.columns)

In [41]:
X_poly.columns = cols
X_poly.head()
#boom

Unnamed: 0,1,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc^2,...,Population^2,Population AveOccup,Population Latitude,Population Longitude,AveOccup^2,AveOccup Latitude,AveOccup Longitude,Latitude^2,Latitude Longitude,Longitude^2
0,1.0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,69.308955,...,103684.0,822.888889,12197.36,-39358.06,6.530864,96.804444,-312.365556,1434.8944,-4630.0724,14940.1729
1,1.0,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,68.913242,...,5764801.0,5065.730228,90901.86,-293450.22,4.451433,79.878612,-257.864868,1433.3796,-4627.2492,14937.7284
2,1.0,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,52.669855,...,246016.0,1389.920904,18773.6,-60631.04,7.85266,106.065537,-342.548249,1432.6225,-4626.784,14942.6176
3,1.0,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,31.844578,...,311364.0,1421.753425,21120.3,-68215.5,6.492025,96.439726,-311.486301,1432.6225,-4627.1625,14945.0625
4,1.0,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,14.793254,...,319225.0,1232.528958,21385.25,-69071.25,4.758799,82.568533,-266.684363,1432.6225,-4627.1625,14945.0625


### Split your train data in train and validation datasets

In [76]:
from sklearn.model_selection import train_test_split
#Training dataset
X_train, X_test, y_train, y_test = train_test_split(X_poly,y,test_size=(round(20640*0.2)), random_state=80)
#Validation dataset
X_val_train, X_val, y_val_train, y_val = train_test_split(X_train,y_train,test_size=1000, random_state=25)
# 
# 
# 

### Scale the 3 datasets using StandardScaler

In [77]:
from sklearn.preprocessing import StandardScaler 
scaler = StandardScaler()
# 
# 
# 

In [78]:
scaler.fit(X_val_train)

X_val_train = scaler.transform(X_val_train)
X_val = scaler.transform(X_val)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [79]:
X_val_train = pd.DataFrame(X_val_train, columns = poly.get_feature_names(X.columns))

In [80]:
X_val_train.head()

Unnamed: 0,1,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc^2,...,Population^2,Population AveOccup,Population Latitude,Population Longitude,AveOccup^2,AveOccup Latitude,AveOccup Longitude,Latitude^2,Latitude Longitude,Longitude^2
0,0.0,0.126976,0.107491,-0.551016,-0.102134,-0.204891,-0.073624,-0.710911,0.753792,-0.077742,...,-0.134692,-0.033172,-0.245323,0.217108,-0.010313,-0.076043,0.074823,-0.715473,0.731784,-0.755499
1,0.0,-0.769432,0.346808,-0.065027,-0.080233,0.323691,-0.012583,-0.022832,0.118671,-0.578656,...,-0.009046,0.00032,0.332541,-0.323761,-0.010034,-0.012307,0.012862,-0.052042,0.056285,-0.126596
2,0.0,-0.642803,0.825442,-0.026264,-0.122567,-0.038151,-0.077962,1.095882,-0.861515,-0.523959,...,-0.100614,-0.028477,0.047409,0.019355,-0.01033,-0.0604,0.073853,1.086322,-1.054293,0.8572
3,0.0,0.079086,1.862481,-0.101219,-0.009745,-0.476488,-0.105235,-0.738996,0.603763,-0.1112,...,-0.179237,-0.046216,-0.510133,0.484593,-0.010429,-0.104184,0.105088,-0.741957,0.722108,-0.607546
4,0.0,-0.393453,1.304075,-0.738476,0.0392,-0.331235,-0.112683,-0.87474,0.703782,-0.400805,...,-0.157104,-0.04326,-0.376867,0.341668,-0.010453,-0.111944,0.112598,-0.869308,0.84896,-0.706223


# Modelling and Model Evaluation

### Train a linear regression model

In [121]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
# 
# 
# 

In [122]:
lin_reg.fit(X_val_train, y_val_train)
trained = lin_reg.score(X_val_train, y_val_train)
trained

0.6921783325232995

In [123]:
val_coef = lin_reg.coef_[0:4]
val_coef

array([-4.85519625e-14, -2.28305157e+01, -1.04282296e+01,  1.72421178e+01])

In [124]:
lin_reg.fit(X_val, y_val)
tested = lin_reg.score(X_val,y_val)
tested


0.7528004149933573

In [125]:
test_coef = lin_reg.coef_[0:4]
test_coef

array([ 2.25845109e-11, -2.02724704e+01, -1.51535581e+01,  1.19705633e+01])

In [None]:
# I got quite lost among these coefficients. 

### Measure the R-squared, MSE and MAE of your model
Here is the documentation: https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics

In [130]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
print(mean_squared_error(test_coef, val_coef), mean_absolute_error(test_coef, val_coef), r2_score(test_coef, val_coef))
# 
# 
# 
# 
# 

14.165402911107657 3.1387320637525327 0.9123371153102118


### Train a LASSO model

In [127]:
from sklearn.linear_model import Lasso
lasso = Lasso(.01)
lasso.fit(X_val_train, y_val_train)
lasso.score(X_val_train, y_val_train)
# 
# 
# 



0.6147960792739541

In [131]:
lasso = lasso.coef_[:4]
lasso

array([ 0.        ,  0.37344992,  0.        , -0.04229307])

### Measure the R-squared, MSE and MAE of your model

In [132]:
print(mean_squared_error(test_coef, lasso), mean_absolute_error(test_coef, lasso), r2_score(test_coef, lasso))
# 
# 
# 
# 

200.0482671839004 11.953083710516356 -0.23800277963028504


# Interprete your winning model

### What can you tell your business partner by looking at the coefficients?

In [133]:
# I do not believe I have achieved victory. 
# In my original model I had an unrealistic R2 number ~0.9
# In my lasso model I have a negative R2 number which essentially means my final model fits worse than a horizontal line!
# I would not be able to see my business partner about these findings!!
# 
# 