In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.datasets import fetch_california_housing

In [4]:
# Load the California Housing dataset. The returned object exposes the
# attributes used by the cells below: .data, .target, .feature_names,
# .target_names and .DESCR.
housing_data = fetch_california_housing()

In [5]:
# Show the dataset's bundled description (attribute list, target units, provenance).
print(housing_data.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [6]:
# Keep the feature names so they can label the DataFrame columns below.
col_name = housing_data.feature_names
col_name

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [7]:
# Wrap the raw feature matrix in a DataFrame whose columns carry the feature names,
# then preview the first five rows.
data = pd.DataFrame(
    data=housing_data.data,
    columns=col_name,
)
data.head(5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [8]:
# Peek at 5 randomly drawn rows; no random_state is passed, so the rows shown
# change from run to run.
data.sample(5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
3669,2.1518,43.0,3.728125,0.934375,1184.0,3.7,34.23,-118.39
2583,1.1576,44.0,3.279621,0.976303,398.0,1.886256,40.87,-124.09
18791,1.7935,20.0,5.628571,1.190476,512.0,2.438095,40.75,-122.56
2921,2.2827,48.0,5.049587,1.090909,947.0,2.608815,35.36,-119.02
813,3.7262,36.0,5.014235,0.964413,1002.0,3.565836,37.61,-122.03


In [9]:
# Name of the regression target shipped with the dataset.
housing_data.target_names

['MedHouseVal']

In [10]:
# Append the target as a column so features and label live in one frame.
data['MedHouseVal']= housing_data.target

In [11]:
# Spot-check 10 random rows now that the target column is attached
# (unseeded, so the selection differs between runs).
data.sample(10)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
7279,1.6667,44.0,3.506452,1.080645,1441.0,4.648387,33.98,-118.25,1.125
8618,6.4242,12.0,6.200586,1.076135,1798.0,2.632504,33.86,-118.38,3.655
13237,7.9627,8.0,8.234463,1.048023,1271.0,3.590395,34.13,-117.69,3.454
6185,4.375,33.0,4.888041,1.010178,1492.0,3.796438,34.1,-117.92,1.505
4026,6.1274,11.0,6.75,1.0875,386.0,2.4125,34.17,-118.54,3.159
16638,3.2112,20.0,6.517308,1.353846,1309.0,2.517308,35.6,-121.1,2.045
16636,2.5172,16.0,6.15047,1.095611,1325.0,2.076803,35.31,-120.82,2.939
14613,2.2663,44.0,3.442922,0.984779,3009.0,4.579909,32.79,-117.17,1.236
15827,3.0,52.0,4.181818,1.058824,1162.0,3.106952,37.75,-122.42,2.75
9058,3.4327,35.0,5.883333,1.038889,1108.0,3.077778,34.69,-118.14,1.003


In [12]:
# Summary statistics per column (count, mean, std, min, quartiles, max).
data.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [13]:
# Persist the combined feature/target table as CSV in the working directory;
# index = False keeps the row index out of the file.
data.to_csv('california_housing.csv', index = False)

In [14]:
# Persist the same table as an Excel workbook. index = False matches the CSV
# export, so the row index is not written as an extra unnamed first column.
data.to_excel('california_housing.xlsx', index = False)

In [15]:
# Another random 5-row peek at the frame (unseeded; output varies per run).
data.sample(5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
2742,5.7843,5.0,5.92399,0.983373,1416.0,3.36342,32.78,-115.58,1.101
14652,2.3578,41.0,5.455598,1.007722,1070.0,4.131274,32.8,-117.15,1.667
18843,1.7292,31.0,5.298755,1.091286,600.0,2.489627,41.46,-122.9,0.617
9402,3.2794,48.0,5.031373,1.090196,567.0,2.223529,37.91,-122.55,4.6
1050,2.7236,19.0,5.772085,1.084806,1252.0,2.212014,38.36,-120.69,1.099


In [16]:
# Count missing values per column.
data.isna().sum()

MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

In [17]:
# (rows, columns) before removing duplicates, for comparison with the check below.
data.shape

(20640, 9)

In [18]:
# Discard fully duplicated rows, rebinding `data` to the de-duplicated frame.
data = data.drop_duplicates()

In [19]:
# (rows, columns) after de-duplication; an unchanged row count means no duplicates existed.
data.shape

(20640, 9)

In [20]:
# Target: median house value. Features: every remaining column.
y = data['MedHouseVal']
x = data.drop(columns = ['MedHouseVal'])

x: (x1, x2, x3, ..., xn)

In [21]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the shuffled
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=666,
    shuffle=True,
)

In [22]:
# Sanity-check the size of the held-out feature set.
x_test.shape

(4128, 8)

In [23]:
# Random 5-row peek at the training features (unseeded; rows vary per run).
x_train.sample(5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
19792,1.625,25.0,6.338843,1.702479,301.0,2.487603,40.77,-123.28
17001,6.5948,23.0,6.673623,1.046745,3484.0,2.90818,37.57,-122.26
11142,3.5455,34.0,5.637615,0.986239,1035.0,4.747706,33.84,-117.95
3887,2.9389,18.0,4.308966,1.097931,1855.0,2.558621,34.21,-118.53
14713,5.7416,25.0,7.154286,0.985714,1067.0,3.048571,32.8,-117.04


In [24]:
# Pairwise correlation between training features, computed on the training split
# only; used to spot strongly related pairs such as AveRooms / AveBedrms.
x_train.corr()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
MedInc,1.0,-0.116846,0.31917,-0.061197,0.002839,0.02185,-0.079627,-0.015013
HouseAge,-0.116846,1.0,-0.15001,-0.074067,-0.292521,0.012795,0.005927,-0.10256
AveRooms,0.31917,-0.15001,1.0,0.854217,-0.072484,-0.008535,0.105857,-0.028308
AveBedrms,-0.061197,-0.074067,0.854217,1.0,-0.064795,-0.006789,0.070774,0.010582
Population,0.002839,-0.292521,-0.072484,-0.064795,1.0,0.063033,-0.108795,0.100166
AveOccup,0.02185,0.012795,-0.008535,-0.006789,0.063033,1.0,0.006487,0.001432
Latitude,-0.079627,0.005927,0.105857,0.070774,-0.108795,0.006487,1.0,-0.924894
Longitude,-0.015013,-0.10256,-0.028308,0.010582,0.100166,0.001432,-0.924894,1.0


In [25]:
# Drop AveBedrms from both splits (it is highly correlated with AveRooms in the
# matrix above). Rebinding instead of dropping in place avoids mutating the
# frames returned by the split, and errors='ignore' makes the cell safe to
# re-run once the column is already gone.
x_train = x_train.drop(columns = ['AveBedrms'], errors = 'ignore')
x_test = x_test.drop(columns = ['AveBedrms'], errors = 'ignore')

In [26]:
from sklearn.preprocessing import RobustScaler, StandardScaler

In [27]:
# Optional feature scaling (currently disabled): uncomment exactly one scaler
# here together with the fit/transform lines in the following cell.
#scaler = RobustScaler()
#scaler = StandardScaler()

In [29]:
# Disabled scaling step (needs `scaler` to be defined): fits the scaler on the
# training features only, then reuses that fit to transform the test features.
#x_train = scaler.fit_transform(x_train)
#x_test = scaler.transform(x_test)

In [30]:
# Linear-regression baseline; n_jobs = -1 requests all available processors
# under scikit-learn's n_jobs convention.
model = LinearRegression(n_jobs = -1)

In [31]:
# Fit the linear model on the training split.
model.fit(x_train,y_train)

In [32]:
# Linear-model predictions for the held-out test features.
y_pred = model.predict(x_test)

In [33]:
# Mean absolute error of the linear model on the test set, in the target's units
# (hundreds of thousands of dollars, per the dataset description).
mean_absolute_error(y_test, y_pred)

0.5496566151278703

In [34]:
# R^2 of the linear model on the test set.
r2_score(y_test, y_pred)

0.5763460193161879

In [36]:
# Fitted coefficients, one per remaining feature, in x_train's column order.
model.coef_

array([ 3.70657695e-01,  9.91187780e-03,  1.72181326e-02, -2.96145535e-06,
       -3.71117556e-03, -4.57042846e-01, -4.64610219e-01])

In [37]:
# Fitted intercept (bias term) of the linear model.
model.intercept_

-38.99521181864263

# Tree-based method

In [38]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [39]:
# Random forest of 100 trees using all available cores. random_state is fixed to
# the same seed as the train/test split so the fitted forest, and the score
# reported below, are reproducible across runs.
rand_forest = RandomForestRegressor(n_jobs = -1, n_estimators = 100, random_state = 666)

In [41]:
# Fit the forest on the same training split used for the linear model.
rand_forest.fit(x_train, y_train)

In [42]:
# Random-forest predictions for the test features.
# NOTE: this rebinds y_pred, replacing the linear model's predictions from above.
y_pred = rand_forest.predict(x_test)

In [43]:
# R^2 of the random forest on the test set (y_pred now holds the forest's predictions).
r2_score(y_test, y_pred)

0.8052766282304673