In [4]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error, r2_score 

In [5]:
# Read the already-cleaned housing data from the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer="cleaned_housing.csv")

In [6]:
# Explore the dataset: preview the first rows (head() returns 5 rows by default)
# to sanity-check the column names and the kind of values each column holds.
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41,880,129,322,126,8.3252,452600
1,-122.22,37.86,21,7099,1106,2401,1138,8.3014,358500
2,-122.24,37.85,52,1467,190,496,177,7.2574,352100
3,-122.25,37.85,52,1274,235,558,219,5.6431,341300
4,-122.25,37.85,52,1627,280,565,259,3.8462,342200


In [7]:
# Split the frame into the regression target (y) and the features (X).
# Target: the column the models are trained to predict.
y = data['median_house_value']
# Features: every remaining column, i.e. everything except the house value.
X = data.drop(columns='median_house_value')

In [8]:
# Train/test split: hold out 20% of the rows for evaluation (80/20 split).
# random_state is the seed, fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=89)

In [9]:
# Inspect the training features produced by the split (the row index shows the shuffled order).
X_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
10144,-117.86,33.89,22,4386,593,1915,592,6.6897
20157,-118.94,34.17,15,1679,271,928,264,5.5681
12433,-121.48,38.56,52,814,216,327,181,2.8542
3825,-118.43,34.17,34,2180,424,906,429,4.4464
18761,-122.08,38.30,2,6718,858,2012,654,6.8872
...,...,...,...,...,...,...,...,...
17490,-121.89,37.28,35,2418,375,988,374,6.0936
19898,-120.24,37.96,34,1747,395,935,362,1.6250
5419,-118.46,34.00,52,888,206,376,194,3.8750
19309,-121.02,37.67,32,3951,797,1916,740,2.6722


In [10]:
# Inspect the held-out test features produced by the split.
X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
11312,-117.99,33.70,25,2017,357,1063,369,4.0345
14365,-117.22,32.87,14,3512,807,1835,792,3.3500
12173,-116.97,33.93,29,2793,722,1583,626,1.4240
11396,-118.11,33.73,32,1258,333,645,334,5.0476
5730,-118.24,34.14,20,3196,994,2929,983,3.0206
...,...,...,...,...,...,...,...,...
9422,-123.21,39.13,27,1531,266,822,234,4.0469
5702,-118.29,34.17,12,2238,682,1882,611,2.9000
16645,-122.41,37.66,32,1385,356,1096,353,4.4750
16215,-121.29,38.03,16,4356,881,1629,818,2.2672


In [11]:
# Build a LinearRegression estimator and fit it on the raw (unscaled) training data.
# The fit call stays as the last expression so the notebook echoes the fitted estimator.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [12]:
# Predict house values for the held-out test features with the fitted linear model.
predictions = model.predict(X=X_test)

In [13]:
# Display the predicted values returned by model.predict for the test set.
predictions

array([216789.31558925, 208090.11778913,  90590.10750207, ...,
       265365.95638487, 132050.77242635, 192247.50546947], shape=(4087,))

In [14]:
# Score the linear model on the held-out set: R^2 and mean squared error
# of the predictions against the true test targets.
r2 = r2_score(y_true=y_test, y_pred=predictions)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)

In [15]:
# Show both metrics as an (r2, mse) tuple. Only the last expression of a cell is
# displayed, so a bare `r2` on its own line is evaluated and then silently discarded.
r2, mse

4847420158.946786

In [16]:
# R^2 score of the linear model on the test set (computed with r2_score).
r2

0.6394992760941295

In [17]:
# Mean squared error of the linear model on the test set (computed with mean_squared_error).
mse

4847420158.946786

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso

In [19]:
# Standardize the features. The scaler is fitted on the training split only and
# then applied to both splits, so the test set does not influence the fitted statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
# Create a Ridge regressor with alpha=1.0 and train it on the standardized training features.
# The fit call stays as the last expression so the notebook echoes the fitted estimator.
ridge = Ridge(alpha=1.0)
ridge.fit(X=X_train_scaled, y=y_train)

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [21]:
# Predict with the ridge model; the input must be the scaled test features,
# because the model was trained on scaled data.
y_pred_ridge = ridge.predict(X=X_test_scaled)

In [22]:
# Compare the true test targets with the ridge predictions side by side.
actual_values = y_test.to_numpy()
print("Actual: ", actual_values)
print("Predicted: ", y_pred_ridge)

Actual:  [229400 171000  73200 ... 246700  91100 136200]
Predicted:  [216786.71744799 208074.24095293  90617.68356955 ... 265327.86338944
 132063.2087247  192251.36121687]


In [23]:
# Evaluate the ridge predictions against the true test targets.
r2_ridge = r2_score(y_true=y_test, y_pred=y_pred_ridge)
mse_ridge = mean_squared_error(y_true=y_test, y_pred=y_pred_ridge)
# Last expression of the cell: display the ridge mean squared error.
mse_ridge

4847520994.482112

In [24]:
# R^2 score of the ridge model on the test set (computed with r2_score).
r2_ridge

0.6394917769951679

In [25]:
import pickle

In [29]:
# Persist the fitted LinearRegression (trained on the unscaled features) to model.pkl.
# The context manager closes the file handle as soon as the dump finishes, even if
# pickling raises, so the data is flushed to disk and the handle is not leaked.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [27]:
# Persist the fitted Ridge model to ridge.pkl. It was trained on standardized
# features, so inputs must be passed through the fitted scaler before predicting.
# The context manager closes the file handle as soon as the dump finishes, even if
# pickling raises, so the data is flushed to disk and the handle is not leaked.
with open("ridge.pkl", "wb") as ridge_file:
    pickle.dump(ridge, ridge_file)

In [28]:
# Persist the fitted StandardScaler to scaler.pkl so the same scaling learned from
# the training split can be re-applied to new inputs for the ridge model.
# The context manager closes the file handle as soon as the dump finishes, even if
# pickling raises, so the data is flushed to disk and the handle is not leaked.
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)