In [59]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

# Housing CSV served raw from the handson-ml2 GitHub repository.
URL = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
df = pd.read_csv(URL)

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed seed keeps the split identical across runs.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=35)

# Split the training rows into predictors and the regression target.
X_train = train_set.drop(columns="median_house_value")
Y = train_set.loc[:, "median_house_value"].copy()

# Numeric-only view of the predictors (the single text column is removed).
X_num = X_train.drop(columns="ocean_proximity")

In [60]:
from sklearn.base import BaseEstimator, TransformerMixin
# Positional column indices read by CombinedAttributesAdder.transform.
# NOTE(review): these assume the numeric array keeps the CSV's column order
# (rooms at 3, bedrooms at 4, population at 5, households at 6) — confirm.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
  """Append ratio features to a 2-D numeric array.

  Always appends rooms-per-household and population-per-household; appends
  bedrooms-per-room as a third extra column when ``add_bedrooms_per_room``
  is true. Columns are looked up by the module-level ``*_ix`` indices, so the
  input must support ``X[:, i]`` indexing (e.g. a NumPy array).
  """

  def __init__(self, add_bedrooms_per_room=True):
    self.add_bedrooms_per_room = add_bedrooms_per_room

  def fit(self, X, y=None):
    """Nothing to learn; return ``self``.

    ``y`` is accepted (and ignored) so that callers which pass a target,
    such as ``fit_transform(X, y)``, do not fail with a ``TypeError``.
    """
    return self

  def transform(self, X):
    """Return ``X`` with the ratio columns appended on the right."""
    rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
    population_per_household = X[:, population_ix] / X[:, households_ix]
    if self.add_bedrooms_per_room:
      bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
      return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
    return np.c_[X, rooms_per_household, population_per_household]

In [61]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric preprocessing chain: median imputation, then the derived ratio
# columns, then standardisation.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder(add_bedrooms_per_room=True)),
    ('str_scaler', StandardScaler()),
])

# Fit on the numeric training columns and show the transformed array.
num_pipeline.fit_transform(X_num)

array([[-1.25390838,  1.10757958, -1.79230424, ...,  0.10067299,
        -0.05831117, -0.44204612],
       [ 1.31649014, -0.79863258, -1.2367069 , ..., -0.04645309,
        -0.07680567, -0.14642693],
       [ 0.65894633, -0.77989831,  0.66819829, ..., -0.6536053 ,
         0.06743175,  0.76284898],
       ...,
       [ 0.80340671, -0.54103634,  0.27134305, ...,  0.34765451,
        -0.03553681, -0.50685707],
       [-1.12937357,  0.78909696, -0.52236745, ...,  0.09768495,
         0.14776252, -0.32081214],
       [ 0.62407658, -0.67217624,  0.58882724, ...,  0.35493053,
        -0.0368153 , -0.65791088]])

In [62]:
from sklearn.compose import ColumnTransformer

# Column names routed to each branch of the preprocessor.
num_attribs = list(X_num)
cat_attribs = ['ocean_proximity']

# Numeric columns go through num_pipeline; the categorical column is one-hot
# encoded. handle_unknown='ignore' encodes a category that was absent when the
# encoder was fitted as an all-zero row instead of raising at transform time,
# which can otherwise happen when a rare category falls only in held-out data.
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs),
])

In [63]:
# Fit the whole preprocessor on the training features and keep the transformed
# matrix that the models are trained on.
X_prepared = full_pipeline.fit_transform( X_train )

In [64]:
# Show the prepared training matrix: outputs of the 'num' branch come first,
# followed by the one-hot columns of the 'cat' branch.
X_prepared

array([[-1.25390838,  1.10757958, -1.79230424, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.31649014, -0.79863258, -1.2367069 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.65894633, -0.77989831,  0.66819829, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.80340671, -0.54103634,  0.27134305, ...,  0.        ,
         0.        ,  0.        ],
       [-1.12937357,  0.78909696, -0.52236745, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.62407658, -0.67217624,  0.58882724, ...,  0.        ,
         0.        ,  0.        ]])

In [65]:
 from sklearn.linear_model import LinearRegression

In [66]:
 # Linear-regression baseline fitted on the prepared training matrix and its targets.
 LR_model = LinearRegression()
 LR_model.fit(X_prepared, Y)


In [67]:
# Ten rows for a quick spot check. They are drawn from the TRAINING features,
# so comparing predictions on them only sanity-checks the fit; it is not a
# held-out evaluation. The fixed seed (same value as the train/test split)
# makes the same ten rows appear on every run.
test_data = X_train.sample(10, random_state=35)
# Matching targets, looked up by the sampled rows' index labels.
test_label = Y.loc[test_data.index]

In [68]:
# Apply the already-fitted preprocessor to the sampled rows (transform only, no refit).
test_data_prepared = full_pipeline.transform(test_data)


In [69]:
# Linear-regression predictions for the sampled rows.
predicted_labels = LR_model.predict(test_data_prepared)

In [70]:
# Side-by-side table of predicted vs. true values, indexed by the sampled rows' labels.
# NOTE(review): the column labels look like Uzbek for "prediction" and "actual price" — confirm.
pd.DataFrame({'Bashorat': predicted_labels, "Real baxosi": test_label})

Unnamed: 0,Bashorat,Real baxosi
19910,130444.012725,92500.0
18925,259447.841782,195800.0
17192,269884.612898,295300.0
16718,214316.037031,171300.0
2086,73339.662033,56100.0
1246,130523.362983,112500.0
1824,214021.104497,261900.0
19535,97369.870433,119000.0
499,175573.10026,150800.0
19133,164594.462273,162500.0


In [72]:
# Held-out predictors and target, copied so later cells cannot alter test_set.
X_test = test_set.drop(columns="median_house_value").copy()
y_test = test_set.loc[:, 'median_house_value'].copy()

In [73]:
 # Preprocess the held-out features with the preprocessor fitted on the training set (transform only).
 X_test_prepared = full_pipeline.transform (X_test)

In [75]:
# Linear-regression predictions on the held-out set.
y_predicted = LR_model.predict(X_test_prepared)

In [78]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the current predictions against the held-out targets.
mae = mean_absolute_error(y_true=y_test, y_pred=y_predicted)
print("MAE=", mae)

MAE= 49810.06572460955


In [79]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the held-out set; its square root is printed as RMSE.
mse = mean_squared_error(y_true=y_test, y_pred=y_predicted)
print("RMSE=", np.sqrt(mse))

RMSE= 69116.59360540153


In [80]:
from sklearn.ensemble import RandomForestRegressor
# Random forest trained on the same prepared matrix as the linear model.
# The forest is randomised, so it is seeded (same value as the train/test
# split) to make the fit, and every score computed from it, repeatable.
RF_model = RandomForestRegressor(random_state=35)
RF_model.fit(X_prepared, Y)

In [81]:
# Random-forest predictions on the held-out set; this overwrites the linear model's y_predicted.
y_predicted = RF_model.predict(X_test_prepared)

In [82]:
from sklearn.metrics import mean_squared_error

# Same RMSE computation as before, now on the random-forest predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_predicted)
print("RMSE=", np.sqrt(mse))

RMSE= 50819.64004556061


In [83]:
# Predictors and target taken from the complete dataset (train and test rows together).
x = df.drop(columns='median_house_value')
y = df.loc[:, 'median_house_value'].copy()

In [84]:
# Preprocess the complete dataset with the preprocessor that was fitted on the
# training split only (transform only, no refit here).
x_prepared = full_pipeline.transform(x)

In [85]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the linear model; each fold's score is a negated MSE.
mse_scores = cross_val_score(
    estimator=LR_model,
    X=x_prepared,
    y=y,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [86]:
def display_scores(scores):
  """Print the given scores, then their mean and standard deviation.

  ``scores`` must provide ``.mean()`` and ``.std()`` (e.g. a NumPy array).
  Each value is printed on its own line, prefixed with its label.
  """
  print('scores', scores)
  average = scores.mean()
  print("mean", average)
  spread = scores.std()
  print("std.dev", spread)

In [87]:
# The scores are negated MSE values, so flip the sign and take the square root to report per-fold RMSE.
display_scores(np.sqrt(-mse_scores))

scores [73389.65874804 74806.80242451 75428.90861012 76602.13620889
 66196.84790983]
mean 73284.87078027803
std.dev 3692.3833170365147


In [88]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the random forest; this replaces the linear model's mse_scores.
mse_scores = cross_val_score(
    estimator=RF_model,
    X=x_prepared,
    y=y,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [89]:
# Per-fold RMSE for the random forest: negate the negated-MSE scores, then take the square root.
display_scores(np.sqrt(-mse_scores))

scores [77402.68356005 63934.46613566 61036.56189019 79787.69959655
 62216.30478765]
mean 68875.54319401828
std.dev 8024.918774750932


In [90]:
import joblib
# Persist the fitted random forest with joblib. Only the estimator is written
# by this cell; the fitted full_pipeline is not saved here.
filename = 'RF_model.jbl'
joblib.dump(RF_model, filename)

['RF_model.jbl']

In [91]:
import pickle
# Persist the fitted linear model with pickle; `filename` is reused from the previous cell.
filename = 'LR_model.pkl'
# The file is opened in binary write mode and closed automatically when the with-block exits.
with open (filename, 'wb') as file:
  pickle.dump(LR_model, file)
