In [2]:
import pandas as pd
import numpy as np
import sklearn

In [3]:
# Load housing.csv (path is relative to the working directory) into a DataFrame.
df= pd.read_csv('housing.csv')

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(df, random_state=42, test_size=0.2)

# Separate the regression target from the predictors of the training split.
y = train_set["median_house_value"].copy()
X_train = train_set.drop(columns=["median_house_value"])

# Predictors without the "ocean_proximity" column (handled separately as a categorical attribute).
X_num = X_train.drop(columns=["ocean_proximity"])

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
# Positions of the columns needed to compute the derived ratio features
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing columns.

    Two columns are always appended, in this order: rooms per household and
    population per household. When ``add_bedrooms_per_room`` is true, a third
    column (bedrooms per room) is appended after them.

    Columns are looked up by position through the module-level constants
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``,
    so the column order of the input must match those positions.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        Whether to append the optional bedrooms-per-room column.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Learn nothing from the data (this is a pure transformer); return ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` with the derived ratio columns appended on the right.

        Parameters
        ----------
        X : 2-D array-like
            Converted with ``np.asarray`` first, so a DataFrame or nested list
            is accepted as well as a NumPy array (positional ``X[:, i]``
            slicing only works on arrays).

        Returns
        -------
        numpy.ndarray
            The input columns followed by two or three derived columns.
        """
        X = np.asarray(X)
        rooms = X[:, rooms_ix]
        households = X[:, households_ix]
        rooms_per_household = rooms / households
        population_per_household = X[:, population_ix] / households
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / rooms
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric preprocessing chain, executed in order:
#   1. fill missing values with the column median,
#   2. append the derived ratio features,
#   3. standardize the resulting columns.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder(add_bedrooms_per_room=True)),
    ("std_scaler", StandardScaler()),
])

In [7]:
from sklearn.compose import ColumnTransformer

# Column groups: the categorical column is one-hot encoded, every other
# predictor goes through the numeric pipeline.
cat_attribs = ["ocean_proximity"]
num_attribs = X_num.columns.tolist()

full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

In [8]:
# Fit every preprocessing step on the training predictors and transform them in one call.
X_prepared = full_pipeline.fit_transform(X_train)

In [9]:
# Display the prepared training matrix.
X_prepared

array([[ 1.27258656, -1.3728112 ,  0.34849025, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.70916212, -0.87669601,  1.61811813, ...,  0.        ,
         0.        ,  1.        ],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ...,  0.        ,
         0.        ,  0.        ],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.        ,
         0.        ,  0.        ],
       [-1.41489815,  0.99543676,  1.85617335, ...,  0.        ,
         1.        ,  0.        ]])

In [10]:
from sklearn.linear_model import LinearRegression
# Baseline model: linear regression.
# NOTE(review): the variable name is a misspelling of "regression"; later cells refer to it by this name.
linear_regration = LinearRegression()

In [11]:
# Train the linear model on the prepared features and the target values in `y`.
linear_regration.fit(X_prepared,y)

In [12]:
# Draw 10 rows from the training predictors for a quick sanity check of the model.
# The fixed seed makes the sample (and every output derived from it) identical on each run.
# Note: these rows come from the training split, so this is not a held-out evaluation.
test_data = X_train.sample(10, random_state=42)
test_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
10807,-117.93,33.65,35.0,2133.0,413.0,1473.0,402.0,4.4211,<1H OCEAN
86,-122.27,37.81,40.0,880.0,451.0,582.0,380.0,0.977,NEAR BAY
14412,-117.23,32.78,35.0,1649.0,355.0,746.0,360.0,4.6293,NEAR OCEAN
4217,-118.27,34.11,39.0,3825.0,916.0,1378.0,746.0,4.4094,<1H OCEAN
14208,-117.04,32.69,27.0,1790.0,356.0,1286.0,347.0,3.5437,NEAR OCEAN
17203,-119.73,34.43,35.0,2703.0,654.0,1383.0,631.0,4.5278,NEAR OCEAN
17932,-121.94,37.35,52.0,906.0,227.0,1662.0,219.0,3.1667,<1H OCEAN
13540,-117.3,34.14,39.0,1781.0,335.0,841.0,320.0,1.9432,INLAND
5757,-118.28,34.18,50.0,2195.0,336.0,878.0,309.0,6.884,<1H OCEAN
17315,-120.18,34.62,25.0,1337.0,219.0,671.0,225.0,3.1912,NEAR OCEAN


In [13]:
# Look up the true target values for the sampled rows by their index labels.
test_label = y.loc[test_data.index]
test_label

10807    215200.0
86       118800.0
14412    356500.0
4217     352600.0
14208    115800.0
17203    340400.0
17932    231600.0
13540     89000.0
5757     365600.0
17315    226400.0
Name: median_house_value, dtype: float64

In [14]:
# Apply the already-fitted preprocessing (transform only, no refit) to the sampled rows,
# then predict their target values with the linear model.
test_data_prepared = full_pipeline.transform(test_data)
predicted_labels= linear_regration.predict(test_data_prepared)

In [15]:
# Display the predictions for the sampled rows.
predicted_labels

array([233318.73872913, 215980.59090831, 275338.25122902, 298407.75376911,
       192683.45811673, 313631.87557749, 195112.14282591,  83822.74768088,
       355553.81133429, 214302.20618933])

In [16]:
# Side-by-side comparison of predictions and true values for the sampled rows.
# Column labels are Uzbek: 'Bashorat' = prediction, 'Asl qiymat' = actual value.
pd.DataFrame({'Bashorat':predicted_labels,'Asl qiymat':test_label})

Unnamed: 0,Bashorat,Asl qiymat
10807,233318.738729,215200.0
86,215980.590908,118800.0
14412,275338.251229,356500.0
4217,298407.753769,352600.0
14208,192683.458117,115800.0
17203,313631.875577,340400.0
17932,195112.142826,231600.0
13540,83822.747681,89000.0
5757,355553.811334,365600.0
17315,214302.206189,226400.0


In [17]:
# Display the held-out split produced by train_test_split.
test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,278000.0,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,263300.0,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,266800.0,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,500001.0,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,72300.0,INLAND


In [18]:
# Display the held-out split again (same expression as the previous cell).
test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,278000.0,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,263300.0,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,266800.0,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,500001.0,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,72300.0,INLAND


In [19]:
# Predictors of the held-out split: every column except the target.
X_test = test_set.drop(columns=['median_house_value'])
X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,INLAND


In [20]:
# Target values of the held-out split (copied so later changes do not touch test_set).
y_test = test_set['median_house_value'].copy()

In [21]:
# Transform only: reuse the preprocessing already fitted on the training predictors.
X_test_prepared = full_pipeline.transform(X_test)

In [22]:
# Linear-model predictions for the held-out split.
y_predicted = linear_regration.predict(X_test_prepared)
y_predicted

array([ 61874.25460143, 121853.52511139, 267770.94368091, ...,
       447837.04647878, 117275.9214608 , 185597.46125194])

In [23]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error of the predictions against the held-out targets.
mae = mean_absolute_error(y_test,y_predicted)

In [24]:
# Report the mean absolute error computed above.
print('MAE=',mae)

MAE= 50898.73953494079


In [25]:
from sklearn.metrics import mean_squared_error
# Mean squared error of the predictions against the held-out targets.
mse = mean_squared_error(y_test,y_predicted)

In [26]:
# NOTE(review): the value printed is the square root of the MSE (i.e. the RMSE),
# even though the label reads 'MSE='.
print('MSE=',np.sqrt(mse))

MSE= 72701.32600762136


In [30]:
from sklearn.ensemble import RandomForestRegressor
# Fit a random forest on the same prepared training data as the linear model.
# Training is randomized (e.g. bootstrap sampling), so the seed is pinned to
# make the fitted model and its scores reproducible across runs.
RF_model = RandomForestRegressor(random_state=42)
RF_model.fit(X_prepared, y)

In [32]:
# Random-forest predictions for the held-out split.
# NOTE(review): this rebinds `y_predicted`, which previously held the linear model's predictions.
y_predicted=RF_model.predict(X_test_prepared)

In [33]:
from sklearn.metrics import mean_squared_error
# Mean squared error of the current `y_predicted` against the held-out targets.
mse = mean_squared_error(y_test,y_predicted)
# NOTE(review): prints the square root of the MSE (the RMSE) under an 'MSE=' label.
print('MSE=',np.sqrt(mse))

MSE= 50205.04287409315


In [34]:
# Rebuild the target and predictors from the FULL dataset (training and held-out rows together).
# NOTE(review): this rebinds `y` and `X_prepared`, which until now held the
# training-only data; re-running an earlier fitting cell after this one would
# train on the held-out rows as well.
y = df["median_house_value"].copy()
X = df.drop(columns=["median_house_value"])

# Transform only: the preprocessing is not refitted here.
X_prepared = full_pipeline.transform(X)

In [35]:
def display_scores(scores):
    """Print a collection of scores together with their mean and standard deviation.

    Parameters
    ----------
    scores : array-like
        The scores to summarize. The input is converted with ``np.asarray``,
        so lists and tuples work as well as NumPy arrays, and the standard
        deviation is always NumPy's default (``ddof=0``) regardless of the
        container that was passed in.
    """
    scores = np.asarray(scores)
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Std.dev:", scores.std())

In [36]:
from sklearn.model_selection import cross_val_score

In [38]:
# 10-fold cross-validation of the linear model. The scorer yields negated MSE values,
# so the sign is flipped before taking the square root to obtain one RMSE per fold.
# NOTE(review): when the cells run in order, `X_prepared` and `y` hold the full dataset
# at this point (rebuilt in an earlier cell), not the training split — confirm this is intended.
scores = cross_val_score(linear_regration , X_prepared, y, scoring="neg_mean_squared_error", cv=10)
LR_rmse_scores = np.sqrt(-scores)

In [40]:
# Print the per-fold RMSE values with their mean and standard deviation.
display_scores(LR_rmse_scores)

Scores: [84188.51219065 61197.24357613 86752.24346334 62289.14292385
 80540.40041898 68919.39949642 52503.82940087 90910.07884989
 77674.67507925 53941.60539478]
Mean: 71891.71307941683
Std.dev: 13249.525989444985
