In [29]:
import pandas as pd
import numpy as np
import sklearn

In [30]:
# Load the housing dataset from 'housing.csv' (path is relative to the working directory).
df= pd.read_csv('housing.csv')

In [31]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# Split the training rows into the target column (labels) and the remaining features.
y = train_set["median_house_value"].copy()
X_train = train_set.drop(columns=["median_house_value"])

# Training features without the 'ocean_proximity' column.
X_num = X_train.drop(columns=["ocean_proximity"])

In [32]:
from sklearn.base import BaseEstimator, TransformerMixin
# Positional indices of the columns the transformer reads from its input.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing columns.

    The columns are addressed by position using the module-level indices
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``,
    so the input must keep that column order.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        When True, a third column ``bedrooms / rooms`` is appended in
        addition to the two per-household ratios.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` unchanged; this transformer learns nothing from the data."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        Parameters
        ----------
        X : array-like with at least 7 columns
            A NumPy array or anything ``np.asarray`` can convert
            (for example a numeric pandas DataFrame).

        Returns
        -------
        numpy.ndarray
            ``X`` followed by rooms-per-household and
            population-per-household, plus bedrooms-per-room when
            ``add_bedrooms_per_room`` is True.
        """
        # Positional slicing below (X[:, i]) is not supported by a DataFrame;
        # coercing first makes both input kinds work. An ndarray passes
        # through np.asarray unchanged.
        X = np.asarray(X)
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:  # the bedrooms_per_room column is optional
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric preprocessing, applied in order:
#   1. fill missing values with the column median,
#   2. append the ratio features from CombinedAttributesAdder,
#   3. standardize every column.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('attribs_adder', CombinedAttributesAdder(add_bedrooms_per_room=True)),
        ('std_scaler', StandardScaler()),
    ]
)

In [34]:
from sklearn.compose import ColumnTransformer

# Column names of the numeric features: X_num is the training feature frame
# with 'ocean_proximity' removed, so iterating it yields the numeric column names.
num_attribs = list(X_num)
cat_attribs = ['ocean_proximity']

# Send the numeric columns through num_pipeline and one-hot encode the
# categorical column; the two outputs are concatenated side by side.
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])

In [35]:
# Fit the preprocessing pipeline on the training features and transform them in one step.
X_prepared = full_pipeline.fit_transform(X_train)

In [36]:
# Display the prepared training matrix (scaled numeric columns followed by one-hot columns).
X_prepared

array([[ 1.27258656, -1.3728112 ,  0.34849025, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.70916212, -0.87669601,  1.61811813, ...,  0.        ,
         0.        ,  1.        ],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ...,  0.        ,
         0.        ,  0.        ],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.        ,
         0.        ,  0.        ],
       [-1.41489815,  0.99543676,  1.85617335, ...,  0.        ,
         1.        ,  0.        ]])

In [37]:
from sklearn.linear_model import LinearRegression
# Create an unfitted ordinary least-squares regression model with default settings.
linear_regration = LinearRegression()

In [38]:
# Train the model on the prepared training features and their labels.
linear_regration.fit(X_prepared,y)

In [42]:
# Pick 10 random rows to spot-check predictions.
# NOTE(review): the rows are drawn from X_train, i.e. data the model was fitted on,
# so this is a sanity check rather than an evaluation on unseen data. No random_state
# is passed, so a different sample is drawn on every run.
test_data= X_train.sample(10)
test_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
6540,-118.04,34.04,32.0,1619.0,323.0,1492.0,342.0,3.5,<1H OCEAN
6073,-117.85,34.07,32.0,761.0,101.0,295.0,95.0,11.1077,<1H OCEAN
14241,-117.09,32.69,18.0,1645.0,430.0,1221.0,410.0,1.3269,NEAR OCEAN
13521,-117.41,34.24,20.0,1160.0,181.0,543.0,188.0,5.2072,INLAND
2218,-119.82,36.83,14.0,2982.0,412.0,1408.0,423.0,5.3241,INLAND
18493,-121.59,37.02,14.0,6355.0,1279.0,3704.0,1224.0,4.4233,INLAND
8120,-118.18,33.79,42.0,1571.0,435.0,1631.0,417.0,1.6384,NEAR OCEAN
16692,-120.61,35.12,12.0,3430.0,793.0,1840.0,720.0,2.9821,<1H OCEAN
15053,-116.83,32.83,6.0,3123.0,495.0,1513.0,480.0,5.4288,<1H OCEAN
6995,-118.03,33.96,37.0,1180.0,256.0,614.0,242.0,3.117,<1H OCEAN


In [43]:
# Look up the true target values for the sampled rows by their index labels.
test_label = y.loc[test_data.index]
test_label

6540     165100.0
6073     500001.0
14241    108000.0
13521    164300.0
2218     123000.0
18493    228600.0
8120     128000.0
16692    162000.0
15053    167800.0
6995     164600.0
Name: median_house_value, dtype: float64

In [45]:
# Apply the already-fitted preprocessing (transform only, no refit) to the sampled rows,
# then predict their target values with the trained model.
test_data_prepared = full_pipeline.transform(test_data)
predicted_labels= linear_regration.predict(test_data_prepared)

In [46]:
# Display the predicted values for the sampled rows.
predicted_labels

array([174802.27719093, 492583.82369964, 116367.2091945 , 185457.07387818,
       177662.35950949, 208408.22741545, 145257.23261809, 220174.38792551,
       237429.58531261, 194942.2487302 ])

In [48]:
# Side-by-side comparison of predictions and true values.
# Column labels are Uzbek: 'Bashorat' = prediction, 'Asl qiymat' = actual value.
# The frame takes its index from test_label; predicted_labels is matched by position.
pd.DataFrame({'Bashorat':predicted_labels,'Asl qiymat':test_label})

Unnamed: 0,Bashorat,Asl qiymat
6540,174802.277191,165100.0
6073,492583.8237,500001.0
14241,116367.209194,108000.0
13521,185457.073878,164300.0
2218,177662.359509,123000.0
18493,208408.227415,228600.0
8120,145257.232618,128000.0
16692,220174.387926,162000.0
15053,237429.585313,167800.0
6995,194942.24873,164600.0
