<a href="https://colab.research.google.com/github/UmiraOzawa/data_science_chay_cmn_nha/blob/main/hand_on_ml2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import tensorflow.keras.layers as tfl
import numpy as np
import pandas as pd
import sklearn.neighbors as skn
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

# Positional indices of the feature-matrix columns that
# CombinedAttributesAdder.transform reads via X[:, <index>].
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing columns of a feature matrix.

    Always adds rooms-per-household and population-per-household; optionally
    adds bedrooms-per-room. The source columns are located through the
    module-level positional indices ``rooms_ix``, ``bedrooms_ix``,
    ``population_ix`` and ``households_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        Whether to also append the bedrooms-per-room ratio column.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this transformer learns no state."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        Parameters
        ----------
        X : array-like, 2-D
            Feature matrix containing the columns addressed by the
            module-level ``*_ix`` indices. DataFrames and nested lists are
            accepted and converted to an ndarray.

        Returns
        -------
        numpy.ndarray
            ``X`` followed by rooms-per-household, population-per-household
            and, when ``add_bedrooms_per_room`` is true, bedrooms-per-room.
        """
        # X[:, i] positional slicing only works on ndarrays; coerce first so a
        # DataFrame (or list of rows) does not raise. No copy is made when X
        # is already an ndarray.
        X = np.asarray(X)
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]
        
training_data = pd.read_csv("/content/sample_data/california_housing_train.csv")

# Separate the feature columns from the regression target.
housing = training_data.drop("median_house_value", axis=1)
housing_value = training_data[["median_house_value"]].copy()

# isnull() marks every missing cell as True; any(axis=1) is True for a row when
# at least one of its cells is marked, and False otherwise.
sample_incomplete_rows = housing[housing.isnull().any(axis=1)]
print(sample_incomplete_rows)

# Fill missing cells in every column with that column's median using sklearn's
# SimpleImputer. Rows are deliberately not dropped: dropna() returns a new
# frame (so calling it without assignment does nothing), and dropping rows
# from `housing` alone would leave it misaligned with `housing_value`.
imputer = SimpleImputer(strategy="median")
imputer.fit(housing)
X = imputer.transform(housing)
housing = pd.DataFrame(X, columns=housing.columns, index=housing.index)

# Preprocessing applied before the model: impute -> add ratio features -> scale.
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

housing_tr = num_pipeline.fit_transform(housing)
print(housing.corr())
model = KNeighborsRegressor(n_neighbors=7)
model.fit(housing_tr, housing_value)

Empty DataFrame
Columns: [longitude, latitude, housing_median_age, total_rooms, total_bedrooms, population, households, median_income]
Index: []
                    longitude  latitude  ...  households  median_income
longitude            1.000000 -0.925208  ...    0.059628      -0.015485
latitude            -0.925208  1.000000  ...   -0.074902      -0.080303
housing_median_age  -0.114250  0.016454  ...   -0.302754      -0.115932
total_rooms          0.047010 -0.038773  ...    0.919018       0.195383
total_bedrooms       0.071802 -0.069373  ...    0.980920      -0.013495
population           0.101674 -0.111261  ...    0.909247      -0.000638
households           0.059628 -0.074902  ...    1.000000       0.007644
median_income       -0.015485 -0.080303  ...    0.007644       1.000000

[8 rows x 8 columns]


KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                    weights='uniform')

In [None]:
test_data = pd.read_csv("/content/sample_data/california_housing_test.csv")

# Separate the feature columns from the regression target.
housing_test = test_data.drop("median_house_value", axis=1)
housing_value_test = test_data[["median_house_value"]].copy()

# Apply the preprocessing that was fitted on the training data. Using
# fit_transform here would re-learn the imputer medians and scaler mean/std
# from the test set, so the test features would be scaled differently from the
# features the model was trained on (and the fitted pipeline would be
# overwritten).
housing_tr_test = num_pipeline.transform(housing_test)

print(model.predict(housing_tr_test))
housing_value_test

[[387785.85714286]
 [228600.        ]
 [280100.        ]
 ...
 [ 81271.42857143]
 [167285.71428571]
 [496029.42857143]]


Unnamed: 0,median_house_value
0,344700.0
1,176500.0
2,270500.0
3,330000.0
4,81700.0
...,...
2995,225000.0
2996,237200.0
2997,62000.0
2998,162500.0
