In [1]:
import pandas as pd

In [2]:
# Load the housing table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='housing.csv')

In [3]:
# Inspect column dtypes and non-null counts of the freshly loaded table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [4]:
# Fill the missing total_bedrooms entries with the column mean.
# NOTE(review): the mean is taken over the full table, before the train/test
# split further down, so held-out rows contribute to the fill value.
bedrooms_mean = df['total_bedrooms'].mean()
df['total_bedrooms'] = df['total_bedrooms'].fillna(bedrooms_mean)

In [5]:
# Re-inspect non-null counts after the total_bedrooms fill.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
import joblib

In [10]:
# Feature matrix: every column except the prediction target.
x = df.drop(columns='median_house_value')  # features

In [11]:
# Target vector: the median house value column.
y = df.loc[:, 'median_house_value']  # target

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Scaler applied to the numeric feature columns.
sc = StandardScaler()
# One-hot encoder applied to the categorical feature columns.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# an all-zero row instead of raising, so the test split and later calls to
# the saved model do not fail on a category missing from the training fold.
oe = OneHotEncoder(handle_unknown='ignore')

In [14]:
# Names of the float-typed feature columns (these get scaled).
numerical = x_train.select_dtypes(include=['float']).columns

In [15]:
# Names of the object-typed feature columns (these get one-hot encoded).
categorical = x_train.select_dtypes(include=['object']).columns

In [16]:
# Route each column group to its transformer: scaling for the numeric
# columns, one-hot encoding for the categorical ones.
column_steps = [
    ('num', sc, numerical),
    ('obj', oe, categorical),
]
processor = ColumnTransformer(transformers=column_steps)

In [17]:
# Chain preprocessing and the random forest into one estimator so the same
# transforms are applied at fit time and at predict time.
# random_state pins the forest's randomness so the fitted model and the
# reported error are reproducible across runs; 42 matches the split seed.
model = make_pipeline(processor, RandomForestRegressor(random_state=42))

In [18]:
# Fit the preprocessing + forest pipeline on the training split.
model.fit(x_train, y_train)

In [19]:
# Predict house values for the held-out test split.
preds = model.predict(x_test)

In [22]:
# Mean squared error of the test-split predictions against the true values.
mse = mean_squared_error(y_true=y_test, y_pred=preds)

In [23]:
import numpy as np

In [24]:
# The square root of the MSE is the root mean squared error, which is in the
# same units as the target, so label it as RMSE rather than MSE.
print('RMSE: ', np.sqrt(mse))

MSE:  49196.42453135371


In [25]:
# Persist the fitted pipeline (preprocessing + forest) to disk.
# Left as the last expression so the cell displays the returned file list.
joblib.dump(value=model, filename='house_pricing_forest.joblib')

['house_pricing_forest.joblib']

In [26]:
# Reload the pipeline saved above to confirm the artifact round-trips.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
load_model = joblib.load(filename='house_pricing_forest.joblib')

In [27]:
# Hand-written single sample used to smoke-test the reloaded pipeline.
# The keys are the feature columns of the CSV (every column except
# median_house_value), in the same order.
sample_block = {
    'longitude': -118.45,
    'latitude': 34.05,
    'housing_median_age': 25.0,
    'total_rooms': 3500.0,
    'total_bedrooms': 650.0,
    'population': 1800.0,
    'households': 600.0,
    'median_income': 5.5,
    'ocean_proximity': '<1H OCEAN',
}
# Wrap each scalar in a one-element list so the frame has exactly one row.
test_household_data = pd.DataFrame(
    {column: [value] for column, value in sample_block.items()}
)

In [28]:
# Score the hand-built sample with the pipeline reloaded from disk.
prediction = load_model.predict(test_household_data)

In [29]:
# Show the single predicted value for the sample.
# NOTE(review): 'Narx' looks like Uzbek for "price" — confirm; label left as-is.
print('Narx: ', prediction[0])

Narx:  428453.58
