In [173]:
import numpy as np
import pandas as pd
from scipy.stats import zscore

In [174]:
# Read the housing data from a CSV file located in the working directory.
df = pd.read_csv('housing.csv')

# Print column dtypes and non-null counts to spot columns with missing values.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [175]:
# Preview the first rows of the raw data.
df.head()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [176]:
# Numeric predictor columns (the target, median_house_value, is excluded).
num_features = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]

# Categorical predictor columns.
cat_features = ['ocean_proximity']


In [177]:
# Remove outliers: for each numeric feature in turn, keep only the rows whose
# z-score (population std, ddof=0) has an absolute value below 3.
# The filter is cumulative: each feature's mean/std are computed on the rows
# that survived the filters of the previous features.
# NOTE: rows where the feature is NaN are dropped too, because a NaN z-score
# compares False against the threshold. This removes every row with a missing
# total_bedrooms value, so no missing values remain afterwards.
for col in num_features:
    col_z = (df[col] - df[col].mean()) / df[col].std(ddof=0)
    df = df[np.abs(col_z) < 3]

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18897 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           18897 non-null  float64
 1   latitude            18897 non-null  float64
 2   housing_median_age  18897 non-null  float64
 3   total_rooms         18897 non-null  float64
 4   total_bedrooms      18897 non-null  float64
 5   population          18897 non-null  float64
 6   households          18897 non-null  float64
 7   median_income       18897 non-null  float64
 8   median_house_value  18897 non-null  float64
 9   ocean_proximity     18897 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [178]:
# Separate the regression target (median_house_value) from the predictors.
y = df['median_house_value']
X = df.drop(columns=['median_house_value'])

# Confirm that only the predictor columns remain in X.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18897 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           18897 non-null  float64
 1   latitude            18897 non-null  float64
 2   housing_median_age  18897 non-null  float64
 3   total_rooms         18897 non-null  float64
 4   total_bedrooms      18897 non-null  float64
 5   population          18897 non-null  float64
 6   households          18897 non-null  float64
 7   median_income       18897 non-null  float64
 8   ocean_proximity     18897 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.4+ MB


In [179]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=24,
)

# Show (rows, columns) of the training and test predictors, one per line.
print(X_train.shape, X_test.shape, sep='\n')


(17007, 9)
(1890, 9)


In [180]:


from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric columns: fill missing values with the column mean, then standardise.
num_pipeline = Pipeline(
    steps=[
        ('num_imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' encodes a category that was
# absent from the training split as all zeros instead of raising at
# transform/predict time.
cat_pipeline = Pipeline(
    steps=[
        ('cat_imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each group of columns through its own preprocessing pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_features),
        ('cat_pipeline', cat_pipeline, cat_features),
    ]
)

# Full model: preprocessing followed by an ordinary least-squares regression.
lin_reg_full_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('lin_reg', LinearRegression()),
    ]
)

In [181]:
# Fit the preprocessing steps and the linear regression on the training split only.
lin_reg_full_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num_pipeline',
                                                  Pipeline(memory=None,
                                                           steps=[('num_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                  

In [182]:
# Predict the median house value for every row of the held-out test split.
housing_y_pred = lin_reg_full_pipeline.predict(X_test)
# Display the predictions array as the cell output.
housing_y_pred

array([267726.72991911, 118672.65418296, 126175.21070172, ...,
       265179.5831107 ,  29228.74257399, 151195.90659638])

In [183]:
from sklearn.metrics import mean_squared_error

# Mean squared error between the true test targets and the predictions.
lin_mse = mean_squared_error(y_true=y_test, y_pred=housing_y_pred)

# The square root brings the error back to the unit of the target.
lin_rmse = np.sqrt(lin_mse)
lin_rmse

64906.60095037361

The MSE (mean squared error) is the mean of the squared errors of my predictions: I take the difference between each predicted value and the corresponding real value (y_test), square each difference, and then take the mean of those squares.
The RMSE is just the square root of the MSE, which makes sense because the differences were squared before; taking the root brings the error back to the same unit as the target.
This means that, on average, the error here is about 64,907 dollars, which is pretty high.