In [25]:
import pandas as pd

In [26]:
import numpy as np

In [27]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [28]:
from sklearn.impute import SimpleImputer

In [29]:
from sklearn.model_selection import StratifiedShuffleSplit

In [30]:
from sklearn.pipeline import Pipeline

In [31]:
from sklearn.compose import ColumnTransformer

In [32]:
import os

# Location of the housing CSV. Set the HOUSING_CSV environment variable to
# point at the file on another machine; when it is unset, the original
# author's local path is used so existing behaviour is unchanged.
HOUSING_CSV=os.environ.get("HOUSING_CSV", r"C:\Users\sarth\Downloads\AI\housing.csv")
df=pd.read_csv(HOUSING_CSV)

In [33]:
# Bucket median_income into five labelled bands (1-5). The resulting column
# is the key used to stratify the train/test split.
df["income_cat"] = pd.cut(
    x=df["median_income"],
    bins=[0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5],
)


In [37]:
# Single stratified shuffle: 80% train / 20% test, seeded for reproducibility.
split = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=42,
)

In [38]:
# The splitter yields *positional* row indices, so rows are selected with
# .iloc; label-based .loc is only equivalent while df keeps its default
# 0..n-1 index. Only the first (train, test) pair is needed, so take it
# with next() instead of looping.
train_index,test_index=next(split.split(df, df['income_cat']))
# income_cat exists only to drive stratification; remove it from both sets.
strat_train_set=df.iloc[train_index].drop("income_cat",axis=1)
strat_test_set=df.iloc[test_index].drop("income_cat",axis=1)

In [63]:
# Work on an independent (deep) copy so the stratified training set itself
# is left untouched by later steps.
housing = strat_train_set.copy(deep=True)

In [65]:
# Feature frame: the training set without the target column. Built from
# strat_train_set (drop returns a new frame) rather than from `housing`
# itself, so re-running this cell never fails on an already-dropped column.
housing=strat_train_set.drop("median_house_value", axis=1)

In [64]:
# Target vector. Read from strat_train_set, not from `housing`: by this
# point in the notebook `housing` has had median_house_value dropped, so
# indexing it would raise KeyError on a top-to-bottom run. Both frames hold
# the same rows, so features and labels stay aligned.
housing_label=strat_train_set["median_house_value"].copy()

In [67]:
# Every column except the categorical ocean_proximity is handled by the
# numeric pipeline.
# NOTE: the variable name is misspelled ("attribures"); it is kept as-is
# because the ColumnTransformer cell refers to it by this name.
num_attribures = housing.drop(columns=["ocean_proximity"]).columns.tolist()

In [68]:
# Columns routed to the categorical (one-hot) pipeline.
cat_attributes = ["ocean_proximity"]

In [70]:
# Numeric preprocessing: fill missing values with the column median, then
# standardise each feature.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("standardization", StandardScaler()),
    ]
)

In [71]:
# Categorical preprocessing: one-hot encode; categories not seen during
# fitting are ignored at transform time instead of raising.
cat_pipeline = Pipeline(
    steps=[
        ("OnehotEncoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [74]:
# Route numeric columns through num_pipeline and categorical columns through
# cat_pipeline; the outputs are concatenated in that order.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribures),
        ("cat", cat_pipeline, cat_attributes),
    ]
)

In [76]:
# Fit the preprocessing on the training features and transform them in one go.
housing_prepared = full_pipeline.fit_transform(X=housing)

In [77]:
# Sanity check: show the (rows, columns) shape of the transformed training matrix.
print(housing_prepared.shape)

(16512, 13)


In [82]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [84]:
# Baseline model: ordinary least-squares regression on the prepared features.
lm = LinearRegression()
lm.fit(X=housing_prepared, y=housing_label)

In [89]:
# Decision tree regressor, seeded so the fitted tree is reproducible.
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X=housing_prepared, y=housing_label)

In [91]:
# Random forest regressor, seeded so the fitted ensemble is reproducible.
rf = RandomForestRegressor(random_state=42)
rf.fit(X=housing_prepared, y=housing_label)

In [92]:
# Predict on the same prepared training data each model was fitted on
# (these are in-sample predictions, not a held-out evaluation).
lm_prediction, dt_prediction, rf_prediction = (
    model.predict(housing_prepared) for model in (lm, dt, rf)
)

In [102]:
# Training-set RMSE for each model. The root is taken explicitly with
# np.sqrt instead of passing `squared=False`, which is deprecated in recent
# scikit-learn releases; sqrt(MSE) gives the same value on every version.
# Arguments follow the documented (y_true, y_pred) order throughout.
lm_error=np.sqrt(mean_squared_error(housing_label, lm_prediction))
dt_error=np.sqrt(mean_squared_error(housing_label, dt_prediction))
rf_error=np.sqrt(mean_squared_error(housing_label, rf_prediction))




In [103]:
# Report the linear model's RMSE on the training data it was fitted on.
print(lm_error)

69050.56219504567


In [110]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated RMSE for the decision tree. scikit-learn scorers
# follow a "higher is better" convention, so the scorer returns *negative*
# RMSE; negate it to get positive error values, one per fold.
# NOTE(review): housing_prepared was produced by fitting the preprocessing
# on the whole training set, so imputer/scaler statistics are shared across
# folds; consider cross-validating a Pipeline that includes full_pipeline.
cross_val_rmse=-cross_val_score(
    dt,
    housing_prepared,
    housing_label,
    scoring="neg_root_mean_squared_error",
    cv=10
)
print(cross_val_rmse)

In [111]:
# Summarise the per-fold RMSE values (count, mean, std, min/max, quartiles).
print(pd.Series(cross_val_rmse).describe())

count       10.000000
mean     69081.361563
std       2420.500173
min      64770.563939
25%      67525.053996
50%      69027.994020
75%      70675.556581
max      73280.387324
dtype: float64
