## Boston House prices

In [2]:
import pandas as pd, matplotlib.pyplot as plt, numpy as np

In [125]:
# Read the housing dataset from its CSV file into a DataFrame.
housing = pd.read_csv(filepath_or_buffer="../input/real-estate-dataset/data.csv")

## General data info

In [126]:
# Preview the first rows of the freshly loaded table.
housing.head()

In [127]:
# Column dtypes and non-null counts; a quick way to spot columns with missing values.
housing.info()

In [128]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the table's columns.
housing.describe()

In [129]:
%matplotlib inline
housing.hist(bins=40, figsize=(20, 20))

## Train-Test Splitting

In [130]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
train_set, test_set = train_test_split(
    housing,
    test_size=0.2,
    random_state=42,
)
print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}\n")

In [131]:
# Same dtype / non-null overview as before, now restricted to the training split.
train_set.info()

In [132]:
# NOTE: rebinds `housing` — from this cell on it is a copy of the training split,
# not the full table loaded from the CSV.
housing = train_set.copy()

## Correlation analysis

In [133]:
# Pairwise correlations between the training columns, then each column's
# correlation with the target MEDV, listed from highest to lowest.
corr_matrix = housing.corr()
corr_matrix.loc[:, 'MEDV'].sort_values(ascending=False)

In [134]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots for the target and a handful of selected columns.
attributes = ["MEDV", "RM", "B", "LSTAT", "TAX"]
scatter_matrix(frame=housing[attributes], figsize=(16, 12))

In [135]:
# Separate predictors from the MEDV target for both splits.
# `housing` is rebound again here: it now holds the training features only.
housing = train_set.drop(columns="MEDV")
housing_labels = train_set["MEDV"].copy()
housing_test = test_set.drop(columns="MEDV")
housing_test_labels = test_set["MEDV"].copy()

## Handling missing data and scaling

In [136]:
# Non-null counts of the training features (MEDV already dropped) — shows which
# columns, if any, need imputation.
housing.info()

In [137]:
# SimpleImputer was used below without ever being imported, which raises a
# NameError on a fresh kernel; import it alongside the other pipeline pieces.
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Preprocessing pipeline:
#   1. fill missing values with each column's median,
#   2. standardize every column (zero mean, unit variance).
my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

In [138]:
# Learn the imputation medians and scaling statistics from the training
# features only, then apply them to the training features.
housing_transformed = my_pipeline.fit_transform(housing)
# The test features must be transformed with the statistics learned from the
# training data. Calling fit_transform here would refit the pipeline on the
# test set, leaking test information and scaling the test rows differently
# from the rows the model is trained on.
housing_test_trasnformed = my_pipeline.transform(housing_test)

## Simple model validation

In [139]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# LinearRegression() and DecisionTreeRegressor() are imported as alternative
# estimators that can be swapped in on the line below for comparison.
# The forest is seeded so that the fitted model, its RMSE and the
# cross-validation scores are reproducible from run to run.
model = RandomForestRegressor(random_state=42)
model.fit(housing_transformed, housing_labels)

In [140]:
from sklearn.metrics import mean_squared_error

# Predict on the transformed hold-out features and report the RMSE
# (square root of the mean squared error) as the cell's output.
housing_predictions = model.predict(housing_test_trasnformed)
mse = mean_squared_error(y_true=housing_test_labels, y_pred=housing_predictions)
np.sqrt(mse)

## Cross Validation

In [141]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the transformed training data. The scorer returns
# negated MSE values, so flip the sign before taking the root to get RMSE per fold.
scores = cross_val_score(
    estimator=model,
    X=housing_transformed,
    y=housing_labels,
    scoring="neg_mean_squared_error",
    cv=5,
)
rmse_scores = np.sqrt(np.negative(scores))

In [142]:
def print_scores(scores):
    """Print a short report for an array of scores.

    Writes three lines to stdout: the raw scores, their mean, and their
    standard deviation. `scores` must provide `.mean()` and `.std()`
    (e.g. a NumPy array). Returns None.
    """
    print("Scores:", scores)
    average = scores.mean()
    print("Mean: ", average)
    spread = scores.std()
    print("Standard deviation: ", spread)
# Report the per-fold RMSE values together with their mean and standard deviation.
print_scores(rmse_scores)

## Saving the model

In [143]:
from joblib import dump, load

# Persist the fitted estimator to disk. Only `model` is written here — the
# preprocessing pipeline is not part of the saved file.
dump(value=model, filename='Model.joblib')

## Testing the model on test data

In [144]:
# `strat_test_set` is never defined in this notebook; the held-out split
# produced by train_test_split is `test_set`, so evaluate on that.
X_test = test_set.drop("MEDV", axis=1)
Y_test = test_set["MEDV"].copy()
# transform (not fit_transform): reuse the preprocessing already fitted.
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
# Show predictions next to the true target values for a quick visual comparison.
print(final_predictions, list(Y_test))

In [147]:
# Display the RMSE computed on the held-out test set.
final_rmse

In [146]:
# `prepared_data` is not defined anywhere in this notebook. Show the first
# preprocessed training row instead, as an example of the feature vector
# format the model is fitted on.
housing_transformed[0]

## Using the model

In [149]:
from joblib import dump, load
import numpy as np
# Reload the estimator saved to 'Model.joblib' (this rebinds `model`).
model = load('Model.joblib') 
# A single sample as a 2-D array of shape (1, 13): one row, thirteen feature values.
# NOTE(review): the saved model was fitted on pipeline-transformed features, so
# these values are presumably already imputed/scaled — confirm before reusing
# this cell with raw feature values.
features = np.array([[-5.43942006, 4.12628155, -1.6165014, -0.67288841, -1.42262747,
       -11.44443979304, -49.31238772,  7.61111401, -26.0016879 , -0.5778192 ,
       -0.97491834,  0.41164221, -66.86091034]])
# Predict the target for that one sample.
model.predict(features)