<a href="https://colab.research.google.com/github/Sakib0626Q2/AI-ML-Project-real-estate-Boston-pricing/blob/main/RealEstateHousingBoston.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics as metrics
from joblib import dump, load
from pandas.plotting import scatter_matrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Load the data set from a CSV file located next to the notebook and
# preview its first rows.
housing = pd.read_csv("./HousingBoston.csv")
housing.head()

In [None]:
# Preview the last rows of the frame.
housing.tail()

In [None]:
# Column names, dtypes and non-null counts (a quick check for missing values).
housing.info()

In [None]:
# Summary statistics for each numeric column.
housing.describe()

In [None]:
# How often each distinct CHAS value occurs; CHAS is the column used for
# stratification later in the notebook.
housing['CHAS'].value_counts()

In [None]:
# How often each distinct CRIM value occurs.
housing['CRIM'].value_counts()

In [None]:
# One histogram per numeric column, 50 bins each, on a 20x15 figure.
housing.hist(bins=50, figsize = (20, 15))

## Train-Test Splitting

In [2]:
# for learning purpose

def split_train_test(data, test_ratio, seed=42):
    """Randomly partition ``data`` into a training set and a test set.

    Hand-rolled shuffled hold-out split, kept for learning purposes.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split. Rows are selected by position (``.iloc``), so any
        index is supported.
    test_ratio : float
        Fraction of rows, between 0 and 1 inclusive, placed in the test set.
        The test-set size is ``int(len(data) * test_ratio)`` (rounded down).
    seed : int, optional
        Seed for the shuffle. The default of 42 yields the same permutation
        as seeding NumPy's global generator with 42.

    Returns
    -------
    tuple
        ``(train_set, test_set)`` as two DataFrames.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    # A private generator keeps the split reproducible without reseeding
    # NumPy's global RNG, which would silently affect every later random draw.
    rng = np.random.RandomState(seed)
    shuffled = rng.permutation(len(data))

    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [3]:
# train_set, test_set = split_train_test(housing, 0.2)

In [4]:
# print(f"Rows in train set: {len(train_set)} \nRows in test set: {len(test_set)}")

In [5]:
# Hold out 20% of the rows using scikit-learn's splitter; random_state pins
# the shuffle so the split is the same on every run.
train_set, test_set = train_test_split(housing, test_size = 0.2, random_state = 42)
print(f"Rows in train set: {len(train_set)} \nRows in test set: {len(test_set)}")

NameError: name 'train_test_split' is not defined

In [None]:
# Stratified 80/20 split on CHAS so both sets keep the same CHAS proportions.
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)

# n_splits is 1, so take the single (train, test) pair directly.
train_index, test_index = next(split.split(housing, housing['CHAS']))

# split() yields *positional* row numbers, so select with .iloc. Label-based
# .loc only gives the same rows while housing has a default 0..n-1 index.
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

In [None]:
# CHAS value counts in the stratified training set.
strat_train_set['CHAS'].value_counts()

In [None]:
# CHAS value counts in the stratified test set.
strat_test_set['CHAS'].value_counts()

In [None]:
# NOTE(review): hard-coded numbers — presumably the two CHAS counts from the
# training-set value_counts cell; confirm against its output.
376/28

In [None]:
# NOTE(review): hard-coded numbers — presumably the two CHAS counts from the
# test-set value_counts cell; confirm against its output.
95/7

In [None]:
# From here on, explore a copy of the training set only; the copy keeps
# strat_train_set itself unmodified by the exploration below.
housing = strat_train_set.copy()

## Looking for Correlations

In [None]:
# Pairwise correlations between all columns, then the correlations with
# MEDV listed from highest to lowest.
corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False)

In [None]:
# The same MEDV correlations, in original column order (unsorted).
corr_matrix['MEDV']

In [None]:
# Pairwise scatter plots for a hand-picked subset of columns.
attributes = ["MEDV", "RM", "ZN", "LSTAT"]
scatter_matrix(housing[attributes], figsize = (12, 8))

In [None]:
# RM against MEDV; the low alpha keeps overlapping points distinguishable.
housing.plot(kind = "scatter", x = "RM", y = "MEDV", alpha = 0.4)

### Attribute Combinations

In [None]:
# Derived column: TAX divided by RM, computed row by row.
housing["TAXRM"] = housing['TAX']/housing['RM']
housing["TAXRM"]

In [None]:
# Preview the frame again to see the newly added TAXRM column.
housing.head()

In [6]:
# Recompute the correlation matrix now that TAXRM exists, and list the
# correlations with MEDV from highest to lowest.
corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False)

NameError: name 'housing' is not defined

In [None]:
# TAXRM against MEDV.
housing.plot(kind = "scatter", x = "TAXRM", y = "MEDV", alpha = 0.4)

In [None]:
# Separate features from the target. housing is rebuilt from
# strat_train_set here, so the exploratory TAXRM column (added only to the
# earlier copy) is not part of the feature set.
housing = strat_train_set.drop("MEDV", axis=1)
housing_labels = strat_train_set["MEDV"].copy()

In [None]:
# Missing-value option: drop the rows where RM is missing. The result goes
# into a scratch variable, so housing itself is unchanged.
a = housing.dropna(subset = ["RM"])
a.shape

In [None]:
# Median of the RM column, used as the fill value for missing entries.
median = housing["RM"].median()
median

In [None]:
# Inspect the RM column.
housing["RM"]

In [None]:
# Missing-value option: fill missing RM entries with the median. The filled
# Series is only displayed — it is not assigned back, so housing is unchanged.
housing["RM"].fillna(median)

In [None]:
# (rows, columns) of the feature frame.
housing.shape

In [7]:
# Summary statistics of the feature frame before imputation.
housing.describe()

NameError: name 'housing' is not defined

In [None]:
# Learn the median of every feature column so missing values can be filled
# consistently later.
imputer = SimpleImputer(strategy="median")
imputer.fit(housing)

In [None]:
# The fill values (medians) learned by the imputer.
imputer.statistics_

In [None]:
# Shape of the learned fill-value array.
imputer.statistics_.shape

In [None]:
# Fill the missing values in housing with the medians learned above.
X = imputer.transform(housing)

In [None]:
# Wrap the imputed values back into a DataFrame. The original row labels are
# passed along with the column names: housing holds a sampled subset of rows,
# so without index= the new frame would be renumbered 0..n-1 and would no
# longer line up with housing / housing_labels in label-based operations.
housing_tr = pd.DataFrame(X, columns=housing.columns, index=housing.index)

In [None]:
# Summary statistics after imputation.
housing_tr.describe()

## Scikit-learn Design

### Creating a Pipeline

In [None]:
# Preprocessing pipeline: fill missing values with the column median, then
# standardize each feature.
my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

In [None]:
# Fit the pipeline and transform the training features in one step.
# NOTE(review): this fits on housing_tr, which was already imputed above;
# fitting on housing directly looks equivalent — confirm.
housing_num_tr = my_pipeline.fit_transform(housing_tr)

In [None]:
# Inspect the prepared training features.
housing_num_tr

### Selecting a desired model

In [None]:
# Candidate models; swap the active line to compare them.
# model = LinearRegression()
# model = DecisionTreeRegressor()
# random_state pins the forest's bootstrap sampling and feature selection so
# the fitted model and every score below are reproducible across runs; 42
# matches the seed used for the data splits.
model = RandomForestRegressor(random_state=42)
model.fit(housing_num_tr, housing_labels)

In [None]:
# First five training rows (raw, unprepared) for a quick spot check.
some_data = housing.iloc[:5]

In [None]:
# The matching target values for those five rows.
some_labels = housing_labels.iloc[:5]

In [None]:
# Apply the already-fitted pipeline (transform only, no refit).
prepared_data = my_pipeline.transform(some_data)

In [None]:
# Model predictions for the five sample rows.
model.predict(prepared_data)

In [None]:
# Actual target values, for comparison with the predictions above.
list(some_labels)

In [None]:
# RMSE on the training data. These are the same rows the model was fitted
# on, so this measures fit, not generalization.
housing_predictions = model.predict(housing_num_tr)
mse = metrics.mean_squared_error(housing_labels, housing_predictions)
rmse = np.sqrt(mse)

In [None]:
# Training-set RMSE.
rmse

## Using a better evaluation technique - Cross Validation

In [None]:
# 10-fold cross-validation on the prepared training data. The scorer returns
# *negative* MSE (scikit-learn scorers follow "higher is better"), so negate
# before taking the square root to get one RMSE per fold.
# cross_val_score is imported with the other imports in the top cell.
scores = cross_val_score(model, housing_num_tr, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)


In [None]:
# Per-fold RMSE values.
rmse_scores

In [None]:
def print_scores(scores):
    """Print the scores followed by their mean and standard deviation.

    ``scores`` must provide ``.mean()`` and ``.std()`` (e.g. a NumPy array).
    Nothing is returned; the three lines are written to standard output.
    """
    # Each entry is (label, getter); getters run lazily, one line at a time.
    report = (
        ("Scores:", lambda values: values),
        ("Mean:", lambda values: values.mean()),
        ("Standard deviation:", lambda values: values.std()),
    )
    for label, getter in report:
        print(label, getter(scores))

In [None]:
# Summarize the cross-validation RMSE values.
print_scores(rmse_scores)

## Saving The Model

In [None]:
# Persist the fitted model to disk.
# NOTE(review): only the model is saved; my_pipeline (imputer + scaler) is
# not, so whoever loads this file needs the fitted pipeline from elsewhere to
# prepare inputs the same way — confirm this is intended.
dump(model, 'realestate_model.joblib')

## Testing the Model on test data

In [None]:
# Final evaluation on the held-out stratified test set.
# Split the test set into features and target, as was done for training.
X_test = strat_test_set.drop("MEDV", axis=1)
Y_test = strat_test_set["MEDV"].copy()
# transform only: the pipeline keeps the statistics it was fitted with earlier.
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)
final_mse = metrics.mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
# Show predictions next to the true values for a visual comparison.
print(final_predictions, list(Y_test))

In [None]:
# RMSE on the test set.
final_rmse

In [1]:
# First row of the prepared sample data; prepared_data is created in the
# model-selection section, so that cell must have run first.
prepared_data[0]

NameError: name 'prepared_data' is not defined