# Dragon Real Estate - Price Predictor

In [252]:
import pandas as pd

In [253]:
housing = pd.read_csv("data.csv")

In [254]:
%matplotlib inline

In [255]:
import matplotlib.pyplot as plt

## Train Test Split

In [256]:
# import numpy as np

# def split_train_test(data, test_ratio):
#     """Shuffle the rows with a fixed seed and return (train, test) frames."""
#     np.random.seed(42)
#     shuffled = np.random.permutation(len(data))
#     test_set_size = int(len(data) * test_ratio)
#     test_indices = shuffled[:test_set_size]
#     train_indices = shuffled[test_set_size:]
#     # Index with test_indices (the shuffled row positions); indexing with
#     # test_set_size would return a single row rather than the test split.
#     return data.iloc[train_indices], data.iloc[test_indices]

In [257]:
# train_set , test_set = split_train_test(housing, 0.2)

In [258]:
from sklearn.model_selection import train_test_split

# Plain random hold-out: 20% of the rows go to the test set, with a fixed seed
# so the split is repeatable.
train_set, test_set = train_test_split(
    housing,
    test_size=0.2,
    random_state=42,
)

In [259]:
print(f"Rows in train set : {len(train_set)} \nRows in test set: {len(test_set)} ")

Rows in train set : 404 
Rows in test set: 102 


In [260]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratified 80/20 split keyed on the CHAS column, with a fixed seed.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(housing, housing["CHAS"]))

# split() yields positional row numbers, not index labels, so select with
# .iloc. Label-based .loc only gives the same rows while the frame still has
# its default 0..n-1 index.
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

In [261]:
housing = strat_train_set.copy()

## Looking for correlations

In [262]:
# from pandas.plotting import scatter_matrix
# attributes = ['MEDV','RM', 'ZN','LSTAT']
# scatter_matrix(housing[attributes], figsize=(12,8))

In [263]:
# housing.plot(kind="scatter", x ='RM', y='MEDV', alpha=0.8)

## Trying out Attribute combinations 

In [264]:
# Engineered feature for exploration: TAX divided by RM, row by row.
# NOTE(review): this column lands on the exploratory copy only; the training
# features are rebuilt from strat_train_set further down, which does not
# carry TAXRM.
housing["TAXRM"] = housing["TAX"].div(housing["RM"])
housing.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,TAXRM
254,0.04819,80.0,3.64,0,0.392,6.108,32.0,9.2203,1,315,16.4,392.89,6.57,21.9,51.571709
348,0.01501,80.0,2.01,0,0.435,6.635,29.7,8.344,4,280,17.0,390.94,5.99,24.5,42.200452
476,4.87141,0.0,18.1,0,0.614,6.484,93.6,2.3053,24,666,20.2,396.21,18.68,16.7,102.714374
321,0.18159,0.0,7.38,0,0.493,6.376,54.3,4.5404,5,287,19.6,396.9,6.87,23.1,45.012547
326,0.30347,0.0,7.38,0,0.493,6.312,28.9,5.4159,5,287,19.6,396.9,6.15,23.0,45.468948


In [265]:
# corr_matrix = housing.corr()
# corr_matrix["MEDV"].sort_values(ascending=False)

In [266]:
# Separate the stratified training frame into target and features. Both are
# derived from strat_train_set, so the TAXRM column that was added to the
# earlier exploratory copy is not part of the model inputs.
housing_labels = strat_train_set["MEDV"].copy()
housing = strat_train_set.drop(columns="MEDV")

## Missing Attributes


In [267]:
# To take care of missing attributes, you have three options:
    # 1. Get rid of the missing data points
    # 2. Get rid of the whole attribute
    # 3. Set the missing values to some value (0, mean, median)
# from sklearn.impute  import SimpleImputer
# imputer = SimpleImputer(strategy = "median")
# imputer.fit(housing)


In [268]:
# imputer.statistics_

In [269]:
# X = imputer.transform(housing)

In [270]:
# housing_tr = pd.DataFrame(X, columns=housing.columns)

In [271]:
# housing_tr.describe()

## Scikit-learn Design

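#### Primarily three types of objects: 1. Estimators 2. Transformers 3. Predictors
#### Estimators - estimate parameters from the data with fit(), e.g. an imputer
#### Transformers - transform the input with transform() and return the output
#### Predictors - make predictions with predict(), e.g. linear regression, k-nearest neighbours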

In [272]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Preprocessing chain, applied in order: median imputation of missing values,
# then standard scaling.
preprocessing_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scalar", StandardScaler()),
]
my_pipeline = Pipeline(steps=preprocessing_steps)

## Feature Scaling

In [273]:
housing_num = my_pipeline.fit_transform(housing)

In [274]:
# Only the last expression of a cell is displayed, so a bare `housing_num`
# above it had no effect; show the shape of the prepared matrix directly.
housing_num.shape

(404, 13)

#### Select and train model

In [275]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Candidate estimators; uncomment one of the alternatives to compare it.
# model = LinearRegression()
# model = DecisionTreeRegressor()

# A random forest is stochastic: without a seed every re-run trains a different
# forest and all metrics below shift. Use the same seed as the data splits so
# the notebook is reproducible under Restart & Run All.
model = RandomForestRegressor(random_state=42)
model.fit(housing_num, housing_labels)

RandomForestRegressor()

In [276]:
some_data = housing.iloc[:5]

In [277]:
some_labels = housing_labels.iloc[:5]

In [278]:
prepared_data = my_pipeline.transform(some_data)

In [279]:
model.predict(prepared_data)

array([22.267, 25.21 , 16.54 , 23.315, 23.554])

In [280]:
list(some_labels)

[21.9, 24.5, 16.7, 23.1, 23.0]

### Evaluating the model

In [281]:
import numpy as np
from sklearn.metrics import mean_squared_error

# In-sample error: predictions are made on housing_num, the same matrix the
# model was fitted on, so this RMSE is an optimistic estimate.
housing_predictions = model.predict(housing_num)
mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
rmse = np.sqrt(mse)

In [282]:
rmse

1.1851244900560836

### Cross Validation

In [283]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the prepared training data. The scorer returns
# negated MSE values, so flip the sign back before taking the square root to
# get one RMSE per fold.
scores = cross_val_score(
    model,
    housing_num,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
rmse_scores = np.sqrt(np.negative(scores))

In [284]:
rmse_scores

array([2.74320452, 2.71642412, 4.36863552, 2.54022754, 3.64696838,
       2.69230391, 4.87442396, 3.30528071, 3.20117555, 3.32318926])

In [285]:
def print_scores(scores):
    """Print the raw scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (for example a
    NumPy array). Nothing is returned; three report lines are written to
    standard output.
    """
    # Each entry pairs a label with a callable, so every statistic is computed
    # only when its own line is about to be printed.
    report = (
        ("scores :", lambda values: values),
        ("mean:", lambda values: values.mean()),
        ("Standard Deviation : ", lambda values: values.std()),
    )
    for label, compute in report:
        print(label, compute(scores))

In [286]:
print_scores(rmse_scores)

scores : [2.74320452 2.71642412 4.36863552 2.54022754 3.64696838 2.69230391
 4.87442396 3.30528071 3.20117555 3.32318926]
mean: 3.3411833480685935
Standard Deviation :  0.730183654072389


## Saving the model

In [287]:
# Persist the fitted model to 'Dragon.joblib' (path is relative to the working
# directory).
# NOTE(review): only `model` is written here; `my_pipeline` is not saved in the
# visible cells, yet the model was trained on pipeline-transformed features.
# Confirm the preprocessing is persisted or rebuilt wherever this file is loaded.
from joblib import dump, load
dump(model, 'Dragon.joblib')

['Dragon.joblib']

## Testing the model

In [295]:
# Final check on the held-out stratified test split: reuse the already-fitted
# pipeline (transform only, no refit) and the trained model.
Y_test = strat_test_set["MEDV"].copy()
X_test = strat_test_set.drop(columns="MEDV")
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)
final_mse = mean_squared_error(y_true=Y_test, y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_predictions, list(Y_test))

[24.741 11.401 25.532 21.893 18.655 15.25  20.402 14.589 31.591 41.818
 19.325 11.842 23.671 28.397 19.626 11.119 30.94  14.589 23.731 18.723
 19.984 17.469 17.312 22.206 18.779 31.563 16.316 32.447  8.77  33.244
 23.357 21.292 23.219 11.067 21.499 10.947 43.094 24.732 23.362 42.931
 24.012 30.907 20.032 20.834 18.614 33.304 44.393 20.139 20.259 21.316
 21.256 14.529 21.572 14.817 25.043 34.    42.448 29.456 19.511 20.906
 46.552  9.369 18.949 25.83  14.721 33.814 19.941 17.804 18.666 34.766
 25.788 23.063 21.742 22.592 34.942 12.317 15.933 20.044 20.994 21.401
 22.295 20.986 14.147 23.246 20.558 21.3   13.796 21.344 21.94  23.346
 18.576 27.63   7.134 26.399 17.843 29.335 20.118 31.194 14.745 26.69
 21.206 20.849] [16.5, 10.2, 30.1, 23.0, 14.4, 15.6, 19.4, 14.1, 30.3, 35.2, 23.1, 13.8, 25.0, 27.9, 19.5, 12.3, 32.2, 13.5, 23.8, 21.7, 19.2, 19.5, 10.4, 23.2, 18.6, 28.5, 15.2, 32.0, 7.2, 34.6, 20.1, 20.6, 23.6, 13.1, 23.8, 12.7, 43.1, 24.7, 22.2, 44.0, 28.1, 31.0, 21.7, 23.4, 19.5, 33.1,

In [296]:
final_rmse

2.888347288471995