## Dragon Real Estate - Price Predictor


In [1]:
import pandas as pd

In [2]:
housing = pd.read_csv("data.csv")

In [3]:
housing.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [4]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       501 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [5]:
housing['CHAS'].value_counts()

0    471
1     35
Name: CHAS, dtype: int64

In [6]:
housing.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,501.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284341,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.705587,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.884,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.208,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.625,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [7]:
%matplotlib inline

In [8]:
# For plotting histogram
#import matplotlib.pyplot as plt
#housing.hist(bins = 50, figsize=(20, 15))

## Train-Test Splitting

In [9]:
# for learning purpose
import numpy as np
def split_train_test(data, test_ratio):
    np.random.seed(42)
    shuffled = np.random.permutation(len(data))
    print(shuffled)
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [10]:
#train_set, test_set = split_train_test(housing, 0.2)

In [11]:
#print(f"Rows in train_set: {len(train_set)}\nRows in test_set: {len(test_set)}\n")

In [12]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size = 0.2, random_state = 42)
print(f"Rows in train_set: {len(train_set)}\nRows in test_set: {len(test_set)}\n")

Rows in train_set: 404
Rows in test_set: 102



In [13]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
for train_index, test_index in split.split(housing, housing['CHAS']):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
     

In [14]:
strat_test_set['CHAS'].value_counts()

0    95
1     7
Name: CHAS, dtype: int64

In [15]:
strat_train_set['CHAS'].value_counts()

0    376
1     28
Name: CHAS, dtype: int64

In [16]:
# 95/7

In [17]:
# 376/28

In [18]:
housing = strat_train_set.copy()

## Looking for correlations

In [19]:
corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False)

MEDV       1.000000
RM         0.680857
B          0.361761
ZN         0.339741
DIS        0.240451
CHAS       0.205066
AGE       -0.364596
RAD       -0.374693
CRIM      -0.393715
NOX       -0.422873
TAX       -0.456657
INDUS     -0.473516
PTRATIO   -0.493534
LSTAT     -0.740494
Name: MEDV, dtype: float64

In [20]:
#from pandas.plotting import scatter_matrix
#attributes = ["MEDV", "RM", "ZN", "LSTAT"]
#scatter_matrix(housing[attributes], figsize = (12,8))

In [21]:
#housing.plot(kind="scatter", x="RM", y="MEDV", alpha=0.8)

## Trying out Attribute Combination

In [22]:
housing["TAXRM"] = housing['TAX']/housing['RM']

In [23]:
housing.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,TAXRM
254,0.04819,80.0,3.64,0,0.392,6.108,32.0,9.2203,1,315,16.4,392.89,6.57,21.9,51.571709
348,0.01501,80.0,2.01,0,0.435,6.635,29.7,8.344,4,280,17.0,390.94,5.99,24.5,42.200452
476,4.87141,0.0,18.1,0,0.614,6.484,93.6,2.3053,24,666,20.2,396.21,18.68,16.7,102.714374
321,0.18159,0.0,7.38,0,0.493,6.376,54.3,4.5404,5,287,19.6,396.9,6.87,23.1,45.012547
326,0.30347,0.0,7.38,0,0.493,6.312,28.9,5.4159,5,287,19.6,396.9,6.15,23.0,45.468948


In [24]:
corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False)

MEDV       1.000000
RM         0.680857
B          0.361761
ZN         0.339741
DIS        0.240451
CHAS       0.205066
AGE       -0.364596
RAD       -0.374693
CRIM      -0.393715
NOX       -0.422873
TAX       -0.456657
INDUS     -0.473516
PTRATIO   -0.493534
TAXRM     -0.528626
LSTAT     -0.740494
Name: MEDV, dtype: float64

In [25]:
#housing.plot(kind="scatter", x="TAXRM", y="MEDV", alpha=0.8)

In [26]:
housing = strat_train_set.drop("MEDV", axis=1)
housing_labels = strat_train_set["MEDV"].copy()

## Missing Attributes

In [27]:
a = housing.dropna(subset=["RM"]) # option 1
a.shape
# note that the original housing dataframe will remain unchanged

(399, 13)

In [28]:
housing.drop("RM", axis = 1).shape # option 2
# note that there is no RM column and also note that the original housing dataframe will remain unchanged

(404, 12)

In [29]:
median= housing["RM"].median() #compute median for option 3

In [30]:
housing["RM"].fillna(median) # option 3
# note that the original housing dataframe will remain unchanged

254    6.108
348    6.635
476    6.484
321    6.376
326    6.312
       ...  
155    6.152
423    6.103
98     7.820
455    6.525
216    5.888
Name: RM, Length: 404, dtype: float64

In [31]:
housing.shape

(404, 13)

In [32]:
housing.describe() #before we started filling missing attributes

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,404.0,404.0,404.0,404.0,404.0,399.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0
mean,3.602814,10.836634,11.34495,0.069307,0.558064,6.279481,69.039851,3.74621,9.735149,412.341584,18.473267,353.392822,12.791609
std,8.099383,22.150636,6.877817,0.25429,0.116875,0.716784,28.258248,2.099057,8.731259,168.672623,2.129243,96.069235,7.23574
min,0.00632,0.0,0.74,0.0,0.389,3.561,2.9,1.1296,1.0,187.0,13.0,0.32,1.73
25%,0.086962,0.0,5.19,0.0,0.453,5.8765,44.85,2.035975,4.0,284.0,17.4,374.6175,6.8475
50%,0.286735,0.0,9.9,0.0,0.538,6.209,78.2,3.1222,5.0,337.0,19.0,390.955,11.57
75%,3.731923,12.5,18.1,0.0,0.631,6.6305,94.1,5.1004,24.0,666.0,20.2,395.63,17.1025
max,73.5341,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,36.98


In [33]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
imputer.fit(housing)

SimpleImputer(strategy='median')

In [34]:
imputer.statistics_

array([2.86735e-01, 0.00000e+00, 9.90000e+00, 0.00000e+00, 5.38000e-01,
       6.20900e+00, 7.82000e+01, 3.12220e+00, 5.00000e+00, 3.37000e+02,
       1.90000e+01, 3.90955e+02, 1.15700e+01])

In [35]:
x=imputer.transform(housing)

In [36]:
housing_tr = pd.DataFrame(x, columns=housing.columns)

In [37]:
housing_tr.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0
mean,3.602814,10.836634,11.34495,0.069307,0.558064,6.278609,69.039851,3.74621,9.735149,412.341584,18.473267,353.392822,12.791609
std,8.099383,22.150636,6.877817,0.25429,0.116875,0.712366,28.258248,2.099057,8.731259,168.672623,2.129243,96.069235,7.23574
min,0.00632,0.0,0.74,0.0,0.389,3.561,2.9,1.1296,1.0,187.0,13.0,0.32,1.73
25%,0.086962,0.0,5.19,0.0,0.453,5.87875,44.85,2.035975,4.0,284.0,17.4,374.6175,6.8475
50%,0.286735,0.0,9.9,0.0,0.538,6.209,78.2,3.1222,5.0,337.0,19.0,390.955,11.57
75%,3.731923,12.5,18.1,0.0,0.631,6.63,94.1,5.1004,24.0,666.0,20.2,395.63,17.1025
max,73.5341,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,36.98


## Scikit - Learn Design

Primarily, three types of objects
1. Estimators
2. Transformers
3. Predictors

## Feature scaling

Primarily, two types of feature scalinf methods: 
1. Min-max scaling (Normalization)
   (value - min)/(max - min)
   Sklearn provides a class called MinMaxScaler for this
   
2. standardization
   (value - mean)/std
   Sklearn provides a class called Standard Scaler for this 

## Creating a Pipeline

In [38]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy= "median")),
    # add as many as you want in your pipeline 
    ('std_scaler',StandardScaler()),
])

In [39]:
housing_num_tr = my_pipeline.fit_transform(housing)

In [40]:
housing_num_tr.shape

(404, 13)

## Selecting a Desired model for Dragon Real Estates

In [41]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# model = LinearRegression()
# model = DecisionTreeRegressor()
model = RandomForestRegressor()
model.fit(housing_num_tr, housing_labels)

RandomForestRegressor()

In [42]:
 some_data = housing.iloc[:5]

In [43]:
some_labels = housing_labels.iloc[:5]

In [44]:
prepared_data = my_pipeline.transform(some_data)

In [45]:
model.predict(prepared_data)

array([22.323, 25.254, 16.557, 23.477, 23.402])

In [46]:
list(some_labels)

[21.9, 24.5, 16.7, 23.1, 23.0]

## Evaluating the model

In [47]:
from sklearn.metrics import mean_squared_error
housing_predictions = model.predict(housing_num_tr)
mse = mean_squared_error(housing_labels, housing_predictions)
rmse = np.sqrt(mse)


In [48]:
rmse

1.2741318622792719

## Using better evaluation technique - Cross Validation

In [49]:
# 1 2 3 4 5 6 7 8 9 10
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, housing_num_tr, housing_labels, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)               

In [50]:
rmse_scores

array([2.74877991, 2.62984112, 4.40733361, 2.581859  , 3.35740749,
       2.55887308, 4.9251642 , 3.29337053, 3.27786671, 3.5032319 ])

In [51]:
def print_scores(scores):
    print("Scores:", scores) 
    print("Mean:", scores.mean())
    print("Standard deviation: ", scores.std())

In [52]:
print_scores(rmse_scores)

Scores: [2.74877991 2.62984112 4.40733361 2.581859   3.35740749 2.55887308
 4.9251642  3.29337053 3.27786671 3.5032319 ]
Mean: 3.3283727546928645
Standard deviation:  0.7564642119280899


## Saving the model

In [53]:
from joblib import dump, load
dump(model, 'Dragon.joblib')

['Dragon.joblib']

## Test  the model on test data

In [60]:
x_test = strat_test_set.drop("MEDV", axis = 1)
y_test = strat_test_set["MEDV"].copy()
x_test_prepared = my_pipeline.transform(x_test)
final_predictions = model.predict(x_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_predictions, list(y_test))

[24.735 11.426 25.332 22.25  17.918 15.168 20.258 14.815 31.402 42.154
 19.645 11.936 23.996 29.21  19.43  10.781 31.125 14.999 23.636 18.552
 19.746 17.664 21.32  22.03  18.511 31.272 16.028 32.889  9.087 32.925
 24.254 21.719 22.867 10.368 20.942 11.528 43.895 24.738 23.312 42.41
 24.118 30.166 20.956 20.794 18.725 33.522 43.905 20.262 20.601 21.647
 22.161 14.675 21.702 15.256 25.421 32.929 42.38  29.245 19.766 21.008
 46.966 10.385 19.182 25.905 14.641 33.675 20.568 18.129 19.119 34.381
 25.87  22.796 22.14  22.427 34.138 13.347 15.895 20.136 20.831 21.541
 22.453 21.134 14.231 22.781 20.309 21.391 13.837 21.342 21.654 23.631
 18.808 27.307  7.249 26.391 19.273 29.352 19.949 31.562 14.206 26.218
 21.403 20.28 ] [16.5, 10.2, 30.1, 23.0, 14.4, 15.6, 19.4, 14.1, 30.3, 35.2, 23.1, 13.8, 25.0, 27.9, 19.5, 12.3, 32.2, 13.5, 23.8, 21.7, 19.2, 19.5, 10.4, 23.2, 18.6, 28.5, 15.2, 32.0, 7.2, 34.6, 20.1, 20.6, 23.6, 13.1, 23.8, 12.7, 43.1, 24.7, 22.2, 44.0, 28.1, 31.0, 21.7, 23.4, 19.5, 33.1,

In [57]:
final_rmse

3.0308544239563333

In [61]:
prepared_data[0]

array([-0.43942006,  3.12628155, -1.12165014, -0.27288841, -1.42262747,
       -0.23979304, -1.31238772,  2.61111401, -1.0016859 , -0.5778192 ,
       -0.97491834,  0.41164221, -0.86091034])

## Using the model

In [62]:
from joblib import dump,load
import numpy as np
model = load('Dragon.joblib')
features = np.array([[-0.43942006,  3.12628155, -1.12165014, -0.27288841, -1.42262747,
       -0.23979304, -1.31238772,  2.61111401, -1.0016859 , -0.5778192 ,
       -0.97491834,  0.41164221, -0.86091034]])
model.predict(features)

array([22.323])