# Housing Price Calculation

### Import Data

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
%matplotlib inline
data = pd.read_csv("data/housing.csv")
data.describe()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
count,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,65.134302,206855.816909
std,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,18.795022,115395.615874
min,1.0,2.0,1.0,3.0,1.0,0.4999,30.0,14999.0
25%,18.0,1447.75,296.0,787.0,280.0,2.5634,40.0,119600.0
50%,29.0,2127.0,435.0,1166.0,409.0,3.5348,70.0,179700.0
75%,37.0,3148.0,647.0,1725.0,605.0,4.74325,70.0,264725.0
max,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,90.0,500001.0


### Imputer

In [2]:
def impute(data):
    
    imputer = SimpleImputer(strategy='median')
    modified_data = imputer.fit_transform(data)
    return pd.DataFrame(modified_data,columns=data.columns)

### Correlations

In [3]:
def correlate(data,lable):
    corr_matrix = data.corr()
    return corr_matrix[lable].sort_values(ascending=False)

### Train-Test Splitting

In [4]:
def train_test_split(data,lable):
    split = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(data,data[lable]):
        train_set = data.loc[train_index]
        test_set = data.loc[test_index]
    return train_set, test_set


### Pipelining

In [5]:
pipeline = Pipeline([
    ("imputer",SimpleImputer(strategy='median')),
    ("std_scalar",StandardScaler())
])

### Cross-Validation

In [7]:
def cross_validate(model,data,scoring="neg_mean_squared_error",cv_folds=10):
    from sklearn.model_selection import cross_val_score
    score = cross_val_score(model,data_np,labels,scoring=scoring, cv=cv_folds)
    score =  np.sqrt(-score)
    print(f"{type(model)}  : Mean: {score.mean()}, STD: { score.std()}")

### Model Selection


In [8]:
def model_lr(data,labels):
    lr_model = LinearRegression()
    lr_model.fit(data,labels)
    a = cross_validate(lr_model,data)
    return lr_model
    
def model_dtr(data,labels):
    dtr_model = DecisionTreeRegressor()
    dtr_model.fit(data,labels)
    a = cross_validate(dtr_model,data)
    return dtr_model
    
def model_rfr(data,labels):
    rfr_model = RandomForestRegressor()
    rfr_model.fit(data,labels)
    a = cross_validate(rfr_model,data)
    return rfr_model
    
def search_model(data,labels):
    model_lr (data,labels)
    model_dtr(data,labels)
    model_rfr(data,labels)


### Working

In [10]:
data_tr, test_set = train_test_split(data,"ocean_proximity")
labels = data_tr["median_house_value"].copy()

data_tr = data_tr.drop("median_house_value",axis=1)
# data_tr = data_tr.drop(un_wanted_columns,axis=1)

data_np = pipeline.fit_transform(data_tr)

In [11]:
# Search for the best model
search_model(data_np,labels)

# Train the best model
model = model_rfr(data_np,labels)

<class 'sklearn.linear_model._base.LinearRegression'>  : Mean: 70501.7915738301, STD: 2223.466320902556
<class 'sklearn.tree._classes.DecisionTreeRegressor'>  : Mean: 84250.34865245849, STD: 1771.8570166399138
<class 'sklearn.ensemble._forest.RandomForestRegressor'>  : Mean: 61581.85714844307, STD: 1394.5165473192421
<class 'sklearn.ensemble._forest.RandomForestRegressor'>  : Mean: 61614.71063338956, STD: 1536.9591250187436


## Testing

In [13]:
X_test = test_set.drop("median_house_value",axis=1)

processed_X_test = pipeline.fit_transform(X_test)

y_test = test_set["median_house_value"].copy()

cross_validate(model,test_set,cv_folds=3)


<class 'sklearn.ensemble._forest.RandomForestRegressor'>  : Mean: 62070.384364284204, STD: 692.5778426679464


## Saving Data

In [14]:
from joblib import dump

In [15]:
dump(model,"data/median_house_value_rfr_predictor.joblib")

['median_house_value_rfr_predictor.joblib']

## Final Testing

In [16]:
from joblib import load
loaded_mdl = load("median_house_value_rfr_predictor.joblib")

In [17]:
somedata = data[400:450]

In [18]:
somedata_X = somedata.drop("median_house_value",axis=1)
somedata_tr_X = pipeline.fit_transform(somedata_X)
somedata_Y = test_set["median_house_value"].copy()

In [19]:
predictions = loaded_mdl.predict(somedata_tr_X)


In [21]:
test_data = ""
for i in range(len(predictions)):
    test_data += f"[{int(somedata_Y.iloc[i] - predictions[i]) }\t]\t\t Actual: {somedata_Y.iloc[i]} \t\tPredicted: {predictions[i]}\n"
    test_data += "\n\n"

with open("data/test_data.txt","w") as f:
    f.write(test_data)
    