# Housing Price Calculation

### Import Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
%matplotlib inline
data = pd.read_csv("housing.csv")
# data.describe()

### Imputer

In [2]:
def impute(data):
    
    imputer = SimpleImputer(strategy='median')
    modified_data = imputer.fit_transform(data)
    return pd.DataFrame(modified_data,columns=data.columns)

### Correlations

In [3]:
def correlate(data,lable):
    corr_matrix = data.corr()
    return corr_matrix[lable].sort_values(ascending=False)

### Train-Test Splitting

In [4]:
def train_test_split(data,lable):
    split = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(data,data[lable]):
        train_set = data.loc[train_index]
        test_set = data.loc[test_index]
    return train_set, test_set


### Pipelining

In [5]:
pipeline = Pipeline([
    ("imputer",SimpleImputer(strategy='median')),
    ("std_scalar",StandardScaler())
])

### Cross-Validation

In [6]:
def cross_validate(model,data,scoring="neg_mean_squared_error",cv_folds=10):
    from sklearn.model_selection import cross_val_score
    score = cross_val_score(model,data_np,labels,scoring=scoring, cv=cv_folds)
    score =  np.sqrt(-score)
    print(f"{type(model)}  : {score.mean()+ score.std()}")

### Model Selection


In [7]:
def model_lr(data,labels):
    lr_model = LinearRegression()
    lr_model.fit(data,labels)
    a = cross_validate(lr_model,data)
    return lr_model
    
def model_dtr(data,labels):
    dtr_model = DecisionTreeRegressor()
    dtr_model.fit(data,labels)
    a = cross_validate(dtr_model,data)
    return dtr_model
    
def model_rfr(data,labels):
    rfr_model = RandomForestRegressor()
    rfr_model.fit(data,labels)
    a = cross_validate(rfr_model,data)
    return rfr_model
    
def search_model(data,labels):
    model_lr (data,labels)
    model_dtr(data,labels)
    model_rfr(data,labels)


### Working

In [8]:
data_tr, test_set = train_test_split(data,"ocean_proximity")
labels = data_tr["median_house_value"].copy()

data_tr = data_tr.drop("median_house_value",axis=1)
# data_tr = data_tr.drop(un_wanted_columns,axis=1)

data_np = pipeline.fit_transform(data_tr)

In [10]:
model = model_rfr(data_np,labels)

<class 'sklearn.ensemble._forest.RandomForestRegressor'>  : 64152.07185590742


## Testing

In [11]:
X_test = test_set.drop("median_house_value",axis=1)

processed_X_test = pipeline.fit_transform(X_test)

y_test = test_set["median_house_value"].copy()

# cross_validate(model,test_set)


In [12]:
# predicted_y = model.predict(processed_X_test)
# for i in range(len(predicted_y)):
    
#     print(predicted_y[i-1:i], "  ->",list(y_test[i-1:i]) , f"{predicted_y[i-1:i] - list(y_test[i-1:i])}" )


In [13]:
cross_validate(model,test_set,cv_folds=5)

<class 'sklearn.ensemble._forest.RandomForestRegressor'>  : 63817.24336733636


## Saving Data

In [14]:
from joblib import dump

In [15]:
dump(model,"median_house_value_rfr_predictor_err_63652USD_.joblib")

['median_house_value_rfr_predictor_err_63652USD_.joblib']

## Final Testing

In [16]:
from joblib import load
loaded_mdl = load("median_house_value_rfr_predictor_err_63652USD_.joblib")

In [17]:
somedata = data[400:450]

In [18]:
somedata_X = somedata.drop("median_house_value",axis=1)
somedata_tr_X = pipeline.fit_transform(somedata_X)
somedata_Y = test_set["median_house_value"].copy()

In [19]:
predictions = loaded_mdl.predict(somedata_tr_X)


In [20]:
for i in range(len(predictions)):
    test_data = ""
    test_data += f"\n\n{i}:Predicted: {predictions[i:i+1]},   Original: {list(somedata_Y[i:i+1])}"
    test_data += f"Difference: {predictions[i:i+1] - list(somedata_Y[i:i+1])} "

    with open("test_data.txt","a") as f:
        f.write(test_data)

    