### 4. Modeling

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, r2_score

# Locate the project root two directory levels above the notebook's working
# directory. pathlib is used instead of searching the path for "\\" so the
# cell also runs on POSIX systems, whose paths contain no backslashes.
path = Path("./modeling.ipynb")
notebook_dir = path.parent.absolute()
abs_path = str(notebook_dir)
# Kept as a plain string so code that concatenates onto it keeps working.
parent_path = str(notebook_dir.parents[1])
train_path = str(Path(parent_path, "data", "processed", "train.csv"))
test_path = str(Path(parent_path, "data", "processed", "test.csv"))

# Processed training data (features + SalePrice target) and the hold-out set
# that is scored for the final submission.
data = pd.read_csv(train_path)
final_test = pd.read_csv(test_path)
y = data['SalePrice']
test_id = final_test['Id']

In [None]:
from sklearn.model_selection import train_test_split

# Keep the target out of the feature frame. The models select their input
# columns by position, so leaving SalePrice inside the features risks feeding
# the answer to the model (and shifts column positions relative to data that
# has no SalePrice column).
features = data.drop(columns=['SalePrice'])

# Hold out 5% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size = 0.05, random_state = 42)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer

# Shared pre-processing used by every model below:
#   1. mean-impute NaNs in the first seven columns (selected by position),
#   2. rescale the result to the [0, 1] range.
imputed_columns = list(range(7))
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
tf_imputer = ColumnTransformer([("imputer", mean_imputer, imputed_columns)])

pre_pipeline = Pipeline(steps=[
    ("tf_imputer", tf_imputer),
    ('scaler', MinMaxScaler(feature_range=(0, 1))),
])

The following methods are considered:
1. K-nearest-neighbors, 
2. Decision tree, 
3. Random forest, 
4. 2 SVM methods (using a polynomial kernel and a Gaussian kernel),
5. Two (deep) neural networks, one with sigmoid and one with ReLU activation functions,
6. Xgboost

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.base import clone

## 1. KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn_base = clone(pre_pipeline)
knn_base.steps.append(('knn', KNeighborsClassifier()))
knn_base.fit(X_train, y_train)

y_pred = knn_base.predict(X_test)
r2_base = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print("Performance for KNN baseline model is:")
print("---------------------------------------------")
print('R2 score is {}'.format(r2_base))
print('MAE is {}'.format(mae))

In [None]:
pipeline = clone(pre_pipeline)
pipeline.steps.append(('knn', KNeighborsClassifier()))
param_dist = {"knn__n_neighbors": list(np.linspace(2, 7, 6, dtype = int)),
              "knn__leaf_size": list(np.linspace(5, 150, 5, dtype = int)),
              "knn__weights": ["uniform", "distance"],
              "knn__p": [1, 2, 3]}
random_search = RandomizedSearchCV(estimator = pipeline, param_distributions = param_dist, n_jobs = -1, verbose = 2, cv = 3)
random_search.fit(X_train, y_train)
knn_improve = random_search.best_estimator_

y_pred = knn_improve.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
improve =  (r2 - r2_base) / r2_base
print("Performance for KNN tuned model is:")
print("---------------------------------------------")
print('R2 score is {}'.format(r2))
print('MAE is {}'.format(mae))
print('Improvement of R2 score: {:0.2f}%.'.format( 100 * improve))

In [None]:
# Display the hyper-parameter combination selected by the KNN randomized search.
random_search.best_params_

## 2. Decision tree

In [None]:
from sklearn import tree
dt_base = clone(pre_pipeline)
dt_base.steps.append(('dt', tree.DecisionTreeClassifier(random_state = 42)))
dt_base.fit(X_train, y_train)

y_pred = dt_base.predict(X_test)
r2_base = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print("Performance for Decision Tree baseline model is:")
print("---------------------------------------------")
print('R2 score is {}'.format(r2_base))
print('MAE is {}'.format(mae))

In [None]:
pipeline = clone(pre_pipeline)
pipeline.steps.append(('dt', tree.DecisionTreeClassifier(random_state = 42)))
param_dist = {"dt__criterion": ["gini", "entropy"],
              "dt__splitter": ["best", "random"],
              "dt__max_features": ["auto", "sqrt", "log2", None]}
random_search = RandomizedSearchCV(estimator = pipeline, param_distributions = param_dist, n_jobs = -1, verbose = 2, cv = 3)
random_search.fit(X_train, y_train)
dt_improve = random_search.best_estimator_

y_pred = dt_improve.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
improve =  (r2 - r2_base) / r2_base
print("Performance for Decision Tree tuned model is:")
print("---------------------------------------------")
print('R2 score is {}'.format(r2))
print('MAE is {}'.format(mae))
print('Improvement of R2 score: {:0.2f}%.'.format( 100 * improve))

In [None]:
# Display the hyper-parameter combination selected by the decision-tree randomized search.
random_search.best_params_

## 3. Random forest

In [None]:
from sklearn import ensemble
rf_base = clone(pre_pipeline)
rf_base.steps.append(('rf', ensemble.RandomForestClassifier(random_state = 42)))
rf_base.fit(X_train, y_train)

y_pred = rf_base.predict(X_test)
r2_base = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print("Performance for Random Forest baseline model is:")
print("---------------------------------------------")
print('R2 score is {}'.format(r2_base))
print('MAE is {}'.format(mae))

In [None]:
pipeline = clone(pre_pipeline)
pipeline.steps.append(('rf', ensemble.RandomForestClassifier(random_state = 42)))
param_dist = {"rf__n_estimators": list(np.linspace(10, 2000, 20, dtype = int)),
              "rf__criterion": ["gini", "entropy"],
              "rf__max_depth": [3,4,5,6,7,8,9,10,11,12,13,None],
              "rf__max_features": ["auto", "sqrt", "log2"]}
random_search = RandomizedSearchCV(estimator = pipeline, param_distributions = param_dist, n_jobs = -1, verbose = 2, cv = 3)
random_search.fit(X_train, y_train)
rf_improve = random_search.best_estimator_

y_pred = rf_improve.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
improve =  (r2 - r2_base) / r2_base
print("Performance for Random Forest tuned model is:")
print("---------------------------------------------")
print('R2 score is {}'.format(r2))
print('MAE is {}'.format(mae))
print('Improvement of R2 score: {:0.2f}%.'.format( 100 * improve))

In [None]:
# Display the hyper-parameter combination selected by the random-forest randomized search.
random_search.best_params_

## 4. SVM

In [None]:
from sklearn.svm import SVC
svm_base = clone(pre_pipeline)
svm_base.steps.append(('svm', SVC(random_state = 42)))
svm_base.fit(X_train, y_train)

y_pred = svm_base.predict(X_test)
r2_base = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print("Performance for SVM baseline model is:")
print("---------------------------------------------")
print('R2 score is {}'.format(r2_base))
print('MAE is {}'.format(mae))

In [None]:
pipeline = clone(pre_pipeline)
pipeline.steps.append(('svm', SVC(random_state = 42)))
param_dist = {"svm__kernel": ["poly", "rbf"],
              "svm__degree": [2,3,4,5],
              "svm__C": list(np.linspace(1.0, 20.0, 5, dtype = float)),
              "svm__gamma": ["scale", "auto"] }
random_search = RandomizedSearchCV(estimator = pipeline, param_distributions = param_dist, n_jobs = -1, verbose = 2, cv = 3)
random_search.fit(X_train, y_train)
svm_improve = random_search.best_estimator_

y_pred = svm_improve.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
improve =  (r2 - r2_base) / r2_base
print("Performance for SVM tuned model is:")
print("---------------------------------------------")
print('R2 score is {}'.format(r2))
print('MAE is {}'.format(mae))
print('Improvement of R2 score: {:0.2f}%.'.format( 100 * improve))

In [None]:
# Display the hyper-parameter combination selected by the SVM randomized search.
random_search.best_params_

## 5. Neural network

In [None]:
from sklearn.neural_network import MLPClassifier
nn_base = clone(pre_pipeline)
nn_base.steps.append(('nn', MLPClassifier(random_state = 42)))
nn_base.fit(X_train, y_train)

y_pred = nn_base.predict(X_test)
r2_base = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print("Performance for Neural Network baseline model is:")
print("---------------------------------------------")
print('R2 score is {}'.format(r2_base))
print('MAE is {}'.format(mae))

In [None]:
pipeline = clone(pre_pipeline)
pipeline.steps.append(('nn', MLPClassifier(random_state = 42)))
param_dist = {"nn__hidden_layer_sizes": [(100,),(100,5),(100,2)],
              "nn__activation":["logistic", "relu"],
              "nn__solver": ["lbfgs", "sgd", "adam"],
              "nn__learning_rate": ["constant", "invscaling", "adaptive"],
              "nn__max_iter": list(np.linspace(100, 1500, 10, dtype = int)),
              "nn__early_stopping": [True, False]}
random_search = RandomizedSearchCV(estimator = pipeline, param_distributions = param_dist, n_jobs = -1, verbose = 2, cv = 5)
random_search.fit(X_train, y_train)
nn_improve = random_search.best_estimator_

y_pred = nn_improve.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
improve =  (r2 - r2_base) / r2_base
print("Performance for SVM tuned model is:")
print("---------------------------------------------")
print('R2 score is {}'.format(r2))
print('MAE is {}'.format(mae))
print('Improvement of R2 score: {:0.2f}%.'.format( 100 * improve))

In [None]:
# Display the hyper-parameter combination selected by the neural-network randomized search.
random_search.best_params_

## 6. Xgboost

XGBoost is an open-source software library which provides a regularizing gradient boosting framework for C++, Java, Python, R, Julia, Perl, and Scala.

According to answers from the question: [Is it necessary to scale the target value in addition to scaling features for regression analysis?](https://stats.stackexchange.com/questions/111467/is-it-necessary-to-scale-the-target-value-in-addition-to-scaling-features-for-re), I may try normalization.

In [None]:
import xgboost as XGB

# Baseline XGBoost: shared pre-processing followed by a gradient-boosted
# regressor left at its default hyper-parameters (seed fixed for repeatability).
xgb_regressor = XGB.XGBRegressor(random_state=42)
xgb_base = clone(pre_pipeline)
xgb_base.steps.append(('xgb', xgb_regressor))
xgb_base.fit(X_train, y_train)

# Score the baseline on the held-out split.
y_pred = xgb_base.predict(X_test)
r2_base = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("Performance for Xgboost baseline model is:")
print("---------------------------------------------")
print('R2 score is {}'.format(r2_base))
print('MAE is {}'.format(mae))

In [None]:
# Tuned XGBoost: randomized search over the number of boosting rounds, tree
# depth, learning rate and the minimum split loss (gamma).
pipeline = clone(pre_pipeline)
pipeline.steps.append(('xgb', XGB.XGBRegressor(random_state = 42)))
# The learning-rate grid starts at 0.01 rather than 0.0: a rate of zero means
# the boosting rounds never update the prediction, wasting a search candidate.
param_dist = {"xgb__n_estimators": list(np.linspace(10, 3000, 20, dtype = int)),
              "xgb__max_depth":list(np.linspace(5, 30, 25, dtype = int)),
              "xgb__learning_rate": list(np.linspace(0.01, 1.0, 40, dtype = float)),
              "xgb__gamma": list(np.linspace(0.0, 10.0, 40, dtype = float))}
# scoring matches the metric reported below; random_state makes the sampled
# parameter combinations repeatable between runs.
random_search = RandomizedSearchCV(estimator = pipeline, param_distributions = param_dist, scoring = "r2",
                                   n_jobs = -1, verbose = 2, cv = 3, random_state = 42)
random_search.fit(X_train, y_train)
xgb_improve = random_search.best_estimator_

y_pred = xgb_improve.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
# Relative change versus the baseline; abs() keeps the sign meaningful when
# the baseline R2 is negative.
improve =  (r2 - r2_base) / abs(r2_base)
print("Performance for Xgboost tuned model is:")
print("---------------------------------------------")
print('R2 score is {}'.format(r2))
print('MAE is {}'.format(mae))
print('Improvement of R2 score: {:0.2f}%.'.format( 100 * improve))

In [None]:
# Display the hyper-parameter combination selected by the XGBoost randomized search.
random_search.best_params_

# Evaluation

In [None]:
# One row per model: (name, randomized-search time in seconds, baseline R2, tuned R2).
# The figures are hard-coded here rather than computed by this cell
# (presumably copied from an earlier run - re-check after re-running the searches).
model_results = [
    ("KNN", 0.418, 0.6966016612071357, 0.8092504113885084),
    ("Decision Tree", 0.52, 0.6856229361149115, 0.6789868125230183),
    ("Random Forest", 185, 0.766357912256961, 0.8158580683249272),
    ("SVM", 20.2, 0.5395299138520958, 0.5105028066065995),
    ("Neural Network", 525, 0.7474809530041818, 0.861390335441664),
    ("Xgboost", 123, 0.8821370550045167, 0.8885990109061078),
]
compare_table = {
    model: [search_seconds, r2_baseline, r2_tuned]
    for model, search_seconds, r2_baseline, r2_tuned in model_results
}

In [None]:
# Build the comparison frame with one column per model and one row per metric,
# then display it transposed (one row per model).
metric_labels = [
    "Time cost for Randomized Search (seconds)",
    "R2 score for baseline model",
    "R2 score for tuned model",
]
df_cmp = pd.DataFrame(compare_table, index=metric_labels, dtype=float)
df_cmp.T

In [None]:
# Bar chart of the randomized-search time per model; the column label doubles as the title.
time_label = "Time cost for Randomized Search (seconds)"
df_cmp.T[time_label].plot.bar(figsize=(10,8), rot=0, title=time_label)

In [None]:
# Grouped bar chart comparing baseline and tuned R2 for every model.
r2_labels = ["R2 score for baseline model", "R2 score for tuned model"]
df_cmp.T[r2_labels].plot.bar(figsize=(12,12), rot=0, title="R2 score for models with/without tuning")

<font size=4>
        Combining both figures above, there are two choices after balancing time cost and prediction accuracy:<br>
            &nbsp;&nbsp;&nbsp;&nbsp;1. KNN: more time saving <br>
            &nbsp;&nbsp;&nbsp;&nbsp;2. Xgboost: more accurate prediction <br>
</font>

In [None]:
# Score the hold-out set with the tuned XGBoost pipeline and write the
# submission file (one row per Id with its predicted SalePrice).
y_predict = xgb_improve.predict(final_test)
output = pd.DataFrame({'Id': test_id, 'SalePrice': y_predict})

# Build the path with pathlib and create the output folder if it is missing,
# so the write does not fail on a fresh checkout.
output_path = Path(parent_path) / "data" / "output" / "submission.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
output.to_csv(output_path, index=False)