# CatBoost

## Importing the libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#  Make sure all print() lines are printed, not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Make sure matplotlib charts and graphs are displayed in the cell outputs
%matplotlib inline

## Importing the dataset

Normally would clean data, but from CatBoost tutorial:

> Pay attention that our features are of differnt types - some of them are numeric, some are categorical, and some are even just strings, which normally should be handled in some specific way (for example encoded with bag-of-words representation). But in our case we could treat these string features just as categorical one - all the heavy lifting is done inside CatBoost. How cool is that? :)

Also, they deal with missing data by:
>As we cat see, Age, Cabin and Embarked indeed have some missing values, so let's fill them with some number way out of their distributions - so the model would be able to easily distinguish between them and take it into account:



In [2]:
train_df = pd.read_csv("hp_train.csv")  #   NB:  Dependent variable in LAST COLUMN
test_df = pd.read_csv("hp_test.csv")
train_df.head()
test_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [3]:
# See how much NaN

null_value_stats = train_df.isnull().sum(axis=0)
null_value_stats[null_value_stats != 0]

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [4]:
# Let's fill them with some number way out of their distributions - so the model would be able to easily distinguish between them and take it into account
# Do for both train and test datasets

train_df.fillna(-999, inplace=True)
test_df.fillna(-999, inplace=True)

## Splitting the dataset into the Training set and Test set

In [5]:
# Separate featrues and label values

# NOTE: dependent variable IN LAST COLUMN

X = train_df.iloc[:, :-1]  #   Note: this differs from Udemy course where .value is added
y = train_df.iloc[:, -1]  #   Note: this differs from Udemy course where .value is added

# NB: convert y into type float
y = y.astype(float)


In [10]:
type(X)

pandas.core.frame.DataFrame

In [9]:
# See data types - categorical, strings, numbers

print(X.dtypes)

categorical_features_indices = np.where(X.dtypes != np.float)[0]  # Try float, int as well as object
print(categorical_features_indices)

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 80, dtype: object
[ 0  1  2  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
 50 51 52 53 54 55 56 57 58 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
 75 76 77 78 79]


In [11]:
from sklearn.model_selection import train_test_split

#  Note difference below from Udemy templates: using X_validation instead of X_test
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, train_size=0.75, random_state=42
)



In [None]:
#y_train

## Training CatBoost on the Training set
Create the model itself

In [12]:
# Make necessary imports
from catboost import CatBoostRegressor, Pool, cv
from sklearn.metrics import accuracy_score

# # To show plots
# from ipywidgets import interact  
# import ipywidgets as widgets  

In [13]:
#  Identify columns with categorical features
# categorical_features_indices = np.where(X.dtypes != np.float)[
#      0
# ]  

# specify the training parameters
model = CatBoostRegressor(iterations=50, loss_function='RMSE')

# train the model
model.fit(
    X_train,
    y_train
#     cat_features=categorical_features_indices,
#    eval_set=(X_validation, y_validation),
#    verbose=False,
#    plot=True
)

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=2]="RL": Cannot convert 'b'RL'' to float

# Model cross-validation

In [None]:
print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(cv_data['test-Accuracy-mean']),
    cv_data['test-Accuracy-std'][np.argmax(cv_data['test-Accuracy-mean'])],
    np.argmax(cv_data['test-Accuracy-mean'])
))

In [None]:
print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))

In [None]:
cv_params = model.get_params()

cv_data = cv(
    Pool(X, y, cat_features=categorical_features_indices),
    cv_params,
    plot=True
)

In [None]:
# make the prediction using the resulting model
y_pred = model.predict(y_validation)
print(y_pred)

# R^2 score

In [None]:
from sklearn.metrics import r2_score

r2_score(y_validation, y_pred)

# Make competition submission

In [None]:
# Import data
data = pd.read_csv("hp_test.csv")
# data.info()

In [None]:
# Keep only the features as decided through the EDA
keep_numerical = [
    "GrLivArea",
    "GarageArea",
    "TotalBsmtSF",
    "1stFlrSF",
    "TotRmsAbvGrd",
    "YearBuilt",
    "YearRemodAdd",
]
keep_categorical = [
    "OverallQual",
    "Neighborhood",
    "GarageCars",
    "ExterQual",
    "BsmtQual",
    "KitchenQual",
    "FullBath",
    "GarageFinish",
    "FireplaceQu",
    "Foundation",
    "GarageType",
]
data_subset = data[keep_categorical + keep_numerical]
# data_subset.info()

In [None]:
# qgrid_widget = qgrid.show_grid(pd.DataFrame(data_subset),
#                                show_toolbar=True,
#                               grid_options={'forceFitColumns': False}  #   Many columns in dataframe become unusable for filter without this
#                               )
# qgrid_widget

In [None]:
# Deal with missing data
data_subset.drop(columns="FireplaceQu", inplace=True)

In [None]:
# Encode categorical features
data_subset_enc = pd.get_dummies(data_subset)

In [None]:
# qgrid_widget = qgrid.show_grid(pd.DataFrame(data_subset_enc),
#                                show_toolbar=True,
#                               grid_options={'forceFitColumns': False}  #   Many columns in dataframe become unusable for filter without this
#                               )
# qgrid_widget

In [None]:
# data_subset_enc.info()

In [None]:
#  Now prepare for competition submission
#####  Confirm why first column is dropped ##########
X_competition = data_subset_enc.iloc[:, :].values

In [None]:
np.shape(X_competition)

In [None]:
# Now only have numeric data, so impute missing values

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputer.fit(X_competition)
X_competition = imputer.transform(X_competition)
# np.shape(X_competition)

In [None]:
y_competition = sc_y.inverse_transform(
    classifier.predict(sc_X.transform(X_competition))
)
# np.shape(y_competition)

In [None]:
# dataset.iloc[0:10, 0]

In [None]:
# Concatenate prediction and passenger ID columns
# submission = pd.DataFrame(pd.concat([dataset.iloc[:, 0],  y_competition.iloc[:, :]], axis=1))
submission = pd.DataFrame(y_competition)

In [None]:
submission.columns = ["SalePrice"]
IdColumn = pd.read_csv("hp_test.csv")
# submission = pd.concat(IdColumn["Id"], submission["SalePrice"])
# submission
submission = pd.concat([IdColumn["Id"], submission["SalePrice"]], axis=1)
# print(submission)

In [None]:
# Write to file
submission.to_csv("hp_catboost.csv", index=False)

## Applying k-Fold Cross Validation

In [None]:
# from sklearn.model_selection import cross_val_score

# accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=5)
# print("Accuracy: {:.2f} %".format(accuracies.mean() * 100))
# print("Standard Deviation: {:.2f} %".format(accuracies.std() * 100))