# XGBoost

In [24]:
import xgboost as xgb
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [26]:
data = pd.read_csv("../../../Data/Dataset_zero_numeric_missing.csv",sep=";")

### Get feature categories

In [27]:
def categorisation(data,target="price"):
    distance = []
    numerique = []
    categorique = []
    for col in data.columns:
        if data[col].dtypes == np.object:
            try:
                if str(type(eval(data[col][data[col].first_valid_index()]))) == "<class 'list'>":
                    distance.append(col)
                else:
                    categorique.append(col)
            except:
                categorique.append(col)
                
            
        else :
            if col != target:
                numerique.append(col)
                
    return dict({"numerique":numerique,"categorique":categorique,"distance":distance})

feat_cat = categorisation(data)

feat_cat

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if data[col].dtypes == np.object:


{'numerique': ['bathrooms', 'bedrooms', 'sampling'],
 'categorique': ['name', 'province', 'city', 'address', 'type_of_property'],
 'distance': ['education', 'transport_and_public_services']}

### Dropping name, address and city because there is high cardinality (cf EDA)

In [28]:
data= data.drop(['name','address','city'],axis=1)
data.columns

Index(['price', 'province', 'type_of_property', 'bathrooms', 'bedrooms',
       'education', 'transport_and_public_services', 'sampling'],
      dtype='object')

### Transform distance features to lenght of list

In [29]:
class transfo_distance:
    def __init__(self,data,features=None):
        self.data = data
        self.method = None
        ### Get features
        self.distance=[]
        if features is None:
            for col in data.columns:
                if data[col].dtypes == np.object:
                    try:
                        if str(type(eval(data[col][data[col].first_valid_index()]))) == "<class 'list'>":
                            self.distance.append(col)
                    except:
                        pass
        else:
            distance=features
        
        ###Transformations : "mean", "median", "max", "min", "len"
    def transformation(self,method):
        self.method = method
        for feat in self.distance:
            serie = []
            for i in data[feat]:
                if str(i) != "nan":
                    serie.append(self.checker(i))
                else :
                    serie.append(np.nan)
            self.data[feat] = serie
            
    def checker(self,value):
        if self.method=="mean":
            return (np.array(eval(value))).mean()
        elif self.method =="median":
            return (np.array(eval(value))).mediane()
        elif self.method =="min":
            return (np.array(eval(value))).min()
        elif self.method =="max":
            return (np.array(eval(value))).max()
        elif self.method =="len":
            return float(len(eval(value)))
        else:
            raise('choose correct method : "mean", "median", "max", "min", "len"')
    
    def get_data(self,method="mean"):
        self.transformation(method)
        return self.data

In [30]:
trans_dist = transfo_distance(data)
data = trans_dist.get_data(method="len")
data.shape

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if data[col].dtypes == np.object:


(9445, 8)

In [31]:
data= data[~(data['type_of_property'] == 'Vacant Land / Plot')]
data.shape

(9444, 8)

## Encoding cadegorical data

### Using LabelEncoder

In [7]:
def label_encoding(data,labels):
    for label in labels:
        label_encoder= LabelEncoder()
        data[label]=label_encoder.fit_transform(data[label])
    return data

### Using dummies

In [9]:
def using_dummies(data):
    return pd.get_dummies(data)

### Using custom label encoding

In [12]:
def label_custom_encoding(data,labels):
    for label in labels:
        values= data[label].unique()
        for value in values:
            code= len(data[data[label]==value])
            data.loc[data[label]==value,label]=int(code)
        data[label]=pd.to_numeric(data[label])
    return data

### target mean ordering encoding of categorical value

In [13]:
def encode(frame, feature):
    ordering = pd.DataFrame()
    ordering['val'] = frame[feature].unique()
    ordering.index = ordering.val
    ordering['spmean'] = frame[[feature, 'price']].groupby(feature).mean()['price']
    #print(frame[[feature, 'a']].groupby(feature).mean()['a'])
    ordering = ordering.sort_values('spmean')
    ordering['ordering'] = range(1, ordering.shape[0]+1)
    #print(ordering)
    ordering = ordering['ordering'].to_dict()
    #print(ordering)
    
    for cat, o in ordering.items():
        frame.loc[frame[feature] == cat, feature+'_E'] = o

In [32]:
# data= using_dummies(data)

qual_encoded = []
feat_cat = categorisation(data)
for q in feat_cat["categorique"]:  
    encode(data, q)
    qual_encoded.append(q+'_E')
print(qual_encoded)


# data= label_custom_encoding(data,feat_cat['categorique'])

# data= label_encoding(data,feat_cat['categorique'])


['province_E', 'type_of_property_E']


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if data[col].dtypes == np.object:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


### Sampling

In [33]:
def split_data(data,target,col="sampling",features_to_keep=None):
    if features_to_keep is not None :
        features_to_keep = features_to_keep + [target]
        data = data[features_to_keep]
        
    unique = data["sampling"].unique()
    for i, v in enumerate(unique):
        if v == 1 :
            data_train = data[data[col]==v]
            y_train = data_train[target]
            X_train = data_train.drop([target,"sampling"],axis=1)
            print("train shape :",X_train.shape)
        elif v == 2 :
            data_val = data[data[col]==v]
            y_val = data_val[target]
            X_val = data_val.drop([target,"sampling"],axis=1)
            print("val shape :",X_val.shape)
        elif v == 3 :
            data_test = data[data[col]==v]
            y_test = data_test[target]
            X_test = data_test.drop([target,"sampling"],axis=1)
            print("test shape :",X_test.shape)
    
            
    return X_train, np.log(y_train), X_val, np.log(y_val), X_test, np.log(y_test)

In [35]:
feats_to_keep =[ 'bathrooms', 'bedrooms', 'education',
       'transport_and_public_services', 'province_E','sampling','type_of_property_E']
X_train, y_train, X_val, y_val, X_test, y_test = split_data(data,"price",features_to_keep=feats_to_keep)
# X_train, y_train, X_val, y_val, X_test, y_test = split_data(data,"price")
data.shape

train shape : (6571, 6)
val shape : (1465, 6)
test shape : (1408, 6)


(9444, 10)

In [36]:
reg = xgb.XGBRegressor(n_jobs=2)
reg.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=2, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [37]:
y_prediction =  reg.predict(X_val)
y_prediction



array([12.967912, 14.465218, 14.601284, ..., 14.248821, 15.149316,
       14.229378], dtype=float32)

In [38]:
rmse = np.sqrt(mean_squared_error(y_val, reg.predict(X_val)))
print("The Root mean squared error (RMSE) on val set: {:.4f}".format(rmse))
rmse = np.sqrt(mean_squared_error(y_train, reg.predict(X_train)))
print("The Root mean squared error (RMSE) on train set: {:.4f}".format(rmse))

The Root mean squared error (RMSE) on val set: 0.4346
The Root mean squared error (RMSE) on train set: 0.3330




In [39]:
print("Model : RF")

RF = RandomForestRegressor()
RF.fit(X_train, y_train)

rmse = np.sqrt(mean_squared_error(y_val, RF.predict(X_val)))
print("The Root mean squared error (RMSE) on val set: {:.4f}".format(rmse))
rmse = np.sqrt(mean_squared_error(y_train, RF.predict(X_train)))
print("The Root mean squared error (RMSE) on train set: {:.4f}".format(rmse))

print("Model : RidgeCV")

regCV = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13))
regCV.fit(X_train, y_train)

rmse = np.sqrt(mean_squared_error(y_val, regCV.predict(X_val)))
print("The Root mean squared error (RMSE) on val set: {:.4f}".format(rmse))
rmse = np.sqrt(mean_squared_error(y_train, regCV.predict(X_train)))
print("The Root mean squared error (RMSE) on train set: {:.4f}".format(rmse))

print("Model : SVR")

regr = SVR(C=1.0,epsilon=0.2)
regr.fit(X_train, y_train)

rmse = np.sqrt(mean_squared_error(y_val, regr.predict(X_val)))
print("The Root mean squared error (RMSE) on val set: {:.4f}".format(rmse))
rmse = np.sqrt(mean_squared_error(y_train, regr.predict(X_train)))
print("The Root mean squared error (RMSE) on train set: {:.4f}".format(rmse))

Model : RF
The Root mean squared error (RMSE) on val set: 0.4480
The Root mean squared error (RMSE) on train set: 0.2525
Model : RidgeCV
The Root mean squared error (RMSE) on val set: 0.5054
The Root mean squared error (RMSE) on train set: 0.5283
Model : SVR
The Root mean squared error (RMSE) on val set: 0.4576
The Root mean squared error (RMSE) on train set: 0.4744
