# Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics
from xgboost import XGBClassifier
from xgboost import XGBRegressor
import seaborn as sns

In [2]:
#load data

In [3]:
diamonds=sns.load_dataset("diamonds")
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
diamonds.shape

(53940, 10)

In [5]:
diamonds.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [6]:
#basic eda
#describe
#outlier
#convert categorical to numerical features
#try scaling

In [7]:
#selecting 1st 15000 values for further steps
df=diamonds.copy()
#df=diamonds.iloc[0:15000:]
df.shape

(53940, 10)

In [8]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [9]:
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [10]:
#seperate data into features and target

X=df.drop(["cut","color","clarity","price"],axis=1)   # dropping categorical features for  timesake 
y=df["price"]

In [11]:
X.shape

(53940, 6)

In [12]:
y.shape

(53940,)

In [13]:
# split the data in training and testing set

X_train, X_test, y_train,y_test = train_test_split(X,y, test_size = 0.30, random_state = 42)

In [14]:
print("X_train shape : " , X_train.shape)
print("X_test shape : " , X_test.shape)
print("y_train shape : " , y_train.shape)
print("y_test shape : " , y_test.shape)

X_train shape :  (37758, 6)
X_test shape :  (16182, 6)
y_train shape :  (37758,)
y_test shape :  (16182,)


In [15]:
#unique_values = y_train.unique()
#y_train = y_train.map({val: i for i, val in enumerate(unique_values)})

### Another code
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)

In [16]:
#fit model on training data
model=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, monotone_constraints='()',
             n_estimators=1000, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1)
model.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
             grow_policy=None, importance_type='gain',
             interaction_constraints='', learning_rate=0.300000012,
             max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=0, max_depth=6, max_leaves=None, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=1000,
             n_jobs=4, num_parallel_tree=1, predictor=None, random_state=0, ...)

In [17]:
#make predictions for test data

y_pred=model.predict(X_test)
print(y_pred)

[ 150.92831 1396.7736   765.8309  ... 7987.02    2320.237    670.1559 ]


In [18]:
#MSE
print("MSE: ",metrics.mean_squared_error(y_test,y_pred))

MSE:  4378890.669240203


In [19]:
#RMSE
rmse=np.sqrt(metrics.mean_squared_error(y_test,y_pred))
print(rmse)

2092.579907492233


### Apply xgboost on classification problem

In [20]:
df_1=pd.read_csv("diabetes.csv")
df_1.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [21]:
# seperate out features and target value from datsaset

X_1=df_1.drop(["Outcome"],axis=1).values
y_1=df_1["Outcome"].values

In [22]:
#split the data in training and testing set

X_train_1,X_test_1,y_train_1,y_test_1=train_test_split(X_1,y_1,test_size=0.25,random_state=42)

In [23]:
print("X_train shape : ",X_train_1.shape)
print("X_test shape : ",X_test_1.shape)
print("y_train shape : ",y_train_1.shape)
print("y_test shape : ",y_test_1.shape)

X_train shape :  (576, 8)
X_test shape :  (192, 8)
y_train shape :  (576,)
y_test shape :  (192,)


In [24]:
#fit model on training data
xgb_clf=XGBClassifier()

In [25]:

xgb_clf.fit(X_train_1,y_train_1)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [26]:
#make predictions for test data

y_pred_1=xgb_clf.predict(X_test_1)
print(y_pred)

[ 150.92831 1396.7736   765.8309  ... 7987.02    2320.237    670.1559 ]


In [27]:
acc_xgb=metrics.accuracy_score(y_test_1,y_pred_1)
print("Accuracy: ",acc_xgb)

Accuracy:  0.75


# Xgboost on House Prediction

In [28]:
from sklearn.datasets import load_boston
boston_dataset = load_boston()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [29]:
boston = pd.DataFrame(boston_dataset.data,
                      columns = boston_dataset.feature_names)
boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [30]:
#target variable

boston['MEDV']=boston_dataset.target

In [31]:
X1=pd.DataFrame(boston.iloc[:,:-1])
y1=pd.DataFrame(boston.iloc[:,-1])
print("Features set: ",X1.shape)
print("Target: ",y1.shape)

Features set:  (506, 13)
Target:  (506, 1)


In [32]:
#splitting data into training and testing test

X_train1,X_test1,y_train1,y_test1=train_test_split(X1,y1,test_size=0.20,random_state=42)
print(X_train1.shape)
print(X_test1.shape)
print(y_train1.shape)
print(y_test1.shape)


(404, 13)
(102, 13)
(404, 1)
(102, 1)


In [33]:
#fit model on training data

model1=XGBRegressor()
model1.fit(X_train1,y_train1)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

In [34]:
#make predictions for test data

y_pred1=model1.predict(X_test1)
print(y_pred1)

[23.25328   30.024755  15.632249  23.313478  17.775118  21.142563
 20.19583   15.010124  21.23614   22.242369  20.457346  19.209145
  8.551788  21.210636  20.696491  26.74365   18.824339  10.525872
 45.68885   14.116162  26.618996  24.94542   13.3510275 20.87231
 15.400073  15.636547  22.324673  12.777009  20.726126  22.56401
 20.346395  22.303246  18.523277  21.764612  15.568828  15.683646
 33.073547  19.115112  21.955132  22.399914  18.998787  31.328337
 43.464993  18.20766   22.09233   14.353467  14.607512  22.716745
 19.700527  27.072327  22.579268  35.133675  16.241447  25.214682
 46.013332  21.89786   15.043295  32.93268   20.53731   16.568089
 24.07178   34.34796   28.542194  16.977676  25.867334  15.649837
 13.039615  23.00082   27.26897   15.414835  21.546648  31.72919
 10.665012  20.770847  21.848396   6.475782  20.939093  46.59454
 12.456056   8.739085  22.215406  13.390212  20.454681  10.45914
 19.722834  27.327946  16.254663  23.860172  25.414312  17.06042
 22.9362     8.1

In [35]:
#Mean absolute error
print("MAE: ",metrics.mean_absolute_error(y_test1,y_pred1))

#Mean squared error
print("MSE: ",metrics.mean_squared_error(y_test1,y_pred1))

MAE:  1.9574996602301502
MSE:  6.560527271813469
