In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder , FunctionTransformer
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score

In [2]:
# Key file; its first CSV column becomes the index.
# (presumably a column-description lookup — confirm against data/key.csv)
df_desc = pd.read_csv("data/key.csv", index_col=0)

# Cleaned dataset. "Unnamed: 0" is removed right after loading
# (presumably a leftover index written out by an earlier to_csv — confirm).
df = pd.read_csv("data/data_clean.csv").drop(columns="Unnamed: 0")

In [3]:
# Inspect the column dtypes as loaded, before any casting is applied.
df.dtypes

funder                     object
gps_height                  int64
installer                  object
wpt_name                   object
basin                      object
region_code                 int64
district_code               int64
population                  int64
public_meeting            float64
permit                    float64
extraction_type_class      object
management                 object
payment_type               object
quality_group              object
quantity                   object
source                     object
source_class               object
waterpoint_type            object
status_group               object
construction_year_bins     object
dtype: object

In [4]:
# Preview the loaded frame (the display elides the middle rows).
df

Unnamed: 0,funder,gps_height,installer,wpt_name,basin,region_code,district_code,population,public_meeting,permit,extraction_type_class,management,payment_type,quality_group,quantity,source,source_class,waterpoint_type,status_group,construction_year_bins
0,other,1390,other,none,Lake Nyasa,11,5,109,1.0,0.0,gravity,vwc,annually,good,enough,spring,groundwater,communal standpipe,functional,"(1990.0, 2000.0]"
1,other,1399,other,Zahanati,Lake Victoria,20,2,280,,1.0,gravity,wug,never pay,good,insufficient,rainwater harvesting,surface,communal standpipe,functional,"(2000.0, 2010.0]"
2,other,686,other,other,Pangani,21,4,250,1.0,1.0,gravity,vwc,per bucket,good,enough,dam,surface,communal standpipe multiple,functional,"(2000.0, 2010.0]"
3,other,263,other,other,Ruvuma / Southern Coast,90,63,58,1.0,1.0,submersible,vwc,never pay,good,dry,machine dbh,groundwater,communal standpipe multiple,non functional,"(1980.0, 1990.0]"
4,other,0,other,Shuleni,Lake Victoria,18,1,0,1.0,1.0,gravity,other,never pay,good,seasonal,rainwater harvesting,surface,communal standpipe,functional,"(1980.0, 1990.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,other,1210,other,other,Pangani,3,5,125,1.0,1.0,gravity,water board,per bucket,good,enough,spring,groundwater,communal standpipe,functional,"(1990.0, 2000.0]"
59396,other,1212,other,other,Rufiji,11,4,56,1.0,1.0,gravity,vwc,annually,good,enough,river,surface,communal standpipe,functional,"(1990.0, 2000.0]"
59397,other,0,other,other,Rufiji,12,7,0,1.0,0.0,handpump,vwc,monthly,fluoride,enough,machine dbh,groundwater,hand pump,functional,"(1980.0, 1990.0]"
59398,other,0,other,other,Rufiji,1,4,0,1.0,1.0,handpump,vwc,never pay,good,insufficient,shallow well,groundwater,hand pump,functional,"(1980.0, 1990.0]"


In [5]:
def int_to_object(df):
    """Cast the code, flag and year-bin columns of ``df`` to non-numeric types.

    ``region_code`` and ``district_code`` are cast to ``object``;
    ``public_meeting``, ``permit`` and ``construction_year_bins`` are cast
    with ``astype(str)``. The columns are reassigned on ``df`` itself, so the
    caller's frame is modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all five columns listed above.

    Returns
    -------
    pandas.Series
        ``df.dtypes`` after the casts (the dtype listing, not the frame).
    """
    # Target type per column, applied in this order.
    casts = {
        "region_code": object,
        "district_code": object,
        "public_meeting": str,
        "permit": str,
        "construction_year_bins": str,
    }
    # NOTE(review): astype(str) may turn missing values into literal strings,
    # which a downstream imputer would no longer treat as missing — confirm.
    for column, target in casts.items():
        df[column] = df[column].astype(target)
    return df.dtypes

In [6]:
# Display the frame again; int_to_object has been defined but not yet called,
# so the data is still as loaded.
df

Unnamed: 0,funder,gps_height,installer,wpt_name,basin,region_code,district_code,population,public_meeting,permit,extraction_type_class,management,payment_type,quality_group,quantity,source,source_class,waterpoint_type,status_group,construction_year_bins
0,other,1390,other,none,Lake Nyasa,11,5,109,1.0,0.0,gravity,vwc,annually,good,enough,spring,groundwater,communal standpipe,functional,"(1990.0, 2000.0]"
1,other,1399,other,Zahanati,Lake Victoria,20,2,280,,1.0,gravity,wug,never pay,good,insufficient,rainwater harvesting,surface,communal standpipe,functional,"(2000.0, 2010.0]"
2,other,686,other,other,Pangani,21,4,250,1.0,1.0,gravity,vwc,per bucket,good,enough,dam,surface,communal standpipe multiple,functional,"(2000.0, 2010.0]"
3,other,263,other,other,Ruvuma / Southern Coast,90,63,58,1.0,1.0,submersible,vwc,never pay,good,dry,machine dbh,groundwater,communal standpipe multiple,non functional,"(1980.0, 1990.0]"
4,other,0,other,Shuleni,Lake Victoria,18,1,0,1.0,1.0,gravity,other,never pay,good,seasonal,rainwater harvesting,surface,communal standpipe,functional,"(1980.0, 1990.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,other,1210,other,other,Pangani,3,5,125,1.0,1.0,gravity,water board,per bucket,good,enough,spring,groundwater,communal standpipe,functional,"(1990.0, 2000.0]"
59396,other,1212,other,other,Rufiji,11,4,56,1.0,1.0,gravity,vwc,annually,good,enough,river,surface,communal standpipe,functional,"(1990.0, 2000.0]"
59397,other,0,other,other,Rufiji,12,7,0,1.0,0.0,handpump,vwc,monthly,fluoride,enough,machine dbh,groundwater,hand pump,functional,"(1980.0, 1990.0]"
59398,other,0,other,other,Rufiji,1,4,0,1.0,1.0,handpump,vwc,never pay,good,insufficient,shallow well,groundwater,hand pump,functional,"(1980.0, 1990.0]"


In [7]:
# Cast the code/flag columns on df itself. The value displayed below is the
# dtype listing that int_to_object returns, not the frame.
int_to_object(df)

funder                    object
gps_height                 int64
installer                 object
wpt_name                  object
basin                     object
region_code               object
district_code             object
population                 int64
public_meeting            object
permit                    object
extraction_type_class     object
management                object
payment_type              object
quality_group             object
quantity                  object
source                    object
source_class              object
waterpoint_type           object
status_group              object
construction_year_bins    object
dtype: object

In [8]:
# Separate the target column from the predictors.
y = df["status_group"]
X = df.drop(columns=["status_group"])

In [9]:
# Hold out a test split. The fixed random_state makes the shuffle repeatable;
# no test_size or stratify is passed, so scikit-learn's defaults apply.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=465615615)

In [10]:
from sklearn import preprocessing

In [11]:
def grab_numeric(df):
    """Return the subset of ``df`` made up of its ``int64`` columns.

    Columns of any other dtype (floats, objects, other integer widths) are
    left out. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.

    Returns
    -------
    pandas.DataFrame
        The ``int64`` columns of ``df``, in their original order.
    """
    int64_columns = df.select_dtypes(include="int64")
    return int64_columns

In [12]:
# Wrap grab_numeric so it can be used as a step inside a scikit-learn Pipeline.
GrabNumeric = FunctionTransformer(grab_numeric)

In [13]:
# Show the transformer's repr to confirm which function it wraps.
GrabNumeric

FunctionTransformer(func=<function grab_numeric at 0x000001D98DDF9280>)

In [14]:
# Numeric-only preprocessing: keep the int64 columns, then standardize them.
pipe = Pipeline(
    steps=[
        ("num", GrabNumeric),
        ("ss", StandardScaler()),
    ]
)

In [15]:
# Fit the numeric pipeline on the training predictors only.
pipe.fit(X_train)


Pipeline(steps=[('num',
                 FunctionTransformer(func=<function grab_numeric at 0x000001D98DDF9280>)),
                ('ss', StandardScaler())])

In [16]:
# Standardized values of the int64 columns. After the earlier casts only
# gps_height and population are still int64, hence the two output columns.
pipe.transform(X_train)

array([[-0.84040878,  0.25062257],
       [-0.96708423, -0.38007373],
       [ 0.55590018,  0.29266899],
       ...,
       [-0.96708423, -0.38007373],
       [ 0.77614273, -0.37797141],
       [-0.96708423, -0.38007373]])

In [17]:
# List every predictor with its positional index; the ColumnTransformer cells
# further down select columns by these integer positions.
for position, column_name in enumerate(X_train.columns):
    print("Index Number", position, column_name)

Index Number 0 funder
Index Number 1 gps_height
Index Number 2 installer
Index Number 3 wpt_name
Index Number 4 basin
Index Number 5 region_code
Index Number 6 district_code
Index Number 7 population
Index Number 8 public_meeting
Index Number 9 permit
Index Number 10 extraction_type_class
Index Number 11 management
Index Number 12 payment_type
Index Number 13 quality_group
Index Number 14 quantity
Index Number 15 source
Index Number 16 source_class
Index Number 17 waterpoint_type
Index Number 18 construction_year_bins


In [18]:
# We'll throw these mini-pipelines into our ColumnTransformer.

# Numeric branch: fill gaps with the column median, then standardize.
subpipe_num = Pipeline(
    steps=[
        ("num_impute", SimpleImputer(strategy="median")),
        ("ss", StandardScaler()),
    ]
)

# Categorical branch: fill gaps with a constant placeholder, then one-hot
# encode into a dense array. Categories not seen during fit are ignored at
# transform time instead of raising.
subpipe_cat = Pipeline(
    steps=[
        ("cat_impute", SimpleImputer(strategy="constant")),
        ("ohe", OneHotEncoder(sparse=False, handle_unknown="ignore")),
    ]
)

In [19]:
# Route columns by position (see the index listing printed earlier): 1 and 7
# are gps_height and population; the listed categorical positions are imputed
# and one-hot encoded.
# NOTE(review): position 10 (extraction_type_class) appears in neither list,
# and no `remainder` argument is given, so that column is left out of the
# model input — confirm this is intended.
CT = ColumnTransformer(transformers=[
    ('subpipe_num', subpipe_num, [1, 7]),
    ('subpipe_cat', subpipe_cat, [0, 2, 3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18]),
])

# Search space for the gradient-boosting model: 5 * 8 * 6 * 10 = 2,400
# parameter combinations, each refit for every cross-validation fold, with up
# to 1,000 trees per fit.
# NOTE(review): expect a very long run; consider a smaller grid or a
# randomized search.
param_grid = {'learning_rate': [0.01, 0.001, 0.0001, 0.00001, 0.000001],
              'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
              'min_samples_leaf': [5, 6, 7, 8, 9, 10],
              'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]}

# NOTE(review): the search is the last step of the pipeline, so CT is fit once
# on all of X_train and every CV fold sees features imputed/scaled with
# statistics computed from the full training set.
gbc_cv_pipe = Pipeline(steps=[
    ("ct", CT),
    ("gbc", GridSearchCV(estimator=GradientBoostingClassifier(),
                         param_grid=param_grid,
                         n_jobs=-1)),
])

gbc_cv_pipe.fit(X_train, y_train)

# Fixed-parameter gradient-boosting pipeline (no search), seeded for
# repeatability.
gbc_model_pipe2 = Pipeline([('ct', CT), ('gbc', GradientBoostingClassifier(learning_rate=.01, n_estimators=1000, random_state=42))])

# Fit the pipeline defined on the line above; no other gradient-boosting
# pipeline name exists in this notebook.
gbc_model_pipe2.fit(X_train, y_train)

In [20]:
from catboost import CatBoostClassifier, Pool

In [21]:
# Shared column preprocessing followed by a CatBoost classifier with its
# default settings.
cat_boost_pipe = Pipeline([('ct', CT), ('cat', CatBoostClassifier())])

In [22]:
# Fit preprocessing and classifier on the training split. CatBoost prints one
# log line per boosting iteration, which is the long output below.
cat_boost_pipe.fit(X_train, y_train)

Learning rate set to 0.096032
0:	learn: 1.0316760	total: 185ms	remaining: 3m 4s
1:	learn: 0.9792842	total: 193ms	remaining: 1m 36s
2:	learn: 0.9379838	total: 200ms	remaining: 1m 6s
3:	learn: 0.9031415	total: 208ms	remaining: 51.7s
4:	learn: 0.8737280	total: 215ms	remaining: 42.9s
5:	learn: 0.8500930	total: 223ms	remaining: 36.9s
6:	learn: 0.8284971	total: 230ms	remaining: 32.7s
7:	learn: 0.8111149	total: 238ms	remaining: 29.5s
8:	learn: 0.7954593	total: 245ms	remaining: 27s
9:	learn: 0.7825180	total: 252ms	remaining: 25s
10:	learn: 0.7715021	total: 260ms	remaining: 23.3s
11:	learn: 0.7610333	total: 268ms	remaining: 22.1s
12:	learn: 0.7523946	total: 276ms	remaining: 21s
13:	learn: 0.7444487	total: 284ms	remaining: 20s
14:	learn: 0.7363351	total: 292ms	remaining: 19.2s
15:	learn: 0.7298039	total: 299ms	remaining: 18.4s
16:	learn: 0.7239695	total: 307ms	remaining: 17.7s
17:	learn: 0.7192652	total: 314ms	remaining: 17.1s
18:	learn: 0.7128902	total: 323ms	remaining: 16.7s
19:	learn: 0.70808

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  [1, 7]),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_impute',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                               

In [23]:
# Score on the same data the pipeline was fit on, so this is an optimistic
# figure; compare with the cross-validated scores.
cat_boost_pipe.score(X_train, y_train)

0.8154882154882155

In [24]:
# Cross-validate the whole pipeline on the training split. No cv argument is
# given, so the default number of folds is used (five scores are returned
# below). Train scores are requested to show the train/validation gap.
cross_validate(cat_boost_pipe, X_train, y_train, return_train_score=True)

Learning rate set to 0.094945
0:	learn: 1.0321716	total: 7.6ms	remaining: 7.59s
1:	learn: 0.9813536	total: 14.2ms	remaining: 7.08s
2:	learn: 0.9394569	total: 21.6ms	remaining: 7.16s
3:	learn: 0.9042563	total: 28ms	remaining: 6.98s
4:	learn: 0.8754536	total: 35ms	remaining: 6.96s
5:	learn: 0.8517185	total: 41.7ms	remaining: 6.92s
6:	learn: 0.8300504	total: 48.4ms	remaining: 6.87s
7:	learn: 0.8134660	total: 55ms	remaining: 6.82s
8:	learn: 0.7973235	total: 61.7ms	remaining: 6.79s
9:	learn: 0.7834031	total: 68.3ms	remaining: 6.76s
10:	learn: 0.7721469	total: 74.9ms	remaining: 6.73s
11:	learn: 0.7613899	total: 81.8ms	remaining: 6.74s
12:	learn: 0.7519817	total: 88.7ms	remaining: 6.74s
13:	learn: 0.7429338	total: 95.4ms	remaining: 6.72s
14:	learn: 0.7354655	total: 102ms	remaining: 6.71s
15:	learn: 0.7290025	total: 109ms	remaining: 6.68s
16:	learn: 0.7227463	total: 115ms	remaining: 6.67s
17:	learn: 0.7173137	total: 122ms	remaining: 6.66s
18:	learn: 0.7118657	total: 129ms	remaining: 6.67s
19:	

{'fit_time': array([8.27291012, 8.59683466, 8.27746892, 8.25586677, 8.5577805 ]),
 'score_time': array([0.33707619, 0.33307528, 0.339077  , 0.33507633, 0.34107733]),
 'test_score': array([0.78103255, 0.78653199, 0.78720539, 0.78035915, 0.77979798]),
 'train_score': array([0.81933221, 0.81762065, 0.81675084, 0.81871493, 0.82084736])}

In [25]:
# NOTE(review): verbatim repeat of the sub-pipeline definitions from the
# earlier cell; it rebinds both names to fresh, unfitted pipelines.
# Numeric branch: median imputation, then standardization.
subpipe_num = Pipeline(steps=[('num_impute', SimpleImputer(strategy="median")),
                           ('ss', StandardScaler())])
# Categorical branch: constant-value imputation, then dense one-hot encoding
# that ignores categories unseen during fit.
subpipe_cat = Pipeline(steps=[('cat_impute', SimpleImputer(strategy='constant')),
                             ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

In [26]:
# Rebuild the column router from the re-created sub-pipelines: positions 1 and
# 7 (gps_height, population) go to the numeric branch, the listed positions to
# the categorical branch.
# NOTE(review): position 10 (extraction_type_class) is in neither list and no
# `remainder` argument is given, so that column is not passed on — confirm
# this is intended.
CT = ColumnTransformer(transformers=[('subpipe_num', subpipe_num, [1,7]),
                                         ('subpipe_cat', subpipe_cat, [0,2,3,4,5,6,8,9,11,12,13,14,15,16,17,18])])

In [27]:
# Baseline model: rebuild the default-settings CatBoost pipeline on the
# re-created CT and fit it on the training split.
cat_boost_pipe = Pipeline([('ct', CT), ('cat', CatBoostClassifier())])
cat_boost_pipe.fit(X_train, y_train)

Learning rate set to 0.096032
0:	learn: 1.0316760	total: 18.7ms	remaining: 18.7s
1:	learn: 0.9792842	total: 26.2ms	remaining: 13.1s
2:	learn: 0.9379838	total: 49.2ms	remaining: 16.3s
3:	learn: 0.9031415	total: 56.4ms	remaining: 14s
4:	learn: 0.8737280	total: 64.2ms	remaining: 12.8s
5:	learn: 0.8500930	total: 71.8ms	remaining: 11.9s
6:	learn: 0.8284971	total: 79.7ms	remaining: 11.3s
7:	learn: 0.8111149	total: 86.7ms	remaining: 10.8s
8:	learn: 0.7954593	total: 94.5ms	remaining: 10.4s
9:	learn: 0.7825180	total: 102ms	remaining: 10.1s
10:	learn: 0.7715021	total: 109ms	remaining: 9.79s
11:	learn: 0.7610333	total: 117ms	remaining: 9.61s
12:	learn: 0.7523946	total: 124ms	remaining: 9.44s
13:	learn: 0.7444487	total: 132ms	remaining: 9.31s
14:	learn: 0.7363351	total: 140ms	remaining: 9.18s
15:	learn: 0.7298039	total: 147ms	remaining: 9.07s
16:	learn: 0.7239695	total: 156ms	remaining: 8.99s
17:	learn: 0.7192652	total: 163ms	remaining: 8.9s
18:	learn: 0.7128902	total: 171ms	remaining: 8.84s
19:	l

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  [1, 7]),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_impute',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                               

In [28]:
# Second variant: same preprocessing, CatBoost run for 2000 boosting
# iterations instead of its default count.
cat_boost_pipe1 = Pipeline([('ct', CT), ('cat', CatBoostClassifier(iterations=2000))])
cat_boost_pipe1.fit(X_train, y_train)

Learning rate set to 0.053462
0:	learn: 1.0605013	total: 39.8ms	remaining: 1m 19s
1:	learn: 1.0273360	total: 47.3ms	remaining: 47.2s
2:	learn: 0.9985485	total: 54.8ms	remaining: 36.4s
3:	learn: 0.9727186	total: 62.1ms	remaining: 31s
4:	learn: 0.9496684	total: 69.8ms	remaining: 27.9s
5:	learn: 0.9294207	total: 77.1ms	remaining: 25.6s
6:	learn: 0.9105796	total: 85.2ms	remaining: 24.3s
7:	learn: 0.8943755	total: 92.6ms	remaining: 23.1s
8:	learn: 0.8782036	total: 101ms	remaining: 22.4s
9:	learn: 0.8642817	total: 109ms	remaining: 21.7s
10:	learn: 0.8515407	total: 116ms	remaining: 21.1s
11:	learn: 0.8398244	total: 124ms	remaining: 20.6s
12:	learn: 0.8290215	total: 133ms	remaining: 20.3s
13:	learn: 0.8185782	total: 141ms	remaining: 20.1s
14:	learn: 0.8090256	total: 149ms	remaining: 19.8s
15:	learn: 0.8003005	total: 157ms	remaining: 19.5s
16:	learn: 0.7923261	total: 164ms	remaining: 19.2s
17:	learn: 0.7847083	total: 172ms	remaining: 18.9s
18:	learn: 0.7772340	total: 180ms	remaining: 18.8s
19:	

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  [1, 7]),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_impute',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                               

In [29]:
# Training-split score of the 2000-iteration pipeline (fit and scored on the
# same data).
cat_boost_pipe1.score(X_train, y_train)

0.8188776655443322

In [30]:
# Training-split score of the default-settings pipeline, for comparison with
# the 2000-iteration variant.
cat_boost_pipe.score(X_train, y_train)

0.8154882154882155

In [31]:
# Test-split predictions from the 2000-iteration pipeline. As the output
# shows, the result is a column-shaped array of label strings.
y_hat1 = cat_boost_pipe1.predict(X_test)
y_hat1

array([['functional'],
       ['functional'],
       ['functional'],
       ...,
       ['functional'],
       ['functional'],
       ['functional']], dtype=object)

In [32]:
# Test-split predictions from the default-settings pipeline.
y_hat = cat_boost_pipe.predict(X_test)
y_hat

array([['functional'],
       ['functional'],
       ['functional'],
       ...,
       ['functional'],
       ['functional'],
       ['functional']], dtype=object)

In [33]:
# Test-split metrics for the 2000-iteration pipeline (y_hat1), rounded to two
# decimals. Recall, precision and F1 use average='macro', i.e. every class
# counts equally regardless of how many samples it has.
print(f"""
Our second model's accuracy on the test set is {round(accuracy_score(y_test, y_hat1), 2)}. \n
Our second model's recall on the test set is {round(recall_score(y_test, y_hat1,average= 'macro'), 2)} \n
Our second model's precision on the test set is {round(precision_score(y_test, y_hat1,average= 'macro'), 2)} \n
Our second model's f1-score on the test is {round(f1_score(y_test, y_hat1,average= 'macro'), 2)}.
""")


Our second model's accuracy on the test set is 0.79. 

Our second model's recall on the test set is 0.62 

Our second model's precision on the test set is 0.73 

Our second model's f1-score on the test is 0.65.



In [34]:
# Test-split metrics for the default-settings pipeline (y_hat), rounded to two
# decimals. Recall, precision and F1 use average='macro', i.e. every class
# counts equally regardless of how many samples it has.
print(f"""
Our first model's accuracy on the test set is {round(accuracy_score(y_test, y_hat), 2)}. \n
Our first model's recall on the test set is {round(recall_score(y_test, y_hat,average= 'macro'), 2)} \n
Our first model's precision on the test set is {round(precision_score(y_test, y_hat,average= 'macro'), 2)} \n
Our first model's f1-score on the test is {round(f1_score(y_test, y_hat,average= 'macro'), 2)}.
""")


Our first model's accuracy on the test set is 0.79. 

Our first model's recall on the test set is 0.62 

Our first model's precision on the test set is 0.73 

Our first model's f1-score on the test is 0.65.



In [35]:
# NOTE(review): repeats the training-split score already computed for the
# 2000-iteration pipeline in an earlier cell.
cat_boost_pipe1.score(X_train, y_train)

0.8188776655443322

In [36]:
# NOTE(review): rebuilds and refits the same 2000-iteration pipeline as the
# earlier cell; the logged losses below match that earlier run.
cat_boost_pipe1 = Pipeline([('ct', CT), ('cat', CatBoostClassifier(iterations=2000))])
cat_boost_pipe1.fit(X_train, y_train)

Learning rate set to 0.053462
0:	learn: 1.0605013	total: 9.47ms	remaining: 18.9s
1:	learn: 1.0273360	total: 17.3ms	remaining: 17.3s
2:	learn: 0.9985485	total: 25.1ms	remaining: 16.7s
3:	learn: 0.9727186	total: 32.7ms	remaining: 16.3s
4:	learn: 0.9496684	total: 40.7ms	remaining: 16.2s
5:	learn: 0.9294207	total: 48.4ms	remaining: 16.1s
6:	learn: 0.9105796	total: 56ms	remaining: 16s
7:	learn: 0.8943755	total: 63.7ms	remaining: 15.9s
8:	learn: 0.8782036	total: 71.5ms	remaining: 15.8s
9:	learn: 0.8642817	total: 78.9ms	remaining: 15.7s
10:	learn: 0.8515407	total: 86.2ms	remaining: 15.6s
11:	learn: 0.8398244	total: 93.6ms	remaining: 15.5s
12:	learn: 0.8290215	total: 101ms	remaining: 15.5s
13:	learn: 0.8185782	total: 109ms	remaining: 15.5s
14:	learn: 0.8090256	total: 117ms	remaining: 15.5s
15:	learn: 0.8003005	total: 125ms	remaining: 15.5s
16:	learn: 0.7923261	total: 133ms	remaining: 15.5s
17:	learn: 0.7847083	total: 141ms	remaining: 15.5s
18:	learn: 0.7772340	total: 149ms	remaining: 15.5s
19:

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  [1, 7]),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_impute',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                               

In [37]:
# Training-split score of the refitted 2000-iteration pipeline.
cat_boost_pipe1.score(X_train, y_train)

0.8188776655443322

In [39]:
# Test-split predictions from the refitted 2000-iteration pipeline.
y_hat2 = cat_boost_pipe1.predict(X_test)

array([['functional'],
       ['functional'],
       ['functional'],
       ...,
       ['functional'],
       ['functional'],
       ['functional']], dtype=object)

In [40]:
# Same test-split metrics for the 2000-iteration pipeline, this time from
# y_hat2 and printed without rounding. Recall, precision and F1 are
# macro-averaged.
print(f"""
Our second model's accuracy on the test set is {accuracy_score(y_test, y_hat2)}. \n
Our second model's recall on the test set is {recall_score(y_test, y_hat2,average= 'macro')} \n
Our second model's precision on the test set is {precision_score(y_test, y_hat2,average= 'macro')} \n
Our second model's f1-score on the test is {f1_score(y_test, y_hat2,average= 'macro')}.
""")


Our second model's accuracy on the test set is 0.7893602693602694. 

Our second model's recall on the test set is 0.6220300943894689 

Our second model's precision on the test set is 0.7315158924592868 

Our second model's f1-score on the test is 0.6487107469250327.



In [41]:
# Third variant: up to 10000 boosting iterations with CatBoost's "Iter"
# overfitting-detector settings. This rebinds cat_boost_pipe1, replacing the
# 2000-iteration pipeline.
# NOTE(review): fit() below receives only the training data, with no
# validation set. Confirm that od_type/od_wait have any effect in that case;
# the detector normally monitors a validation metric.
cat_boost_pipe1 = Pipeline([('ct', CT), ('cat', CatBoostClassifier(iterations=10000, od_type = "Iter", od_wait=1000, ))])
cat_boost_pipe1.fit(X_train, y_train)

Learning rate set to 0.013722
0:	learn: 1.0886302	total: 112ms	remaining: 18m 38s
1:	learn: 1.0790523	total: 120ms	remaining: 10m 1s
2:	learn: 1.0699029	total: 129ms	remaining: 7m 10s
3:	learn: 1.0610915	total: 137ms	remaining: 5m 41s
4:	learn: 1.0525934	total: 144ms	remaining: 4m 47s
5:	learn: 1.0445068	total: 153ms	remaining: 4m 14s
6:	learn: 1.0363221	total: 161ms	remaining: 3m 49s
7:	learn: 1.0286993	total: 170ms	remaining: 3m 32s
8:	learn: 1.0208572	total: 178ms	remaining: 3m 17s
9:	learn: 1.0135048	total: 185ms	remaining: 3m 4s
10:	learn: 1.0065414	total: 193ms	remaining: 2m 54s
11:	learn: 0.9998025	total: 201ms	remaining: 2m 47s
12:	learn: 0.9931150	total: 210ms	remaining: 2m 41s
13:	learn: 0.9863122	total: 219ms	remaining: 2m 35s
14:	learn: 0.9797986	total: 227ms	remaining: 2m 31s
15:	learn: 0.9734089	total: 235ms	remaining: 2m 26s
16:	learn: 0.9672126	total: 243ms	remaining: 2m 22s
17:	learn: 0.9613133	total: 250ms	remaining: 2m 18s
18:	learn: 0.9554481	total: 258ms	remaining:

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  [1, 7]),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_impute',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                               

In [42]:
# Training-split score of the 10000-iteration pipeline (fit and scored on the
# same data).
cat_boost_pipe1.score(X_train, y_train)

0.8274074074074074

In [45]:
# Cross-validate the 10000-iteration pipeline on the training split, keeping
# train scores to show the train/validation gap. Per the fit_time output
# below, each fold takes over a minute.
cross_validate(cat_boost_pipe1, X_train, y_train, return_train_score=True)

Learning rate set to 0.013567
0:	learn: 1.0887098	total: 17.9ms	remaining: 2m 59s
1:	learn: 1.0794054	total: 24.4ms	remaining: 2m 1s
2:	learn: 1.0703208	total: 31ms	remaining: 1m 43s
3:	learn: 1.0616144	total: 38.3ms	remaining: 1m 35s
4:	learn: 1.0532132	total: 45ms	remaining: 1m 30s
5:	learn: 1.0452044	total: 52.6ms	remaining: 1m 27s
6:	learn: 1.0370614	total: 59.3ms	remaining: 1m 24s
7:	learn: 1.0296181	total: 66.1ms	remaining: 1m 22s
8:	learn: 1.0218239	total: 73ms	remaining: 1m 21s
9:	learn: 1.0146120	total: 79.6ms	remaining: 1m 19s
10:	learn: 1.0076755	total: 86.1ms	remaining: 1m 18s
11:	learn: 1.0007029	total: 93.2ms	remaining: 1m 17s
12:	learn: 0.9940871	total: 100ms	remaining: 1m 17s
13:	learn: 0.9873604	total: 108ms	remaining: 1m 17s
14:	learn: 0.9810441	total: 115ms	remaining: 1m 16s
15:	learn: 0.9747137	total: 122ms	remaining: 1m 16s
16:	learn: 0.9685540	total: 129ms	remaining: 1m 15s
17:	learn: 0.9626708	total: 136ms	remaining: 1m 15s
18:	learn: 0.9568152	total: 142ms	remai

{'fit_time': array([73.99395227, 76.06545806, 78.06176448, 77.34013438, 80.64526391]),
 'score_time': array([0.37808537, 0.38019848, 0.3790853 , 0.38208675, 0.37508416]),
 'test_score': array([0.78529742, 0.79023569, 0.79147026, 0.78439955, 0.78237935]),
 'train_score': array([0.83075196, 0.83159371, 0.83089226, 0.83131313, 0.83299663])}

In [46]:
# Test-split predictions from the 10000-iteration pipeline. This rebinds
# y_hat2, which previously held the 2000-iteration predictions.
y_hat2 = cat_boost_pipe1.predict(X_test)


In [47]:
# Test-split metrics for the 10000-iteration pipeline (y_hat2), rounded to two
# decimals. Labelled "third model" to keep it distinct from the
# default-settings ("first") and 2000-iteration ("second") reports above.
# Recall, precision and F1 use average='macro', i.e. every class counts
# equally regardless of how many samples it has.
print(f"""
Our third model's accuracy on the test set is {round(accuracy_score(y_test, y_hat2), 2)}. \n
Our third model's recall on the test set is {round(recall_score(y_test, y_hat2,average= 'macro'), 2)} \n
Our third model's precision on the test set is {round(precision_score(y_test, y_hat2,average= 'macro'), 2)} \n
Our third model's f1-score on the test is {round(f1_score(y_test, y_hat2,average= 'macro'), 2)}.
""")


Our first model's accuracy on the test set is 0.79. 

Our first model's recall on the test set is 0.63 

Our first model's precision on the test set is 0.73 

Our first model's f1-score on the test is 0.65.

