In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

In [4]:
# Load Data Frame
df = pd.read_csv("../data/cancer_reg.csv")
# make a copy of our data frame for later use
df_copy = df.copy()
# view dimensions of dataset
df.shape

(3047, 34)

In [5]:
# preview the dataset
df.head()

Unnamed: 0,index,avganncount,avgdeathsperyear,target_deathrate,incidencerate,medincome,popest2015,povertypercent,studypercap,binnedinc,...,pctprivatecoveragealone,pctempprivcoverage,pctpubliccoverage,pctpubliccoveragealone,pctwhite,pctblack,pctasian,pctotherrace,pctmarriedhouseholds,birthrate
0,0,1397.0,469,164.9,489.8,61898,260131,11.2,499.748204,"(61494.5, 125635]",...,,41.6,32.9,14.0,81.780529,2.594728,4.821857,1.843479,52.856076,6.118831
1,1,173.0,70,161.3,411.6,48127,43269,18.6,23.111234,"(48021.6, 51046.4]",...,53.8,43.6,31.1,15.3,89.228509,0.969102,2.246233,3.741352,45.3725,4.333096
2,2,102.0,50,174.7,349.7,49348,21026,14.6,47.560164,"(48021.6, 51046.4]",...,43.5,34.9,42.1,21.1,90.92219,0.739673,0.465898,2.747358,54.444868,3.729488
3,3,427.0,202,194.8,430.4,44243,75882,17.1,342.637253,"(42724.4, 45201]",...,40.3,35.0,45.3,25.0,91.744686,0.782626,1.161359,1.362643,51.021514,4.603841
4,4,57.0,26,144.4,350.1,49955,10321,12.5,0.0,"(48021.6, 51046.4]",...,43.9,35.1,44.0,22.7,94.104024,0.270192,0.66583,0.492135,54.02746,6.796657


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3047 entries, 0 to 3046
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   index                    3047 non-null   int64  
 1   avganncount              3047 non-null   float64
 2   avgdeathsperyear         3047 non-null   int64  
 3   target_deathrate         3047 non-null   float64
 4   incidencerate            3047 non-null   float64
 5   medincome                3047 non-null   int64  
 6   popest2015               3047 non-null   int64  
 7   povertypercent           3047 non-null   float64
 8   studypercap              3047 non-null   float64
 9   binnedinc                3047 non-null   object 
 10  medianage                3047 non-null   float64
 11  medianagemale            3047 non-null   float64
 12  medianagefemale          3047 non-null   float64
 13  geography                3047 non-null   object 
 14  percentmarried          

In [7]:
mis_val_count = df.isnull().sum()
mis_val_len  = len(mis_val_count[mis_val_count> 0])
print("Total number of missing entries: ", mis_val_count.sum())
print("Total number of columns that have missing values: ", mis_val_len)
print("These columns are:")
print(mis_val_count[mis_val_count > 0])

Total number of missing entries:  3046
Total number of columns that have missing values:  3
These columns are:
pctsomecol18_24            2285
pctemployed16_over          152
pctprivatecoveragealone     609
dtype: int64


In [8]:
df.drop(["pctsomecol18_24", "index"], axis=1, inplace=True)

In [9]:
mis_val_count = df.isnull().sum()
mis_val_after_drop = mis_val_count[mis_val_count > 0]
mising_val_col_list = [i for i in mis_val_after_drop.index]
mising_val_col_list

['pctemployed16_over', 'pctprivatecoveragealone']

In [10]:
# Imputation
my_imputer = SimpleImputer(strategy="mean")
df[mising_val_col_list] = pd.DataFrame(my_imputer.fit_transform(df[mising_val_col_list]))

In [11]:
# so let's check if there is any missing values
mis_val_count = df.isnull().sum()
mis_val_after_imput = mis_val_count[mis_val_count > 0]
mis_val_after_imput

Series([], dtype: int64)

In [12]:
# select object columns
object_col = [col for col in df.columns if df[col].dtype == 'object']
print("Object columns: ", object_col)

Object columns:  ['binnedinc', 'geography']


In [13]:
print("Unique values in 'geography' column: \n", df['geography'].unique())
print("Number of the unique values is: ", df['geography'].nunique())

print('-'*280)

print("Unique values in 'binnedinc' column: \n", df['binnedinc'].unique())
print("Number of the unique values is: ", df['binnedinc'].nunique())

Unique values in 'geography' column: 
 ['Kitsap County, Washington' 'Kittitas County, Washington'
 'Klickitat County, Washington' ... 'Ford County, Kansas'
 'Franklin County, Kansas' 'Geary County, Kansas']
Number of the unique values is:  3047
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Unique values in 'binnedinc' column: 
 ['(61494.5, 125635]' '(48021.6, 51046.4]' '(42724.4, 45201]'
 '(51046.4, 54545.6]' '(37413.8, 40362.7]' '(40362.7, 42724.4]'
 '(54545.6, 61494.5]' '(34218.1, 37413.8]' '[22640, 34218.1]'
 '(45201, 48021.6]']
Number of the unique values is:  10


In [14]:
# Select Columns for Ordinal Encoding
col_for_ord_enc = [col for col in object_col if df[col].nunique() > 10]
print("Columns for Ordinal Encoding are: ", col_for_ord_enc)

Columns for Ordinal Encoding are:  ['geography']


In [15]:
# Apply ordinal encoder
ordinal_encoder = OrdinalEncoder()
df[col_for_ord_enc] = ordinal_encoder.fit_transform(df[col_for_ord_enc])

In [16]:
# Select Columns for One Hot Encoding
col_for_onehot_enc = [col for col in object_col if df[col].nunique() <= 10]
print("Columns for One Hot Encoding are: ", col_for_onehot_enc)

Columns for One Hot Encoding are:  ['binnedinc']


In [17]:
# Apply one-hot encoder
OH_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
OH_cols = pd.DataFrame(OH_encoder.fit_transform(df[col_for_onehot_enc]))

# One-hot encoding removed index; put it back
OH_cols.index = df.index

# Remove categorical columns (will replace with one-hot encoding)
df.drop(col_for_onehot_enc, axis=1, inplace=True)

# Add one-hot encoded columns to our data frame
df = pd.concat([df, OH_cols], axis=1)

In [18]:
# Preview df
df.head()

Unnamed: 0,avganncount,avgdeathsperyear,target_deathrate,incidencerate,medincome,popest2015,povertypercent,studypercap,medianage,medianagemale,...,0,1,2,3,4,5,6,7,8,9
0,1397.0,469,164.9,489.8,61898,260131,11.2,499.748204,39.3,36.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,173.0,70,161.3,411.6,48127,43269,18.6,23.111234,33.0,32.2,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,102.0,50,174.7,349.7,49348,21026,14.6,47.560164,45.0,44.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,427.0,202,194.8,430.4,44243,75882,17.1,342.637253,42.8,42.2,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,57.0,26,144.4,350.1,49955,10321,12.5,0.0,48.3,47.8,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [19]:
print("Data Frame Shape After Encoding Is: ", df.shape)

Data Frame Shape After Encoding Is:  (3047, 41)


In [20]:
# Declare feature vector and target variable
X = df.drop(columns = ['target_deathrate'])
y = df.target_deathrate

# Split data into separate training and test set
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

In [21]:
# Declare feature vector and target variable
X = df_copy.drop(['target_deathrate'], axis=1)
y = df_copy['target_deathrate']

# Split data into separate training and test set
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

In [22]:
# select columns to drop
drop_cols_list = ['index','pctsomecol18_24']
# Select numerical columns
numerical_cols = [cname for cname in  X_train.columns if X_train[cname].dtype in ['int64', 'float64'] and cname != drop_cols_list[0] and cname != drop_cols_list[1]]
# select object columns
object_col = [col for col in  X_train.columns if  X_train[col].dtype == 'object' and col != drop_cols_list[0] and col != drop_cols_list[1]]
# Select Columns for Ordinal Encoding
col_for_ord_enc = [col for col in object_col if  X_train[col].nunique() > 10]
# Select Columns for One Hot Encoding
col_for_onehot_enc = [col for col in object_col if  X_train[col].nunique() <= 10]
cat_geo_lvl = [val for val in df_copy[col_for_ord_enc].value_counts().index]
print("Numerical columns list:\n",numerical_cols)
print('-'*100)
print("Ordinal Encoding Columns list:\n",col_for_ord_enc)
print('-'*100)
print("One Hot Encoding Columns list:\n",col_for_onehot_enc)

Numerical columns list:
 ['avganncount', 'avgdeathsperyear', 'incidencerate', 'medincome', 'popest2015', 'povertypercent', 'studypercap', 'medianage', 'medianagemale', 'medianagefemale', 'percentmarried', 'pctnohs18_24', 'pcths18_24', 'pctbachdeg18_24', 'pcths25_over', 'pctbachdeg25_over', 'pctemployed16_over', 'pctunemployed16_over', 'pctprivatecoverage', 'pctprivatecoveragealone', 'pctempprivcoverage', 'pctpubliccoverage', 'pctpubliccoveragealone', 'pctwhite', 'pctblack', 'pctasian', 'pctotherrace', 'pctmarriedhouseholds', 'birthrate']
----------------------------------------------------------------------------------------------------
Ordinal Encoding Columns list:
 ['geography']
----------------------------------------------------------------------------------------------------
One Hot Encoding Columns list:
 ['binnedinc']


In [23]:
df_copy['avganncount'].head(1).values

array([1397.])

In [24]:
df_copy[col_for_ord_enc].head(1).values

array([['Kitsap County, Washington']], dtype=object)

In [25]:
df_copy[col_for_onehot_enc].head(1).values

array([['(61494.5, 125635]']], dtype=object)

In [26]:
model = RandomForestRegressor(random_state=0)

In [27]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
class CustomOrdinalEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return OrdinalEncoder().fit_transform(X);

In [28]:
# Preprocessing for df

# write a custom Column Dropper Transformer
class ColumnDropperTransformer():
    def __init__(self,columns):
        self.columns=columns

    def transform(self,X,y=None):
        return X.drop(self.columns,axis=1)

    def fit(self, X, y=None):
        return self 

# apply transformer
col_drop_trans = Pipeline([
    ("ColumnDropper", ColumnDropperTransformer(drop_cols_list))
])

# Preprocessing for numerical data
num_trans = SimpleImputer(strategy = "mean")

# Preprocessing for categorical data
# we will add simple imputer in case of missing values
# will be added in the future
ord_cat_trans = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', CustomOrdinalEncoder())
])

oh_cat_trans = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])


# Bundle preprocessing
preprocessor = ColumnTransformer(
    transformers= [  
        ('drop', col_drop_trans, drop_cols_list),
        ('num', num_trans, numerical_cols),
               
        ('ord_cat', ord_cat_trans, col_for_ord_enc), 
        ('oh_cat', oh_cat_trans, col_for_onehot_enc)
    ]
)

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])
my_pipeline

In [29]:
# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_test)

# Evaluate the model
score = mean_absolute_error(y_test, preds)
print('MAE:', score)

MAE: 13.341644262295086




In [30]:
# Multiply by -1 since sklearn calculates *negative* MAE
scores = -1 * cross_val_score(my_pipeline,
                              X,
                              y,
                              cv=5,
                              scoring='neg_mean_absolute_error')

print("MAE scores: \n", scores)
print('-'*280)
print("Average MAE score (across experiments):")
print(scores.mean())



MAE scores: 
 [13.98175082 13.70747213 14.03361905 14.32246798 14.48820197]
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Average MAE score (across experiments):
14.106702389835524




In [31]:
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.1)

# Let's add the Model to a Pipeline
# Bundle preprocessing and modeling code in a pipeline
XGB_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', my_model)
                             ])


# Make a copy to avoid changing original data
X_valid = X_test.copy()

#The problem is that pipelines do not fit eval_set.
#So, we need to preprocess X_valid.
#To do that the easiest way is using your pipeline without the 'model' step.
#Use the following code before fitting our pipeline:

# fit transform X_valid copy
preprocessor.fit(X_valid)
X_valid_transformed = preprocessor.transform(X_valid)

#Then fit our pipeline after changing model__eval_set as follows:
XGB_pipeline.fit(X_train, y_train, 
                model__eval_set=[(X_valid_transformed, y_test)],
                model__verbose=False)

preds = XGB_pipeline.predict(X_valid)

print("MAE Score is:", mean_absolute_error(y_test, preds))



MAE Score is: 11.879904102262904


