# Model Training

Link to the project: potability of drinking water
[(fr)](https://drive.google.com/file/d/1FGNR1O8EKGVKpVB_PMb5Ty2LipYgoM8q/view?usp=sharing)
[(kaggle)](https://www.kaggle.com/artimule/drinking-water-probability)

In this notebook, we will train the model.

We will follow these different steps:
* Preprocessing
    - filling missing data
    - polynomial features?
    - binning features?
    - apply some transformation?
* Training models
* Ensembling models

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer

## Preprocessing

In [2]:
# Fraction of the rows held out as the test set (passed to train_test_split).
TEST_SIZE = 0.2
# Seed passed to train_test_split so the split is reproducible across runs.
RANDOM_STATE = 42

### Import the data

In [3]:
# Google Colab only: mount Google Drive under /content/drive so that the
# dataset stored there can be read like a local file.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load the water potability dataset from the mounted Google Drive.
path = "/content/drive/MyDrive/Best ML model ever/input/drinking_water_potability.csv"

df = pd.read_csv(path)
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890456,20791.31898,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.05786,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.54173,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.41744,8.059332,356.886136,363.266516,18.436525,100.341674,4.628771,0
4,9.092223,181.101509,17978.98634,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


### Train/test split

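Since all the features are continuous, we use a plain random split rather than stratified sampling. (Note: the target `Potability` is binary with about 39% of positive samples, so stratifying on the target would be an option if we wanted exactly the same class ratio in both sets.)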

In [5]:
# Hold out TEST_SIZE of the rows for evaluation; the fixed seed keeps the
# partition identical from one run to the next.
train_set, test_set = train_test_split(
    df,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)
print("Train shape: {}".format(train_set.shape))
print("Test shape: {}".format(test_set.shape))

Train shape: (2620, 10)
Test shape: (656, 10)


In [6]:
# Separate the predictors from the 'Potability' label in both partitions.
# The labels are copied so that they are independent from the source frames.
X_train, y_train = train_set.drop(columns="Potability"), train_set["Potability"].copy()
X_test, y_test = test_set.drop(columns="Potability"), test_set["Potability"].copy()

### Feature engineering

#### Adding features

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of every column,
# used to compare the scale and spread of the features.
df.describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,2785.0,3276.0,3276.0,3276.0,2495.0,3276.0,3276.0,3114.0,3276.0,3276.0
mean,7.080795,196.369496,22014.092526,7.122277,333.775777,426.205111,14.28497,66.396293,3.966786,0.39011
std,1.59432,32.879761,8768.570828,1.583085,41.41684,80.824064,3.308162,16.175008,0.780382,0.487849
min,0.0,47.432,320.942611,0.352,129.0,181.483754,2.2,0.738,1.45,0.0
25%,6.093092,176.850538,15666.6903,6.127421,307.699498,365.734414,12.065801,55.844536,3.439711,0.0
50%,7.036752,196.967627,20927.833605,7.130299,333.073546,421.884968,14.218338,66.622485,3.955028,0.0
75%,8.062066,216.667456,27332.762125,8.114887,359.95017,481.792305,16.557652,77.337473,4.50032,1.0
max,14.0,323.124,61227.19601,13.127,481.030642,753.34262,28.3,124.0,6.739,1.0


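We notice that 'Solids' has by far the largest scale and spread of all the features (standard deviation ≈ 8769, against at most ≈ 81 for the other ones). One way to compress its range is to apply a log transformation; here we add `log1p(Solids)` as a new feature.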

In [77]:
class AttributesAdder(BaseEstimator, TransformerMixin):
    '''Transformer that appends a log-transformed copy of the 'Solids' feature.

    Parameters
    ----------
    add_Solids_log : bool
        When True (default), ``log1p`` of the 'Solids' column is appended as
        the last column of the output. When False, no column is added.

    Notes
    -----
    DataFrame input is converted to a NumPy array, so column names are not
    preserved in the output.

    For DataFrame input the 'Solids' column is located by name, which keeps
    the transformer correct when preceding steps removed or reordered columns.
    For array input (or a DataFrame without a 'Solids' column) the positional
    index ``solids_ix`` is used instead.
    '''

    # Name of the source column, used when the input is a DataFrame.
    solids_col = "Solids"
    # Positional fallback: 'Solids' is the third column of the dataset loaded
    # in this notebook (ph, Hardness, Solids, ...).
    solids_ix = 2

    def __init__(self, add_Solids_log = True):
        self.add_Solids_log = add_Solids_log

    def fit(self, X, y=None):
        '''Nothing is learned from the data; returns the transformer itself.'''
        return self

    def transform(self, X, y=None):
        '''Return X with ``log1p(Solids)`` appended as the last column.'''
        solids_ix = self.solids_ix
        if isinstance(X, pd.DataFrame):
            # Resolve the position by name before the labels are lost: a fixed
            # index silently targets another feature if columns were dropped.
            if self.solids_col in X.columns:
                solids_ix = X.columns.get_loc(self.solids_col)
            X = X.values
        if not self.add_Solids_log:
            return X
        # log1p (= log(1 + x)) is used rather than log so that 0 maps to 0.
        Solids_log = np.log1p(X[:, solids_ix])
        return np.c_[X, Solids_log]

In [78]:
# Sanity check of the transformer on the whole frame (target column included).
res = AttributesAdder().fit_transform(df)
# The transformer returns a bare array, so re-attach the original column names
# followed by the name of the appended feature.
res = pd.DataFrame(res, columns=list(df.columns) + ['Solids_log'])
res

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability,Solids_log
0,,204.890456,20791.31898,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0.0,9.942339
1,3.716080,129.422921,18630.05786,6.635246,,592.885359,15.180013,56.329076,4.500656,0.0,9.832585
2,8.099124,224.236259,19909.54173,9.275884,,418.606213,16.868637,66.420093,3.055934,0.0,9.899005
3,8.316766,214.373394,22018.41744,8.059332,356.886136,363.266516,18.436525,100.341674,4.628771,0.0,9.999680
4,9.092223,181.101509,17978.98634,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0.0,9.797015
...,...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681736,47580.99160,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1.0,10.770210
3272,7.808856,193.553212,17329.80216,8.061362,,392.449580,19.903225,,2.798243,1.0,9.760241
3273,9.419510,175.762646,33155.57822,7.350233,,432.044783,11.039070,69.845400,3.298875,1.0,10.408996
3274,5.126763,230.603758,11983.86938,6.303357,,402.883113,11.168946,77.488213,4.708658,1.0,9.391400


In [79]:
# Same summary as before, now including 'Solids_log', to compare its spread
# with the one of the raw 'Solids' column.
res.describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability,Solids_log
count,2785.0,3276.0,3276.0,3276.0,2495.0,3276.0,3276.0,3114.0,3276.0,3276.0,3276.0
mean,7.080795,196.369496,22014.092526,7.122277,333.775777,426.205111,14.28497,66.396293,3.966786,0.39011,9.911939
std,1.59432,32.879761,8768.570828,1.583085,41.41684,80.824064,3.308162,16.175008,0.780382,0.487849,0.445392
min,0.0,47.432,320.942611,0.352,129.0,181.483754,2.2,0.738,1.45,0.0,5.774373
25%,6.093092,176.850538,15666.6903,6.127421,307.699498,365.734414,12.065801,55.844536,3.439711,0.0,9.659356
50%,7.036752,196.967627,20927.833605,7.130299,333.073546,421.884968,14.218338,66.622485,3.955028,0.0,9.948883
75%,8.062066,216.667456,27332.762125,8.114887,359.95017,481.792305,16.557652,77.337473,4.50032,1.0,10.215878
max,14.0,323.124,61227.19601,13.127,481.030642,753.34262,28.3,124.0,6.739,1.0,11.022363


### Building the pipeline

In [22]:
class RemoveNull(BaseEstimator, TransformerMixin):
    '''Transformer that deletes the rows or the columns containing null values.

    Parameters
    ----------
    direction : int
        Axis passed to ``DataFrame.dropna``: 0 (default) removes the rows
        containing a null value, 1 removes the columns containing one.

    Attributes
    ----------
    null_columns_ : list
        Labels of the columns that contained a null value in the data seen by
        ``fit``.

    Notes
    -----
    The input must be a pandas DataFrame.

    In column mode the columns to remove are learned during ``fit`` and the
    same ones are removed by every later ``transform`` call, so that all
    datasets end up with the same features. Null values found at transform
    time in the remaining columns are left untouched.

    In row mode only X is filtered: labels handled separately by the caller
    are not realigned.
    '''

    def __init__(self, direction=0):
        self.direction = direction

    def fit(self, X, y=None):
        '''Record which columns of X contain at least one null value.'''
        self.null_columns_ = list(X.columns[X.isna().any(axis=0)])
        return self

    def transform(self, X, y=None):
        '''Return X without the rows or the columns holding null values.'''
        if self.direction in (1, "columns") and hasattr(self, "null_columns_"):
            # Drop the columns learned in fit rather than the ones that happen
            # to contain nulls in X, to keep the feature set consistent.
            return X.drop(columns=self.null_columns_)
        # Row mode, or transformer used without a prior fit: rely on the data
        # at hand.
        return X.dropna(axis=self.direction)

In [74]:
def preprocessing_pipeline(missing="median", scaling="standard"):
    """
    Build a preprocessing pipeline with the given preprocessing strategies.

    The pipeline chains three steps, in this order: 'missing' (handling of
    the missing values), 'attribs_adder' (custom features) and 'scaling'.

    Parameters
    ----------
    missing : string
        Specify the strategy for dealing with the missing values (default is "median")
        Possible values: "mean", "median", "most_frequent", "remove_rows", "remove_cols", "regression", "stochastic", "knn"
    scaling : string
        Specify the strategy for dealing with the scaling (default is "standard")
        Possible values: "standard", "min_max"

    Returns
    -------
    sklearn.Pipeline
        The preprocessing pipeline with given strategies

    Raises
    ------
    ValueError
        If `missing` or `scaling` is not one of the possible values.
    """
    missing_options = ("mean", "median", "most_frequent", "remove_rows",
                       "remove_cols", "regression", "stochastic", "knn")
    scaling_options = ("standard", "min_max")

    # Validate both arguments up front: an unknown value would otherwise leave
    # a local variable unassigned and fail later with an obscure error.
    if missing not in missing_options:
        raise ValueError(
            "Unknown missing strategy {!r}; expected one of {}".format(
                missing, missing_options))
    if scaling not in scaling_options:
        raise ValueError(
            "Unknown scaling strategy {!r}; expected one of {}".format(
                scaling, scaling_options))

    # Missing
    if missing in ["mean", "median", "most_frequent"]:
        missing_imputer = SimpleImputer(strategy=missing)
    elif missing in ["remove_rows", "remove_cols"]:
        # NOTE(review): "remove_rows" filters the rows of X only; confirm how
        # the matching labels are handled by the caller.
        missing_imputer = RemoveNull(0 if missing == "remove_rows" else 1)
    elif missing in ["regression", "stochastic"]:
        # "stochastic" samples the imputed values instead of using the
        # point prediction of the regressor.
        missing_imputer = IterativeImputer(
            sample_posterior=(missing == "stochastic"))
    else:  # "knn"
        missing_imputer = KNNImputer()

    # Added attributes
    attr_adder = AttributesAdder()

    # Scaling
    if scaling == "standard":
        scaler = StandardScaler()
    else:  # "min_max"
        scaler = MinMaxScaler(feature_range=(-1, 1))

    return Pipeline([
        ('missing', missing_imputer),
        ('attribs_adder', attr_adder),
        ('scaling', scaler)
    ])


In [80]:
# Build the pipeline with the default strategies (median imputation and
# standard scaling).
pipeline = preprocessing_pipeline()

In [81]:
# List every parameter of the pipeline; the parameters of a step are exposed
# as '<step name>__<parameter name>'.
pipeline.get_params()

{'attribs_adder': AttributesAdder(add_Solids_log=True),
 'attribs_adder__add_Solids_log': True,
 'memory': None,
 'missing': SimpleImputer(add_indicator=False, copy=True, fill_value=None,
               missing_values=nan, strategy='median', verbose=0),
 'missing__add_indicator': False,
 'missing__copy': True,
 'missing__fill_value': None,
 'missing__missing_values': nan,
 'missing__strategy': 'median',
 'missing__verbose': 0,
 'scaling': StandardScaler(copy=True, with_mean=True, with_std=True),
 'scaling__copy': True,
 'scaling__with_mean': True,
 'scaling__with_std': True,
 'steps': [('missing',
   SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                 missing_values=nan, strategy='median', verbose=0)),
  ('attribs_adder', AttributesAdder(add_Solids_log=True)),
  ('scaling', StandardScaler(copy=True, with_mean=True, with_std=True))],
 'verbose': False}

In [83]:
# Access the individual steps of the pipeline by their name.
pipeline.named_steps

{'attribs_adder': AttributesAdder(add_Solids_log=True),
 'missing': SimpleImputer(add_indicator=False, copy=True, fill_value=None,
               missing_values=nan, strategy='median', verbose=0),
 'scaling': StandardScaler(copy=True, with_mean=True, with_std=True)}