In [22]:
import pandas as pd
import numpy as np
import seaborn as sns
import logging
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [11]:
# Read the feature table and the label table from the local data/ directory.
X, y = (
    pd.read_csv(csv_path)
    for csv_path in ('data/Training-set-values.csv', 'data/Training-set-labels.csv')
)

# Turn the recording date into a datetime column on X itself.
X['date_recorded'] = X['date_recorded'].pipe(pd.to_datetime)

# Hold out a quarter of the rows for testing; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=829941045,
)

In [6]:
# Starts empty; presumably collects names of columns to drop later -- TODO confirm.
drop_cols = list()

In [7]:
# Number of missing values in each column of X (shown as the cell output).
X.isnull().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [21]:
# Split the columns of X by dtype: int64/float64 columns go to the numeric
# branch, every other dtype goes to the categorical branch.
# NOTE(review): "every other dtype" includes date_recorded once it has been
# converted to datetime -- confirm that is intended.
num_cols = X.select_dtypes(['int64', 'float64']).columns
cat_cols = X.select_dtypes(exclude=['int64', 'float64']).columns

# Numeric branch: fill gaps with the column median.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical branch: fill gaps with the literal string 'Missing'.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
])

# Apply each branch to its own group of columns.
preprocessor = ColumnTransformer([
    ("numeric", numeric_transformer, num_cols),
    ("categorical", categorical_transformer, cat_cols),
])


float

In [None]:
class Modeler:
    """
    Modeling pipeline. It has basic defaults and can accept new models and transformers.
    Models should be added in the form of:

    {'classifier': <classifier>,
     'preprocessor': <preprocessor>}

    'preprocessor' may be None (or left out) when the default preprocessor
    passed to the constructor is acceptable.

    Constructing an instance calls ``logging.basicConfig`` with the ``log``
    file name ('model-run.log' by default). No method writes log records yet.
    """

    # Hold-out fraction and seed used when the constructor splits X and y.
    _TEST_SIZE = 0.25
    _RANDOM_STATE = 829941045

    def __init__(self, models=None, num=numeric_transformer, cat=categorical_transformer, prep=preprocessor, X=None, y= None, log='model-run.log'):
        """
        Store the model registry and default transformers, configure logging,
        and split X/y into train and test sets when both are given.

        Parameters
        ----------
        models : dict or None
            Mapping of model name -> model spec dict. None starts an empty registry.
        num, cat : transformers kept as the default numeric / categorical steps.
        prep : default preprocessor, used by train_model when a model has none.
        X, y : optional features and labels. When both are provided they are
            split with train_test_split; otherwise the stored splits are None.
        log : file name handed to logging.basicConfig.
        """
        # A literal {} default is created once and shared by every instance,
        # so models added to one Modeler would leak into the next one.
        self._models = {} if models is None else models
        self._numeric=num
        self._categorical=cat
        self._preprocessor=prep
        self._log = log
        logging.basicConfig(filename=log, level=logging.DEBUG)
        # Compare against None explicitly: truth-testing a DataFrame raises
        # "ValueError: The truth value of a DataFrame is ambiguous".
        if X is not None and y is not None:
            split = train_test_split(
                X, y, test_size=self._TEST_SIZE, random_state=self._RANDOM_STATE
            )
        else:
            split = (None, None, None, None)
        self._X_train, self._X_test, self._y_train, self._y_test = split

    def add_model(self, name, model):
        """
        Register a model spec under ``name`` and reset its result slots.

        The spec is shallow-copied so the caller's dict is not modified and the
        same spec can be registered under several names without sharing results.
        """
        entry = dict(model)
        # Guarantee the key exists so "use the default preprocessor" works.
        entry.setdefault('preprocessor', None)
        entry['cv_output'] = None
        entry['fit_classifier'] = None
        entry['time_ran'] = None
        self._models[name] = entry

    def change_prep(self, name, prep):
        """Replace the preprocessor of the model registered under ``name``."""
        self._models[name]['preprocessor'] = prep

    def show_model(self, name):
        """Print the spec dict stored under ``name``."""
        print(f"{name}: {self._models[name]}")

    def train_model(self, name, X_train=None, y_train=None, print=True):
        """
        Fit the preprocessor and classifier of the model registered under ``name``.

        Parameters
        ----------
        name : key of a registered model; KeyError if unknown.
        X_train, y_train : training data. Each falls back to the split stored
            by the constructor when left as None.
        print : currently unused. It shadows the builtin inside this method
            but is kept so keyword callers keep working.

        Raises
        ------
        ValueError
            If no training data was passed and none was stored at construction.
        """
        # "is None" rather than "not ...": DataFrames cannot be truth-tested.
        if X_train is None:
            X_train = self._X_train
        if y_train is None:
            y_train = self._y_train
        if X_train is None or y_train is None:
            raise ValueError(
                "No training data: pass X_train/y_train or construct Modeler with X and y."
            )
        model = self._models[name]
        # Fall back to the default preprocessor. The check is explicit because
        # a sklearn Pipeline defines __len__, so "a or b" could misfire.
        prep = model.get('preprocessor')
        if prep is None:
            prep = self._preprocessor
        X_train_processed = prep.fit_transform(X_train)
        model['fit_classifier'] = model['classifier'].fit(X_train_processed, y_train)

    def train_all(self, X_train, y_train, print=False):
        """Not implemented yet; currently does nothing."""
        pass

    def test_model(self, name, X_test, y_test, print=True):
        """Not implemented yet; currently does nothing."""
        pass

    def test_all(self, X_test, y_test, print=False):
        """Not implemented yet; currently does nothing."""
        pass

    def plot_models(self):
        """Not implemented yet. Original note: Skylar slide style."""
        pass
