In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearnex import patch_sklearn
patch_sklearn(verbose=False)
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestClassifier
import scipy.stats as stats
import ourfunctions

In [2]:
X = pd.read_csv('data/Training-set-values.csv')
y = pd.read_csv('data/Training-set-labels.csv')
y = pd.DataFrame(LabelEncoder().fit_transform(y.status_group))

X['date_recorded'] = pd.to_datetime(X['date_recorded']).astype(np.int64)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 829941045)

In [3]:
model_run = ourfunctions.Modeler(X=X, y=y)

#### Basic KNN model

Includes normal cross validate for comparison, to show class working as expected.


In [4]:
numeric_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='median'))]
)

numeric_preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_transformer, make_column_selector(dtype_include=np.number)),
    ]
)

In [5]:
knn = {'classifier': KNeighborsClassifier(), 'preprocessor': numeric_preprocessor}

model_run.add_model('KNN', knn)

In [6]:

cross_val_score(estimator=KNeighborsClassifier(),
                X=numeric_preprocessor.fit_transform(X_train), 
                y=y_train)


array([0.52839506, 0.51795735, 0.52177329, 0.52267116, 0.52457912])

In [7]:
model_run.train_all()

### Random Forest Attempt

In [8]:
RandomFM = {'classifier': RandomForestClassifier(max_depth=20, min_samples_split=4), 'preprocessor': numeric_preprocessor}
model_run.add_model('RandomForestNew', RandomFM)

In [9]:
model_run.train_model('RandomForestNew')

root - INFO - sklearn.ensemble.RandomForestClassifier.fit: running accelerated version on CPU
root - INFO - RandomForestNew has been fit.
root - INFO - sklearn.ensemble.RandomForestClassifier.fit: running accelerated version on CPU
root - INFO - sklearn.ensemble.RandomForestClassifier.predict: running accelerated version on CPU
root - INFO - sklearn.ensemble.RandomForestClassifier.fit: running accelerated version on CPU
root - INFO - sklearn.ensemble.RandomForestClassifier.predict: running accelerated version on CPU
root - INFO - sklearn.ensemble.RandomForestClassifier.fit: running accelerated version on CPU
root - INFO - sklearn.ensemble.RandomForestClassifier.predict: running accelerated version on CPU
root - INFO - sklearn.ensemble.RandomForestClassifier.fit: running accelerated version on CPU
root - INFO - sklearn.ensemble.RandomForestClassifier.predict: running accelerated version on CPU
root - INFO - sklearn.ensemble.RandomForestClassifier.fit: running accelerated version on CPU


In [11]:
RandomFM = {'classifier': RandomForestClassifier(max_depth=20, min_samples_split=4), 'preprocessor': None}
model_run.add_model('RandomForestMore', RandomFM)

model_run.train_model('RandomForestMore')

In [13]:
RFRCV_params = dict(n_estimators=list(range(100,105)),
                    criterion=['gini', 'entropy'])

search_options = {'n_jobs': 3, 'random_state': 9280210}

model_run.hyper_search('RandomForestMore', params=RFRCV_params, searcher_kwargs=search_options)

In [15]:
model_run.show_model('RandomForestMore')

RandomForestMore: {'classifier': RandomForestClassifier(max_depth=20, min_samples_split=4), 'preprocessor': ColumnTransformer(transformers=[('numeric',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f9eae7714f0>),
                                ('categorical',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='Missing',
                                                                strategy='constant')),
                                                 ('casting',
                                                  FunctionTransformer(func=<function to_object at 0x7f9eaf211dc0>)),
                                                 ('one_hot_encode',
                                                  One

<class 'sklearn.model_selection._search.RandomizedSearchCV'>
