In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearnex import patch_sklearn
patch_sklearn(verbose=False)
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestClassifier
import scipy.stats as stats
import ourfunctions

In [2]:
X = pd.read_csv('data/Training-set-values.csv')
y = pd.read_csv('data/Training-set-labels.csv')
y = pd.DataFrame(LabelEncoder().fit_transform(y.status_group))

X['date_recorded'] = pd.to_datetime(X['date_recorded']).astype(np.int64)

#### Preprocessors

In [3]:
# Super basic numeric transformer

numeric_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='median'))]
)

numeric_preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_transformer, make_column_selector(dtype_include=np.number)),
    ]
)

### Models

In [4]:
knn = {'classifier': KNeighborsClassifier(n_jobs=3), 'preprocessor': None}
log_reg_basic = {'classifier': LogisticRegression(C=1e6, n_jobs=3), 'preprocessor': None}
RandomFM_basic = {'classifier': RandomForestClassifier(max_depth=20, min_samples_split=4, n_jobs=3), 'preprocessor': numeric_preprocessor}
RandomFM_all_cols = {'classifier': RandomForestClassifier(max_depth=20, min_samples_split=4, n_jobs=3), 'preprocessor': None}
RandomFM_default = {'classifier': RandomForestClassifier(n_jobs=3), 'preprocessor': None} # Included for RandomCVSearch later on

models = {'knn': knn, 'log_reg_basic': log_reg_basic, 'RandomFM_basic': RandomFM_basic, 'RandomFM_all_cols': RandomFM_all_cols, 'RandomFM_default': RandomFM_default}

### Modeler

In [5]:
model_run = ourfunctions.Modeler(models, X=X, y=y)

# after the model_run object is created so we can add onto the default preprocessor.
log_reg_regularized = {'classifier': LogisticRegression(n_jobs=3), 'preprocessor': model_run.create_default_prep(num_add=[('scaling', StandardScaler())])}
model_run.add_model('log_reg_regularized', log_reg_regularized)

### Search parameters and kwargs

In [6]:
LogRegRCV_params = dict(penalty=['l1', 'l2', 'elasticnet'],
                        C=stats.uniform(loc=1, scale=10),
                        max_iter=list(range(100,400)))

RandForestRCV_params = dict(n_estimators=list(range(100,300)),
                            criterion=['gini', 'entropy'],
                            max_depth = list(range(20,50)),
                            min_samples_split = list(range(2, 10)))

search_options = {'n_jobs': 3, 'random_state': 9280210, 'n_iter': 20}

In [7]:
model_run.hyper_search('log_reg_regularized', params=LogRegRCV_params, searcher_kwargs=search_options)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
model_run.hyper_search('RandomFM_default', params=RandForestRCV_params, searcher_kwargs=search_options, set_to_train=True)

In [9]:
model_run.test_model('RandomFM_default')

root - INFO - sklearn.ensemble.RandomForestClassifier.predict: fallback to original Scikit-learn
root - INFO - sklearn.ensemble.RandomForestClassifier.predict_proba: fallback to original Scikit-learn
root - INFO - RandomFM_default test score: 0.7924579124579124
