# Boosting

In [127]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import ourfunctions
import logging
import xgboost

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, FunctionTransformer, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeRegressor
from sklearn import svm

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline



In [128]:
X = pd.read_csv('data/Training-set-values.csv')
y = pd.read_csv('data/Training-set-labels.csv')
y = pd.DataFrame(LabelEncoder().fit_transform(y.status_group))

X['date_recorded'] = pd.to_datetime(X['date_recorded']).astype(np.int64)

### Preprocessors

In [130]:
numeric_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='median'))]
)

numeric_preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_transformer, make_column_selector(dtype_include=np.number)),
    ]
)

## Models

In [129]:
model_run = ourfunctions.Modeler(X=X, y=y)

### Adaptive Boosting (AdaBoost)

Adapts two related series of weights, one assigned to the datapoints and the other to the learners themselves. Datapoints that are incorrectly classifeid receive greater weights for the next learner sequence. This allows learners to make better preditictions at the end of the sequence. THe leaners that make better predictions have more weight in the final prediction.

In [131]:
AdaBoostClass = {'classifier': AdaBoostClassifier(), 'preprocessor': numeric_preprocessor}
model_run.add_model('AdaBoost', AdaBoostClass)

In [132]:
model_run.train_model('AdaBoost')

  return f(**kwargs)
root - INFO - AdaBoost has been fit.
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
root - INFO - Cross validate scores for AdaBoost: [0.62906846 0.62940516 0.63310887 0.62873176 0.61851852]


In [139]:
model_run.test_model('AdaBoost')

root - INFO - AdaBoost test score: 0.6263973063973064


## Gradient Boosting

In [133]:
GradBoostClass = {'classifier': GradientBoostingClassifier(random_state=829941045), 'preprocessor': numeric_preprocessor}
model_run.add_model('GradBoost', GradBoostClass)

In [134]:
model_run.train_model('GradBoost')

  return f(**kwargs)
root - INFO - GradBoost has been fit.
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
root - INFO - Cross validate scores for GradBoost: [0.65690236 0.66262626 0.66419753 0.65387205 0.64680135]


In [135]:
model_run.test_model('GradBoost')

root - INFO - GradBoost test score: 0.6572390572390573


In [None]:
GradBoostClass2 = {'classifier': GradientBoostingClassifier(random_state=829941045), 'preprocessor': numeric_preprocessor, regressor__min_samples_leaf}
model_run.add_model('GradBoost2', GradBoostClass2)

## Support Vector Machine

From documentation:
Support vector machines (SVMs) are a set of supervised learning methods used for classification, regression and outliers detection.

The advantages of support vector machines are:

Effective in high dimensional spaces.

Still effective in cases where number of dimensions is greater than the number of samples.

Uses a subset of training points in the decision function (called support vectors), so it is also memory efficient.

Versatile: different Kernel functions can be specified for the decision function. Common kernels are provided, but it is also possible to specify custom kernels.

The disadvantages of support vector machines include:

If the number of features is much greater than the number of samples, avoid over-fitting in choosing Kernel functions and regularization term is crucial.

SVMs do not directly provide probability estimates, these are calculated using an expensive five-fold cross-validation (see Scores and probabilities, below).

In [136]:
SVM = {'classifier': svm.SVC(random_state=829941045), 'preprocessor': numeric_preprocessor}
model_run.add_model('SVM', SVM)

In [137]:
model_run.train_model('SVM')

  return f(**kwargs)
root - INFO - SVM has been fit.
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
root - INFO - Cross validate scores for SVM: [0.54242424 0.54242424 0.54242424 0.54253648 0.54242424]


In [138]:
model_run.test_model('SVM')

root - INFO - SVM test score: 0.544983164983165
