# Hyperparameter tuning, using all 8 classifiers individually. After choosing the best parameters for each classifier, they will be input into the ensemble models. 

In [1]:
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, log_loss
from yellowbrick.classifier import ClassificationReport
from mlxtend.evaluate import bias_variance_decomp
from sklearn.model_selection import StratifiedKFold
import seaborn as sns
import time
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
import time
import matplotlib.pyplot as plt
%matplotlib inline

# Load the modelling table: every column except the last one is used as a
# feature, and the `result` column is the prediction target.
df_model = pd.read_csv('../2019Data/df_Models2019.csv')
X, y = df_model.iloc[:, :-1], df_model['result']

# Hold out 20% of the rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Show the distinct target labels (binary classification).
print(y_train.unique())
X_train

[1 0]


Unnamed: 0,shoot_eff,score_op,off_rtg,def_rtg,sos,ie,efg_pct,to_poss,orb_pct,ft_rate,...,seed_7,seed_8,seed_9,seed_10,seed_11,seed_12,seed_13,seed_14,seed_15,seed_16
395,0.010383,0.010420,2.065874,1.369455,0.696419,1.379193,0.005367,-0.002182,0.024726,0.023743,...,0,0,255,0,0,0,0,0,0,0
549,-0.018557,0.018141,0.665276,0.421097,0.244179,1.796148,-0.012351,-0.022971,-0.008802,0.000361,...,0,0,0,0,0,0,0,255,0,0
841,0.105896,-0.008048,9.763181,4.680730,5.082451,2.546429,0.060811,-0.021138,-0.027615,-0.009111,...,0,0,0,0,0,0,0,0,0,0
108,0.126395,-0.000838,12.185547,3.469630,8.715917,4.780036,0.058712,-0.019422,-0.012708,0.017715,...,0,0,0,0,0,0,0,0,255,0
80,0.005848,-0.019555,-1.247879,7.960186,-9.208065,-7.418771,0.006311,0.012156,-0.007190,-0.002546,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,0.055898,0.012231,6.811015,-0.778397,7.589412,2.528593,0.021272,-0.014404,-0.001778,0.033474,...,0,0,0,0,0,0,0,0,0,0
947,0.009229,-0.021085,-1.639563,-1.936270,0.296706,-0.546009,0.011564,-0.012593,-0.069562,-0.042920,...,0,0,0,0,0,0,0,1,0,0
465,0.000643,0.054235,6.836000,-6.515616,13.351616,8.553518,0.001903,-0.000857,0.108363,0.015492,...,0,0,0,0,0,0,0,0,255,0
829,-0.124139,-0.034062,-15.593936,-6.490539,-9.103397,-0.273377,-0.071892,0.042400,0.018895,0.076976,...,0,0,0,0,0,0,0,0,0,0


In [2]:
def create_pipe(X_train, clf, clf_label='clf'):
    """
    Build an unfitted sklearn pipeline: column-wise preprocessing followed by ``clf``.

    Adapted from
    https://medium.com/vickdata/a-simple-guide-to-scikit-learn-pipelines-4ac0d974bdcf

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features. Only the column dtypes are inspected here, to decide
        which columns go to which transformer; nothing is fitted.
    clf : estimator
        Final estimator placed after the preprocessing step.
    clf_label : str, default 'clf'
        Name of the estimator step. It is also the prefix of the search-space
        keys used elsewhere in this file (e.g. ``clf__C``).

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps ``'preprocessor'`` and ``clf_label``.
    """
    # Numeric columns: fill gaps with the median, then standardise.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    # Text columns: fill gaps with a 'missing' category, then one-hot encode.
    # Categories unseen during fitting are ignored at prediction time.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # Select every numeric dtype, not only int64/float64: ColumnTransformer
    # drops the columns it is not given, so e.g. int32/float32/uint8 features
    # would otherwise silently disappear from the model.
    numeric_features = X_train.select_dtypes(include='number').columns
    categorical_features = X_train.select_dtypes(include=['object']).columns

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    return Pipeline(steps=[('preprocessor', preprocessor),
                           (clf_label, clf)])
    
def clfy_report(clf, X_train, X_test, y_train, y_test, param_grid, clf_label='clf', cv=10,
                n_iter=3, random_state=None):
    """
    Tune a classifier's hyperparameters with a randomized search, then print
    and plot its metrics on the held-out test set.

    Parameters
    ----------
    clf : estimator
        Classifier to tune. It must support ``predict_proba``.
    X_train, y_train
        Training features and labels used by the cross-validated search.
    X_test, y_test
        Held-out features and labels used only for evaluation.
    param_grid : dict
        Search space, keyed as ``'<clf_label>__<parameter>'``.
    clf_label : str, default 'clf'
        Name of the classifier step inside the pipeline.
    cv : int, default 10
        Number of cross-validation folds used by the search.
    n_iter : int, default 3
        Number of parameter settings sampled by the randomized search.
    random_state : int or None, default None
        Seed for the parameter sampling; pass an int for repeatable results.

    Returns
    -------
    None
        Results are printed and drawn on a new matplotlib figure.
    """
    # Preprocessing (imputation, scaling, one-hot encoding) followed by the classifier:
    pipe = create_pipe(X_train, clf, clf_label)

    # Randomized (not exhaustive) search over the parameter space with
    # cross-validation on the training data:
    search = RandomizedSearchCV(
        pipe, param_grid, cv=cv, n_iter=n_iter, random_state=random_state
    ).fit(X_train, y_train)

    # Predictions on the test set, data the model has not seen:
    predicted = search.predict(X_test)

    # Class probabilities are computed once and reused for AUC, ROC and log loss:
    class_probabilities = search.predict_proba(X_test)
    probabilities = class_probabilities[:, 1]

    # Accuracy scores for the training and test sets:
    train_accuracy = search.score(X_train, y_train)
    test_accuracy = search.score(X_test, y_test)

    # Model evaluation metrics:
    confusion_mtrx = confusion_matrix(y_test, predicted)
    auc = roc_auc_score(y_test, probabilities)
    fpr, tpr, _ = roc_curve(y_test, probabilities)
    logloss = log_loss(y_test, class_probabilities)

    print('Best Parameters: {}\n'.format(search.best_params_))
    print('Training Accuracy: {:0.2f}'.format(train_accuracy))
    print('Test Accuracy: {:0.2f}'.format(test_accuracy))
    print('Log Loss: {:0.2f}\n'.format(logloss))

    # Plot all metrics in a grid of subplots:
    fig = plt.figure(figsize=(12, 12))
    grid = plt.GridSpec(2, 4, wspace=0.75, hspace=0.5)

    # Top-left plot - confusion matrix:
    plt.subplot(grid[0, :2])
    sns.heatmap(confusion_mtrx, annot=True, fmt="d")
    plt.xlabel('Predicted Games')
    plt.ylabel('Actual Games')

    # Top-right plot - ROC curve against the chance diagonal:
    plt.subplot(grid[0, 2:])
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.plot(fpr, tpr, marker='.')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('AUROC: {:0.3f}'.format(auc))

    # Bottom-left plot - number of test games predicted per class:
    plt.subplot(grid[1, :2])
    sns.countplot(y=predicted, orient='h')
    plt.yticks([1, 0], ('wins', 'losses'))
    plt.ylabel('')
    plt.xlabel('Number Predicted')

    # Bottom-right plot - classification report.
    # Wrap the best pipeline found above rather than the search object:
    # depending on the yellowbrick version, visualizer.fit() may refit what it
    # wraps, and re-running the randomized search could select different
    # parameters from the ones printed above.
    plt.subplot(grid[1, 2:])
    visualizer = ClassificationReport(search.best_estimator_, classes=['losses', 'wins'])
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    g = visualizer.poof()

In [None]:
# One unfitted estimator per model family. The order must match `params`
# below, because the two lists are paired position by position.
classifiers = [
    SVC(probability=True),
    RandomForestClassifier(),
    LogisticRegression(),
    MLPClassifier(max_iter=500),  # max_iter=500 is meant to give the optimizer room to converge
    GaussianNB(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    XGBClassifier(),
]

# Grids shared by several search spaces.
n_estimators_grid = np.linspace(100, 200, 10, endpoint=True).astype(np.int32)  # default 100
c_grid = np.logspace(start=-10, stop=10, num=21)  # default 1.0

# 10 random candidate architectures: 3 hidden layers with 5-19 nodes each.
hidden_layer_candidates = tuple(
    tuple(layer_sizes)
    for layer_sizes in np.random.randint(low=5, high=20, size=(10, 3))
)

# Search spaces, keyed as '<pipeline step>__<parameter>'.
params = [
    {  # SVC
        'clf__kernel': ['rbf', 'linear', 'sigmoid'],
        'clf__C': c_grid,
    },
    {  # RandomForestClassifier
        'clf__n_estimators': n_estimators_grid,
        # The lower the greater the reduction of variance, but also the
        # greater the increase in bias.
        'clf__max_features': ['sqrt', 'log2'],
    },
    {  # LogisticRegression
        'clf__C': c_grid,
        'clf__penalty': ['none', 'l2'],
    },
    {  # Neural network multi-layered perceptron, MLPClassifier
        'clf__hidden_layer_sizes': hidden_layer_candidates,
    },
    {  # GaussianNB
        'clf__var_smoothing': [1e-8, 1e-9, 1e-10],
    },
    {  # AdaBoostClassifier
        'clf__n_estimators': n_estimators_grid,
        'clf__learning_rate': np.linspace(0.5, 1.5, 10, endpoint=True),  # default 1.0
    },
    {  # GradientBoostingClassifier
        'clf__n_estimators': n_estimators_grid,
        'clf__learning_rate': np.linspace(0.02, 0.18, 9, endpoint=True),  # default 0.1
    },
    {  # XGBClassifier
        # NOTE: clf__learning_rate is currently not part of this search space.
        'clf__max_depth': [2, 3, 4, 5],
        'clf__booster': ['gbtree', 'gblinear', 'dart'],
    },
]

# Tune and report each classifier against its own search space.
for classifier, search_space in zip(classifiers, params):
    print(classifier)
    clfy_report(classifier, X_train, X_test, y_train, y_test, param_grid=search_space, cv=20)


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)


# Ensemble Methods: VotingClassifier, StackingClassifier. 

Reminder: High bias implies an underfit model, high variance implies an overfit model. 

## First, an averaging ensemble (a soft-voting VotingClassifier) will be used to reduce the variance of the model. Then, a stacking ensemble (StackingClassifier) will be run for comparison, with the aim of reducing bias.

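- ensemble averaging methods (VotingClassifier): average the predictions of the base estimators to reduce variance.
- ensemble boosting methods: base estimators are built sequentially and one tries to reduce the bias of the combined estimator. Note that StackingClassifier is a stacking method rather than a boosting method: its base estimators are fit independently, and a final estimator learns how to combine their predictions.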
- Bagging methods work best with strong and complex models (e.g. where the data points are unpredictable), in contrast with boosting methods which usually work best with weak models (e.g., shallow decision trees).
- A model with high bias pays very little attention to the training data and oversimplifies the problem.
- A model with high variance pays too much attention to the training data and does not generalize to data it hasn’t seen before.



# StackingClassifier/VotingClassifier Pipeline

In [None]:
def clfy_report_ensemble(ensemble_voting, X, y, cv=10):
    """
    Cross-validate an ensemble classifier and print per-fold and mean metrics.

    StratifiedKFold is used to ensure that the classes are balanced equally in
    both training and testing, since some splits may otherwise be imbalanced.

    Parameters
    ----------
    ensemble_voting : estimator
        Classifier (or pipeline) providing ``fit``, ``predict_proba`` and
        ``score``. Despite the name it is used for both the voting and the
        stacking ensembles in this file. It is refitted on every fold.
    X : pandas.DataFrame
        Feature matrix; rows are selected by position.
    y : array-like
        Binary target aligned with the rows of ``X``.
    cv : int, default 10
        Number of stratified folds.

    Returns
    -------
    None
        Training accuracy, test accuracy, AUC and log loss are printed for
        each fold, followed by their means over all folds.
    """
    # Index the target by position. `y[train_index]` on a pandas Series is
    # label-based and only works when the Series has a default 0..n-1 index.
    y_values = np.asarray(y)

    total_train_accuracy = 0
    total_test_accuracy = 0
    total_auc = 0
    total_logloss = 0

    skf = StratifiedKFold(n_splits=cv, random_state=None, shuffle=False)
    for fold, (train_index, test_index) in enumerate(skf.split(X, y_values), start=1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y_values[train_index], y_values[test_index]
        ensemble_voting.fit(X_train, y_train)

        # Class probabilities are computed once and reused for AUC and log loss:
        class_probabilities = ensemble_voting.predict_proba(X_test)
        probabilities = class_probabilities[:, 1]

        # Accuracy scores for the training and test sets:
        train_accuracy = ensemble_voting.score(X_train, y_train)
        test_accuracy = ensemble_voting.score(X_test, y_test)

        # Model evaluation metrics:
        auc = roc_auc_score(y_test, probabilities)
        logloss = log_loss(y_test, class_probabilities)

        print('For fold: {}'.format(fold))
        print('Training Accuracy: {:0.2f}'.format(train_accuracy))
        print('Test Accuracy: {:0.2f}'.format(test_accuracy))
        print('AUC: {:0.2f}'.format(auc))
        print('Logloss: {:0.2f}\n'.format(logloss))

        # Update running totals.
        total_train_accuracy += train_accuracy
        total_test_accuracy += test_accuracy
        total_auc += auc
        total_logloss += logloss

    print('____________________________________')
    print('Final Cross-Validation Results:')
    print('____________________________________')
    print('Training Accuracy: {:0.2f}'.format(total_train_accuracy / cv))
    print('Test Accuracy: {:0.2f}'.format(total_test_accuracy / cv))
    print('AUC: {:0.2f}'.format(total_auc / cv))
    print('Logloss: {:0.2f}\n'.format(total_logloss / cv))

In [None]:
# Preprocessing shared by the ensembles: median-impute and standardise the
# int64/float64 columns; impute and one-hot encode the object columns.
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Base models configured with the best parameters reported by the
# hyperparameter tuning section above:
#   SVC:                        {'clf__kernel': 'sigmoid', 'clf__C': 100000000.0}
#   RandomForestClassifier:     {'clf__n_estimators': 166, 'clf__max_features': 'sqrt'}
#   LogisticRegression:         {'clf__penalty': 'l2', 'clf__C': 10000000.0}
#   MLPClassifier:              {'clf__hidden_layer_sizes': (10, 19, 8)}
#   GaussianNB:                 {'clf__var_smoothing': 1e-08}
#   AdaBoostClassifier:         {'clf__n_estimators': 177, 'clf__learning_rate': 0.7222222222222222}
#   GradientBoostingClassifier: {'clf__n_estimators': 188, 'clf__learning_rate': 0.04}
#   XGBClassifier:              {'clf__max_depth': 4, 'clf__booster': 'gblinear'}
classifiers = [
    SVC(probability=True, kernel='sigmoid', C=100000000.0),
    RandomForestClassifier(n_estimators=166, max_features='sqrt'),
    LogisticRegression(penalty='l2', C=10000000.0),
    MLPClassifier(max_iter=500, hidden_layer_sizes=(10, 19, 8)),
    GaussianNB(var_smoothing=1e-08),
    AdaBoostClassifier(n_estimators=177, learning_rate=0.72),
    GradientBoostingClassifier(n_estimators=188, learning_rate=0.04),
    XGBClassifier(max_depth=4, booster='gblinear'),
]

# Put the preprocessing in front of every base model; the classifier steps
# are named 'clf1' ... 'clf8'.
pipeline_classifiers = [
    Pipeline(steps=[('preprocessor', preprocessor), ('clf' + str(position), model)])
    for position, model in enumerate(classifiers, start=1)
]
del classifiers

from sklearn.ensemble import VotingClassifier

# Soft voting over the eight base pipelines (equal weights).
voting_names = ['svc', 'rdf', 'lgr', 'mlp', 'gau', 'ada', 'gbt', 'xgb']
voting_clf = VotingClassifier(
    estimators=list(zip(voting_names, pipeline_classifiers)),
    voting='soft',
    n_jobs=-1,
)
ensemble_voting = Pipeline(steps=[('ensemble', voting_clf)])

# Tip: sorted(ensemble_voting.get_params().keys()) lists the names that can be
# used to address the ensemble's parameters.
clfy_report_ensemble(ensemble_voting, X, y, cv=10)

In [None]:
from sklearn.ensemble import StackingClassifier

# Named, unfitted base learners for the stack.
classifiers = [
    ('svc', SVC(probability=True, kernel='rbf', C=1.0)),
    ('rdf', RandomForestClassifier(n_estimators=144, max_features='log2')),
    ('lgr', LogisticRegression(penalty='l2', C=10000.0)),
    ('mlp', MLPClassifier(max_iter=500, hidden_layer_sizes=(5, 18, 8))),
    ('gau', GaussianNB(var_smoothing=1e-08)),
    ('ada', AdaBoostClassifier(n_estimators=177, learning_rate=0.83)),
    ('gbt', GradientBoostingClassifier(n_estimators=144, learning_rate=0.04)),
    ('xgb', XGBClassifier(max_depth=4, booster='gblinear')),
]

# A logistic regression is the final estimator on top of the base learners;
# the shared preprocessing runs once, in front of the whole stack.
stack_clf = StackingClassifier(
    estimators=classifiers,
    final_estimator=LogisticRegression(),
)
pipe_stack_clf = Pipeline(steps=[
    ('preproc', preprocessor),
    ('stack', stack_clf),
])
clfy_report_ensemble(pipe_stack_clf, X, y, cv=10)

In [None]:
# Convert the pandas splits to numpy arrays before handing them to
# bias_variance_decomp.
# NOTE(review): this evaluates the bare `stack_clf`, i.e. without the
# preprocessing step that `pipe_stack_clf` puts in front of it - confirm that
# this is intended.
train_features, train_target = np.array(X_train), np.array(y_train)
test_features, test_target = np.array(X_test), np.array(y_test)

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
    stack_clf,
    train_features, train_target,
    test_features, test_target,
    loss='0-1_loss')

for template, value in (
        ('Average expected loss: %.3f', avg_expected_loss),
        ('Average bias: %.3f', avg_bias),
        ('Average variance: %.3f', avg_var)):
    print(template % value)

In [None]:
# Soft-voting ensemble built from the first eight (name, estimator) pairs in
# `classifiers`, with equal weights. The bare estimators are used here, not
# the `ensemble_voting` pipeline.
voting_for_decomp = VotingClassifier(
    estimators=classifiers[:8],
    voting='soft',
    n_jobs=-1)

# Convert the pandas splits to numpy arrays before handing them to
# bias_variance_decomp.
train_features, train_target = np.array(X_train), np.array(y_train)
test_features, test_target = np.array(X_test), np.array(y_test)

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
    voting_for_decomp,
    train_features, train_target,
    test_features, test_target,
    loss='0-1_loss')

for template, value in (
        ('Average expected loss: %.3f', avg_expected_loss),
        ('Average bias: %.3f', avg_bias),
        ('Average variance: %.3f', avg_var)):
    print(template % value)

# Results:

#### From these results, both the VotingClassifier and StackingClassifier models have:
- very low variance  (~0.10 - 0.14)
- low-moderate loss (~0.31 - 0.34)
- low-moderate bias (~0.30 - 0.32)
The results above imply that the final model should use the StackingClassifier to lower bias. However, the 10-fold cross-validated accuracy is around the same for the VotingClassifier (Training Accuracy: 0.82, Test Accuracy: 0.69) compared to the StackingClassifier (Training Accuracy: 0.64, Test Accuracy: 0.7).
### Therefore, either VotingClassifier or StackingClassifier could be used as the primary prediction model.

# Create a bracket using both methods. 

In [None]:
# Sample submission file: one row per possible matchup, identified by an `ID`
# string.
sample_submission_path = '../2019Data/SampleSubmissionStage2.csv'
df_predict = pd.read_csv(sample_submission_path)
df_predict.head()

In [None]:
def get_year_team1_team2(ID):
    """Return ``(year, team1, team2)`` parsed from a sample-submission ID.

    Parameters
    ----------
    ID : str
        Underscore-separated integers, e.g. ``'2019_1101_1113'``.

    Returns
    -------
    tuple of int
        One integer per underscore-separated field, in order.

    Raises
    ------
    ValueError
        If any field is not a valid integer.
    """
    # Return a real tuple rather than a lazy generator, so the result can be
    # indexed, compared and consumed more than once.
    return tuple(int(part) for part in ID.split('_'))

In [None]:
df_features = pd.read_csv('../2019Data/df_features.csv')
data = []

# Build one row of feature differences (team1 - team2) per possible matchup.
for _, row in df_predict.iterrows():

    year, team1, team2 = get_year_team1_team2(row.ID)
    in_season = df_features['Season'] == year

    # Feature row of the first team in the ID, for the season in the ID.
    # `.values[0]` raises IndexError if that team has no row for the season.
    team1_stats = df_features[in_season & (df_features['TeamID'] == team1)].values[0]

    # Feature row of the second team in the ID, for the same season.
    team2_stats = df_features[in_season & (df_features['TeamID'] == team2)].values[0]

    diff = team1_stats - team2_stats

    data.append(diff)

n_poss_games = len(df_predict)
# `Index.get_values()` was removed in pandas 1.0; `.values` works on old and
# new versions alike.
columns = df_features.columns.values
# The explicit reshape keeps the frame two-dimensional even when there are no
# matchups at all.
final_predictions = pd.DataFrame(
    np.array(data).reshape(n_poss_games, len(columns)), columns=columns)
# Season and TeamID were differenced along with the real features; they are
# identifiers, not predictors, so drop them.
final_predictions.drop(['Season', 'TeamID'], inplace=True, axis=1)

# StackingClassifier Bracket Predictions

In [None]:
from bracketeer import build_bracket
from IPython.display import Image

# Refit the stacking pipeline on the full data set before predicting the
# possible matchups.
pipe_stack_clf.fit(X, y)
print("StackingClassifier fit on data.")
predictions = pipe_stack_clf.predict_proba(final_predictions)[:, 1]

# Keep the probabilities within [0.05, 0.95] (presumably to bound the
# log-loss penalty of a confident miss - confirm).
clipped_predictions = np.clip(predictions, 0.05, 0.95)

# Item assignment always writes the column. Attribute assignment
# (`df_predict.Pred = ...`) only does so when the column already exists.
df_predict['Pred'] = clipped_predictions
df_predict.to_csv('search_stk.csv', index=False)

b = build_bracket(
        outputPath='search_stk.png', # in /Ryan
        submissionPath='search_stk.csv',
        teamsPath='../2019Data/Stage2DataFiles/Teams.csv',
        seedsPath='../2019Data/Stage2DataFiles/NCAATourneySeeds.csv',
        slotsPath='../2019Data/Stage2DataFiles/NCAATourneySlots.csv',
        year=2019
)
Image(filename='search_stk.png')

# VotingClassifier Bracket Predictions

In [None]:
from bracketeer import build_bracket
from IPython.display import Image

# Refit the voting ensemble on the full data set before predicting the
# possible matchups.
ensemble_voting.fit(X, y)
print("VotingClassifier fit on data.")
predictions = ensemble_voting.predict_proba(final_predictions)[:, 1]

# Keep the probabilities within [0.05, 0.95] (presumably to bound the
# log-loss penalty of a confident miss - confirm).
clipped_predictions = np.clip(predictions, 0.05, 0.95)

# Item assignment always writes the column. Attribute assignment
# (`df_predict.Pred = ...`) only does so when the column already exists.
df_predict['Pred'] = clipped_predictions
df_predict.to_csv('search_voting.csv', index=False)

b = build_bracket(
        outputPath='search_voting.png', # in /Ryan
        submissionPath='search_voting.csv',
        teamsPath='../2019Data/Stage2DataFiles/Teams.csv',
        seedsPath='../2019Data/Stage2DataFiles/NCAATourneySeeds.csv',
        slotsPath='../2019Data/Stage2DataFiles/NCAATourneySlots.csv',
        year=2019
)
# Display the image written by build_bracket above. This cell writes
# 'search_voting.png'; it never writes 'search_ens.png'.
Image(filename='search_voting.png')