# Making The Models And Getting Data
___

## Introduction
This Jupyter notebook can refresh the data produced by the Web_Get functions, get match-by-match data from Web_Get, aid in model selection, and provides two working models: GaussianNB and an XGBoost classifier.
***

## Updater
`updater` is a function that updates the data from Web_Get. This comes in handy when the calculations in Web_Get change or when the data from The Blue Alliance is updated. That no longer happens for this project, because all of the matches have been played. It takes about 20 minutes to run.

In [1]:
def updater():
    """Re-run the Web_Get prediction steps for every event up to week 6.

    For each event returned by ``wg.get_all_events(6, False)`` this calls
    ``wg.get_first_pred`` and ``wg.predict_matches`` with ``new=True``
    (presumably forcing a recomputation instead of reusing stored results --
    confirm against Web_Get). Progress is shown with a tqdm bar.

    The other Web_Get steps (``get_rankings``, ``get_matches``,
    ``more_team_stats``) are deliberately not re-run here.
    """
    import ipynb.fs.defs.Web_Get as wg
    from tqdm import tqdm

    # Show what Web_Get exposes; handy when checking the notebook import.
    print(dir(wg))

    all_events = wg.get_all_events(6, False)
    refresh = True

    with tqdm(total=len(all_events)) as progress:
        for event_key in all_events:
            wg.get_first_pred(event_key, new=refresh)
            wg.predict_matches(event_key, new=refresh)
            progress.update(1)

In [2]:
#updater()

## Get Match Data
`get_match_data` is a function that gets the match-by-match data for a given set of parameters. The `week` parameter selects which weeks are used when getting the data. If `only` is true, just the chosen week is used; otherwise the chosen week and all previous weeks are used. Setting `with_rs` to false drops all statistics involving ranking score from the data set.

In [3]:
def get_match_data(week=6, with_rs=True, only=False):
    """Collect per-match features and outcomes for the selected events.

    Args:
        week: Week passed to ``wg.get_all_events`` to select events.
        with_rs: When False, the ranking-score related columns are dropped
            from the returned features.
        only: Passed to ``wg.get_all_events`` together with ``week``.

    Returns:
        A tuple ``(X, y)``: ``X`` is a DataFrame with the stats of every
        event stacked row-wise (fresh 0..n-1 index), ``y`` is a flat NumPy
        array with the matching ``blue_wins`` values.

    Raises:
        ValueError: If no events are returned for the given parameters.
    """
    import ipynb.fs.defs.Web_Get as wg
    from tqdm import tqdm
    import numpy as np
    import pandas as pd

    events = wg.get_all_events(week, only)
    if len(events) == 0:
        raise ValueError(
            "no events found for week={!r}, only={!r}".format(week, only))

    # Collect per-event pieces and join once at the end: DataFrame.append was
    # removed in pandas 2.0, and appending inside the loop copies the whole
    # frame on every iteration.
    frames = []
    outcomes = []
    print("Loading data")
    with tqdm(total=len(events)) as pbar:
        for event in events:
            stats, blue_wins = wg.predict_matches(event)
            frames.append(stats)
            outcomes.append(np.ravel(blue_wins))
            pbar.update(1)

    X = pd.concat(frames, ignore_index=True)
    y = np.concatenate(outcomes)

    if not with_rs:
        # Non in-place drop so frames handed out by Web_Get are never mutated.
        X = X.drop(['Ranking_Score_a', 'tba_rpEarned_OAVE', 'tba_rpEarned_CPR', 'tba_rpEarned_OPR', 'tba_rpEarned_DAVE', 'tba_rpEarned_DPR'], axis=1)
    return X, y

In [4]:
#X, y = get_match_data(week=1)

## Model Selection
`model_selection` is a function that produces data to help with model selection. A function in 'visuals.py', in this same directory, visualizes the data returned by this function. The inputs are a set of X and y data and a flag saying whether PCA is used for the model selection. If `pca` is true, the input data is assumed to have already been PCA transformed.

In [5]:
def model_selection(X, y, pca=True):
    """Fit several default classifiers and collect comparison metrics.

    The features are scaled to [-1, 1], split 85/15 into train and test sets
    (fixed seed 42), and each candidate classifier is fitted and evaluated.

    Args:
        X: Feature matrix. With ``pca=True`` any array-like accepted by
            ``MinMaxScaler`` (the data is expected to be PCA-transformed
            already); with ``pca=False`` a DataFrame, whose column labels are
            preserved. The caller's object is not modified.
        y: Target labels, one per row of ``X``.
        pca: Selects how ``X`` is scaled, see above.

    Returns:
        A tuple ``(results, test_preds, X_test, y_test)``. ``results`` maps a
        classifier name to ``[train accuracy, test accuracy, test F-beta
        (beta=0.5), test average precision, fit time in seconds]``;
        ``test_preds`` maps the same names to the test-set predictions.
    """
    from time import perf_counter

    from sklearn.metrics import fbeta_score, accuracy_score, average_precision_score
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.model_selection import train_test_split
    import xgboost as xgb
    from sklearn.svm import SVC
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.linear_model import LogisticRegression

    beta = 0.5
    random_state = 42

    # NOTE(review): the scaler is fitted on the full data set before the
    # train/test split, so test-set ranges leak into the scaling. Kept as in
    # the original so results stay comparable.
    # feature_range must be a tuple; recent scikit-learn rejects a list.
    scaler = MinMaxScaler(feature_range=(-1, 1))
    if pca:
        X = scaler.fit_transform(X)
    else:
        # Work on a copy so the caller's DataFrame is not rescaled in place.
        X = X.copy()
        headers = X.columns.values
        X[headers] = scaler.fit_transform(X[headers])

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.15,
                                                        random_state=random_state)

    print("Training set has {} samples and {} features.".format(X_train.shape[0], X_train.shape[1]))
    print("Testing set has {} samples and {} features.".format(X_test.shape[0], X_test.shape[1]))

    tree = DecisionTreeClassifier(random_state=random_state)
    try:
        # scikit-learn >= 1.2 names the weak learner ``estimator``.
        ada = AdaBoostClassifier(estimator=tree, random_state=random_state)
    except TypeError:
        # Older releases only know ``base_estimator``.
        ada = AdaBoostClassifier(base_estimator=tree, random_state=random_state)

    candidates = [
        ('XGB Classifier', xgb.XGBClassifier(random_state=random_state)),
        ('SVC', SVC(random_state=random_state, cache_size=1000)),
        ('AdaBoostClassifier', ada),
        ('GaussianNB', GaussianNB()),
        ('Logistic Regression', LogisticRegression(random_state=random_state)),
    ]

    results = {}
    test_preds = {}
    for name, clf in candidates:
        # time.clock was removed in Python 3.8; perf_counter measures the
        # wall-clock duration of the fit.
        tic = perf_counter()
        clf.fit(X_train, y_train)
        fit_seconds = perf_counter() - tic

        ypred_train = clf.predict(X_train)
        ypred_test = clf.predict(X_test)

        results[name] = [
            accuracy_score(y_train, ypred_train),
            accuracy_score(y_test, ypred_test),
            fbeta_score(y_test, ypred_test, beta=beta),
            average_precision_score(y_test, ypred_test),
            fit_seconds,
        ]
        test_preds[name] = ypred_test

    return results, test_preds, X_test, y_test

In [6]:
#model_selection(X, y)

## GaussianNB Model
This is the model for GaussianNB. Wrapping it in a function allows for easy customization of the model.

In [7]:
def GaussianNB_model(var_smoothing, X_train, y_train):
    """Fit a Gaussian naive Bayes classifier.

    Args:
        var_smoothing: A single value or a sequence of candidate values for
            GaussianNB's ``var_smoothing``. One value is applied directly.
            Several values are compared with a default ``GridSearchCV`` and
            the best refitted estimator is returned. ``None`` or an empty
            sequence keeps the library default.
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``GaussianNB`` estimator.
    """
    import numpy as np
    from sklearn.naive_bayes import GaussianNB

    # Callers pass a list (e.g. ``[1e-9]``), mirroring the grid-style
    # arguments of XGBoost_model, so normalise scalars and sequences alike.
    if var_smoothing is None:
        candidates = []
    else:
        candidates = [float(value) for value in np.ravel(var_smoothing)]

    if len(candidates) > 1:
        from sklearn.model_selection import GridSearchCV
        search = GridSearchCV(GaussianNB(), {'var_smoothing': candidates})
        search.fit(X_train, y_train)
        return search.best_estimator_

    clf = GaussianNB(var_smoothing=candidates[0]) if candidates else GaussianNB()
    clf.fit(X_train, y_train)
    return clf

## XGBoost Model
This is the model for the XGBoost classifier. It uses a grid search algorithm to find the best-performing set of hyperparameters. For the final model, only one set of parameters is input into the model, but this robust setup is very useful for hyperparameter tuning.

In [8]:
def XGBoost_model(max_depth, learning_rate, n_estimators, X_train, y_train, beta, random_state, return_grid_fit=False):
    """Fit a default XGBoost classifier and a grid-searched one.

    Args:
        max_depth: List of ``max_depth`` candidates for the grid search.
        learning_rate: List of ``learning_rate`` candidates.
        n_estimators: List of ``n_estimators`` candidates.
        X_train: Training features.
        y_train: Training labels.
        beta: Beta of the F-beta score that selects the best grid point.
        random_state: Seed for the classifier and for the CV splitter, so
            repeated calls give the same search result.
        return_grid_fit: When True, the fitted ``GridSearchCV`` object is
            returned as a third element.

    Returns:
        ``(clf, best_clf)`` or ``(clf, best_clf, grid_fit)``: ``clf`` is the
        classifier fitted with default hyperparameters, ``best_clf`` the
        estimator refitted on the best grid point.
    """
    from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
    from sklearn.metrics import fbeta_score, accuracy_score, make_scorer, average_precision_score
    import xgboost as xgb
    import warnings

    parameters = {'max_depth': max_depth, 'learning_rate': learning_rate, 'n_estimators': n_estimators}

    # Seed the splitter too; otherwise every call draws different folds and
    # the selected hyperparameters are not reproducible.
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.20, random_state=random_state)

    # NOTE(review): average precision is scored on hard predictions here, as
    # in the rest of this notebook, not on decision scores.
    scoring = {
        'scorer1': make_scorer(fbeta_score, beta=beta),
        'scorer2': make_scorer(accuracy_score),
        'scorer3': make_scorer(average_precision_score),
    }

    # Silence warnings only while fitting instead of switching them off for
    # the whole process.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        clf = xgb.XGBClassifier(random_state=random_state)
        clf.fit(X_train, y_train)

        grid_obj = GridSearchCV(clf, parameters, scoring=scoring, refit='scorer1', n_jobs=8, cv=cv, verbose=0)
        grid_fit = grid_obj.fit(X_train, y_train)

    best_clf = grid_fit.best_estimator_

    if return_grid_fit:
        return clf, best_clf, grid_fit
    return clf, best_clf

## Future Matches GaussianNB
`future_matches_GNB` is a function that trains the GaussianNB model on the weeks prior to the input week and tests the performance on the input week. Thus, these are the future match predictions of the model.

In [9]:
def future_matches_GNB(week, with_rs=True, only=False, num_pca=5):
    """Train GaussianNB on earlier weeks and evaluate it on ``week``.

    Training data comes from ``wg.get_all_events(week=week-1, only=only)``,
    evaluation data from the events of ``week`` alone. Features are reduced
    with PCA and scaled to [-1, 1]; both transforms are fitted on the training
    data only. The model is compared with the predictions returned by
    ``wg.get_first_pred``.

    Args:
        week: Week to predict; the model is trained on ``week - 1``.
        with_rs: When False, ranking-score related columns are dropped.
        only: Passed to ``wg.get_all_events`` for the training events.
        num_pca: Number of PCA components kept.

    Returns:
        ``(both_right, both_wrong, mine_right, first_right, X1, predictions,
        pred1, y1, results)``: the four agreement counts between the model and
        the FIRST predictions, the transformed test features, the model
        predictions, the FIRST predictions, the true outcomes and a
        one-column DataFrame of metrics and set sizes.

    Raises:
        ValueError: If the training or the test week has no events.
    """
    from sklearn.metrics import fbeta_score, accuracy_score, average_precision_score
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import MinMaxScaler
    import ipynb.fs.defs.Web_Get as wg
    from tqdm import tqdm
    import numpy as np
    import pandas as pd

    beta = 0.5
    rs_columns = ['Ranking_Score_a', 'tba_rpEarned_OAVE', 'tba_rpEarned_CPR', 'tba_rpEarned_OPR', 'tba_rpEarned_DAVE', 'tba_rpEarned_DPR']

    def load_events(events, with_first):
        """Stack stats/outcomes (and optionally FIRST predictions) per event."""
        if len(events) == 0:
            raise ValueError("no events available to load match data from")
        frames, outcomes, first_pred, first_actual = [], [], [], []
        with tqdm(total=len(events)) as pbar:
            for event in events:
                stats, blue_wins = wg.predict_matches(event)
                frames.append(stats)
                outcomes.append(np.ravel(blue_wins))
                if with_first:
                    preds = wg.get_first_pred(event)
                    first_pred.append(np.array([1 if x == 'blue' else 0 for x in preds[:, 0]]))
                    first_actual.append(np.array([1 if x == 'blue' else 0 for x in preds[:, 1]]))
                pbar.update(1)
        # One concat instead of DataFrame.append (removed in pandas 2.0).
        features = pd.concat(frames, ignore_index=True)
        if not with_rs:
            # Non in-place so frames handed out by Web_Get are never mutated.
            features = features.drop(rs_columns, axis=1)
        labels = np.concatenate(outcomes)
        if not with_first:
            return features, labels, None, None
        return features, labels, np.concatenate(first_pred), np.concatenate(first_actual)

    print("Loading match data to train on")
    X, y, _, _ = load_events(wg.get_all_events(week=week-1, only=only), False)

    pca = PCA(n_components=num_pca)
    pca.fit(X)
    X = pca.transform(X)

    # feature_range must be a tuple; recent scikit-learn rejects a list.
    X_fit = MinMaxScaler(feature_range=(-1, 1)).fit(X)
    X = X_fit.transform(X)

    print("Training set has {} samples and {} features.".format(X.shape[0], X.shape[1]))

    clf = GaussianNB_model([1e-9], X, y)

    print()
    print("Loading future data to predict on")
    X1, y1, pred1, actual1 = load_events(wg.get_all_events(week=week, only=True), True)

    # Reuse the transforms fitted on the training weeks.
    X1 = pca.transform(X1)
    X1 = X_fit.transform(X1)

    predictions = clf.predict(X1)

    model_accuracy = accuracy_score(y1, predictions)
    model_fbeta = fbeta_score(y1, predictions, beta=beta)
    model_ap = average_precision_score(y1, predictions)
    first_accuracy = accuracy_score(actual1, pred1)
    first_fbeta = fbeta_score(actual1, pred1, beta=beta)
    first_ap = average_precision_score(actual1, pred1)

    index = ['model_accuracy_score','model_fbeta_score','model_average_precision_score','first_accuracy_score','first_fbeta_score','first_average_precision_score','training_set_size','testing_set_size']
    raw_results = [model_accuracy, model_fbeta, model_ap, first_accuracy, first_fbeta, first_ap, X.shape[0], X1.shape[0]]
    results = pd.DataFrame(raw_results, index=index)

    print(model_accuracy)
    print(model_fbeta)
    print(model_ap)
    print()
    print(first_accuracy)
    print(first_fbeta)
    print(first_ap)

    # NOTE(review): the agreement counts compare the FIRST predictions with
    # y1 (from predict_matches) while the FIRST metrics above use actual1
    # (from get_first_pred) -- confirm both encode the same outcome.
    mine_real = np.equal(predictions, y1)
    first_real = np.equal(pred1, y1)

    both_right = int(np.sum(mine_real & first_real))
    both_wrong = int(np.sum(~mine_real & ~first_real))
    mine_right = int(np.sum(mine_real & ~first_real))
    first_right = int(np.sum(~mine_real & first_real))

    return both_right, both_wrong, mine_right, first_right, X1, predictions, pred1, y1, results

In [10]:
#future_matches_GNB(2, False)

## Future Matches XGB
`future_matches_XGB` is a function that trains the XGBoost model on the weeks prior to the input week and tests the performance on the input week. Thus, these are the future match predictions of the model.

In [11]:
def future_matches_XGB(week, with_rs=True, only=False):
    """Train the XGBoost model on earlier weeks and evaluate it on ``week``.

    Training data comes from ``wg.get_all_events(week=week-1, only=only)``,
    evaluation data from the events of ``week`` alone. Features are scaled to
    [-1, 1] with a scaler fitted on the training data only. The model is
    compared with the predictions returned by ``wg.get_first_pred``.

    Args:
        week: Week to predict; the model is trained on ``week - 1``.
        with_rs: When False, ranking-score related columns are dropped.
        only: Passed to ``wg.get_all_events`` for the training events.

    Returns:
        ``(both_right, both_wrong, mine_right, first_right, X1_pca,
        predictions, pred1, y1, best_clf, results)``: the four agreement
        counts between the model and the FIRST predictions, a 2-component PCA
        projection of the scaled test features, the model predictions, the
        FIRST predictions, the true outcomes, the fitted classifier and a
        one-column DataFrame of metrics and set sizes.

    Raises:
        ValueError: If the training or the test week has no events.
    """
    from sklearn.metrics import fbeta_score, accuracy_score, average_precision_score
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import MinMaxScaler
    import ipynb.fs.defs.Web_Get as wg
    from tqdm import tqdm
    import numpy as np
    import pandas as pd

    beta = 0.5
    random_state = 42
    rs_columns = ['Ranking_Score_a', 'tba_rpEarned_OAVE', 'tba_rpEarned_CPR', 'tba_rpEarned_OPR', 'tba_rpEarned_DAVE', 'tba_rpEarned_DPR']

    def load_events(events, with_first):
        """Stack stats/outcomes (and optionally FIRST predictions) per event."""
        if len(events) == 0:
            raise ValueError("no events available to load match data from")
        frames, outcomes, first_pred, first_actual = [], [], [], []
        with tqdm(total=len(events)) as pbar:
            for event in events:
                stats, blue_wins = wg.predict_matches(event)
                frames.append(stats)
                outcomes.append(np.ravel(blue_wins))
                if with_first:
                    preds = wg.get_first_pred(event)
                    first_pred.append(np.array([1 if x == 'blue' else 0 for x in preds[:, 0]]))
                    first_actual.append(np.array([1 if x == 'blue' else 0 for x in preds[:, 1]]))
                pbar.update(1)
        # One concat instead of DataFrame.append (removed in pandas 2.0); the
        # result is a fresh frame, so scaling it below cannot touch the
        # frames handed out by Web_Get.
        features = pd.concat(frames, ignore_index=True)
        if not with_rs:
            features = features.drop(rs_columns, axis=1)
        labels = np.concatenate(outcomes)
        if not with_first:
            return features, labels, None, None
        return features, labels, np.concatenate(first_pred), np.concatenate(first_actual)

    print("Loading match data to train on")
    X, y, _, _ = load_events(wg.get_all_events(week=week-1, only=only), False)

    # feature_range must be a tuple; recent scikit-learn rejects a list.
    headers = X.columns.values
    fit_scaler = MinMaxScaler(feature_range=(-1, 1)).fit(X[headers])
    X[headers] = fit_scaler.transform(X[headers])

    print("Training set has {} samples and {} features.".format(X.shape[0], X.shape[1]))

    # Single-point grid: the final hyperparameters go through the same
    # grid-search entry point used for tuning.
    clf, best_clf = XGBoost_model([3], [0.1], [100], X, y, beta, random_state)

    print()
    print("Loading future data to predict on")
    X1, y1, pred1, actual1 = load_events(wg.get_all_events(week=week, only=True), True)

    # Reuse the scaler fitted on the training weeks.
    headers = X1.columns.values
    X1[headers] = fit_scaler.transform(X1[headers])

    predictions = best_clf.predict(X1)

    model_accuracy = accuracy_score(y1, predictions)
    model_fbeta = fbeta_score(y1, predictions, beta=beta)
    model_ap = average_precision_score(y1, predictions)
    first_accuracy = accuracy_score(actual1, pred1)
    first_fbeta = fbeta_score(actual1, pred1, beta=beta)
    first_ap = average_precision_score(actual1, pred1)

    index = ['model_accuracy_score','model_fbeta_score','model_average_precision_score','first_accuracy_score','first_fbeta_score','first_average_precision_score','training_set_size','testing_set_size']
    raw_results = [model_accuracy, model_fbeta, model_ap, first_accuracy, first_fbeta, first_ap, X.shape[0], X1.shape[0]]
    results = pd.DataFrame(raw_results, index=index)

    print(model_accuracy)
    print(model_fbeta)
    print(model_ap)
    print()
    print(first_accuracy)
    print(first_fbeta)
    print(first_ap)

    # NOTE(review): the agreement counts compare the FIRST predictions with
    # y1 (from predict_matches) while the FIRST metrics above use actual1
    # (from get_first_pred) -- confirm both encode the same outcome.
    mine_real = np.equal(predictions, y1)
    first_real = np.equal(pred1, y1)

    both_right = int(np.sum(mine_real & first_real))
    both_wrong = int(np.sum(~mine_real & ~first_real))
    mine_right = int(np.sum(mine_real & ~first_real))
    first_right = int(np.sum(~mine_real & first_real))

    # 2-D projection of the test features, fitted on the test week itself.
    pca = PCA(n_components=2)
    pca.fit(X1)
    X1_pca = pca.transform(X1)

    return both_right, both_wrong, mine_right, first_right, X1_pca, predictions, pred1, y1, best_clf, results

In [12]:
#future_matches_XGB(2, False)