In [None]:
%matplotlib inline

# General libraries
import pandas as pd
import numpy as np
import os
import copy
import warnings
import statsmodels.api as sm
from scipy import stats

# Plotting and printing libraries
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.patches as mpatches
import pprint
# from matplotlib.pyplot import figure, imshow, axis
from matplotlib.image import imread
from matplotlib.figure import figaspect


# Model-building libraries
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import normalize, MinMaxScaler, StandardScaler, RobustScaler, Normalizer, scale

# SK-learn libraries for learning
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC, LinearSVC

# SK-learn libraries for evaluation
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics

In [None]:
# Class label (as the string key used in classification reports) -> display name.
label_names = dict([
    ('1', 'Spruce/Fir'),
    ('2', 'Lodgepole Pine'),
    ('3', 'Ponderosa Pine'),
    ('4', 'Cottonwood/Willow'),
    ('5', 'Aspen'),
    ('6', 'Douglas Fir'),
    ('7', 'Krummholz'),
])

# Plotting helpers

# Integer class code -> hex colour used when drawing that class.
color_dict = dict(enumerate(
    ['#A7C6ED', '#BA0C2F', '#651D32', '#8C8985', '#212721', '#002F6C', '#FFC000'],
    start=1,
))

# One legend patch per class, pairing each colour with its display name.
# The individual cat_N names are kept because the charting code refers to them.
cat_1, cat_2, cat_3, cat_4, cat_5, cat_6, cat_7 = (
    mpatches.Patch(color=color_dict[code], label=label_names[str(code)])
    for code in range(1, 8)
)


In [None]:
def apply_scaler(scaler, data):
    '''Fit ``scaler`` on ``data`` and return the scaled values as a DataFrame.

    Parameters:
        scaler: object exposing ``fit_transform(data)``.
        data: DataFrame to scale; its column names are reused for the result.

    Returns:
        A new DataFrame built from the scaler output. Only the column names are
        carried over from ``data``; no index is passed to the constructor.
    '''
    return pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

In [None]:
def cross_validate_model(model, X, y, name="model", folds=5, verbose=False):
    '''Cross-validate ``model`` with stratified K-fold splits and print the mean scores.

    For every fold the model is re-fitted on the training part and scored on the
    held-out part with ``metrics.classification_report``; precision, recall and
    f1-score are then averaged over the folds and printed.

    Parameters:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X: feature set; must support ``.iloc`` positional indexing.
        y: label set aligned with ``X``; must support ``.iloc``.
        name: label printed in the header line.
        folds: number of stratified splits.
        verbose: when True, print one row per class and per average instead of
            only the ``weighted avg`` row.

    Returns:
        None. Results are printed.
    '''
    # NOTE(review): catch_warnings only restores the warning filters on exit; no
    # filter is installed here, so warnings raised inside are still shown.
    with warnings.catch_warnings(record=False):
        # StratifiedKFold is given no random_state, so the global NumPy seed is
        # what makes the shuffled splits repeatable.
        np.random.seed(10)

        reportFields = ['precision', 'recall', 'f1-score']
        # report key -> {score name -> list of per-fold values}; filled lazily so
        # that only keys actually present in the fold reports are tracked.
        generalReport = {}

        skf = StratifiedKFold(n_splits=folds, shuffle=True)
        for train_indexes, valid_indexes in skf.split(X, y):
            foldXTrain, foldYTrain = X.iloc[train_indexes], y.iloc[train_indexes]
            foldXValid, foldYValid = X.iloc[valid_indexes], y.iloc[valid_indexes]

            model.fit(foldXTrain, foldYTrain)
            foldValidPred = model.predict(foldXValid)
            # classification_report expects (y_true, y_pred); passing them the
            # other way round swaps precision with recall.
            foldReport = metrics.classification_report(foldYValid, foldValidPred, output_dict=True)

            for key, scores in foldReport.items():
                if not isinstance(scores, dict):
                    # Scalar entries (e.g. 'accuracy') carry no per-score breakdown.
                    continue
                collected = generalReport.setdefault(key, {field: [] for field in reportFields})
                for outputField in reportFields:
                    collected[outputField].append(scores[outputField])

        title = f'Model: {name}'
        if verbose: title += f', with {folds} folds'
        print(title)

        fields = sorted(generalReport) if verbose else ['weighted avg']
        for field in fields:
            if field not in generalReport:
                continue
            fieldLabel = label_names.get(field, field)
            output = f'\t\t{fieldLabel:<20} | '
            for outputField in reportFields:
                output += f'{outputField}: {np.mean(generalReport[field][outputField]):>5.2f} | '
            print(output)
        if verbose: print()

In [None]:
def test_model(model, X_train, y_train, X_test, y_test, name="model", verbose=False):
    '''Fit ``model`` on the training data and print its scores on the test data.

    Parameters:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: data the model is fitted on.
        X_test, y_test: held-out data the fitted model is scored on.
        name: label printed in the header line.
        verbose: when True, print one row per class and per average instead of
            only the ``weighted avg`` row (shown as "Final").

    Returns:
        None. Results are printed.
    '''
    # NOTE(review): catch_warnings only restores the warning filters on exit; no
    # filter is installed here, so warnings raised inside are still shown.
    with warnings.catch_warnings(record=False):
        model.fit(X_train, y_train)
        testPrediction = model.predict(X_test)
        # classification_report expects (y_true, y_pred); passing them the other
        # way round swaps precision with recall.
        testReport = metrics.classification_report(y_test, testPrediction, output_dict=True)

        print(f'Model: {name}')
        reportFields = ['precision', 'recall', 'f1-score']
        # Keep only entries that hold a per-score breakdown; scalar entries such
        # as 'accuracy' cannot be indexed by score name.
        scoredRows = {key: row for key, row in testReport.items() if isinstance(row, dict)}
        fields = sorted(scoredRows) if verbose else ['weighted avg']
        for field in fields:
            # The weighted average is the headline number, so it is labelled "Final".
            fieldLabel = "Final" if field == 'weighted avg' else label_names.get(field, field)
            output = f'\t\t{fieldLabel:<20} | '
            for outputField in reportFields:
                output += f'{outputField}: {scoredRows[field][outputField]:>5.2f} | '
            print(output)
        if verbose: print()
    

In [None]:
def show_photos(list_of_files, multiplier):
    '''Display the given image files side by side in a single row.

    Parameters:
        list_of_files: sequence of image paths readable by ``imread``.
        multiplier: factor applied to the ``figaspect(0.5)`` figure size.
    '''
    width, height = figaspect(0.5) * multiplier
    image_ct = len(list_of_files)
    fig = plt.figure(figsize=(width, height))
    for position, path in enumerate(list_of_files, start=1):
        # Each new subplot becomes the current axes that the pyplot calls draw on.
        fig.add_subplot(1, image_ct, position)
        plt.imshow(imread(path))
        plt.axis('off')

In [None]:
# show_photos(list_of_files = ['imgs/1_blue-spruce-tree.jpg',
#                  'imgs/1_engelmann-spruce.jpg',
#                 'imgs/1_subalpine-fir.jpg',
#                 'imgs/1_white-fir-tree.jpg'], multiplier=4)

In [None]:
def get_ranges(df, label_column, range_column, verbose=False):
    '''Compute the min, max and mean of ``range_column`` for each label.

    Parameters:
        df: DataFrame holding both columns.
        label_column: name of the column whose distinct values define the groups.
        range_column: name of the column that is summarised.
        verbose: when True, print the sorted list of labels.

    Returns:
        Three arrays ``(minimums, maximums, means)``, each ordered by sorted label.
    '''
    labels = sorted(df[label_column].unique())
    if verbose:
        print(f'Labels: {labels}')

    values = df[range_column]
    # Select each label's rows once and reuse the selection for all three statistics.
    groups = [values[df[label_column] == label] for label in labels]

    minimums = np.asarray([group.min() for group in groups])
    maximums = np.asarray([group.max() for group in groups])
    means = np.asarray([group.mean() for group in groups])
    return minimums, maximums, means

In [None]:
def get_quartiles(df, label_column, range_column, verbose=False):
    '''Compute the quartiles of ``range_column`` for each label.

    Parameters:
        df: DataFrame holding both columns.
        label_column: name of the column whose distinct values define the groups.
        range_column: name of the column that is summarised.
        verbose: when True, print the sorted list of labels.

    Returns:
        Three arrays in the order ``(first quartile, third quartile, median)``,
        each ordered by sorted label. Note the median comes last.
    '''
    labels = sorted(df[label_column].unique())
    if verbose:
        print(f'Labels: {labels}')

    values = df[range_column]
    # Select each label's rows once and reuse the selection for all three quantiles.
    groups = [values[df[label_column] == label] for label in labels]

    lower = np.asarray([group.quantile(q=0.25) for group in groups])
    median = np.asarray([group.quantile(q=0.5) for group in groups])
    upper = np.asarray([group.quantile(q=0.75) for group in groups])
    return lower, upper, median

In [None]:
def make_range_chart(list_of_lists, color_mapping_dict, title_text_dict,
                     aspect_ratio, share_y=True,
                     category_ct=7, verbose=False,
                     save=None):
    '''Draw one floating-bar "range" panel per entry of ``list_of_lists``.

    Each panel shows, for categories ``1..category_ct``, a coloured bar spanning
    the lower to the upper value, plus a green marker bar starting at the third
    value.

    Parameters:
        list_of_lists: non-empty sequence of data lists. Each data list holds three
            array-likes ``(lower, upper, marker)`` with one value per category
            (presumably the output of get_ranges / get_quartiles - confirm with
            callers).
        color_mapping_dict: maps category number (1-based int) to a bar colour.
        title_text_dict: must contain ``'figtitle'`` for the figure title. With
            several panels, panel ``i`` (0-based) uses key ``i``. With a single
            panel, key ``1`` is used, falling back to key ``0`` if ``1`` is absent.
        aspect_ratio: passed to ``figaspect`` to size the figure.
        share_y: whether the panels share their y axis.
        category_ct: number of categories drawn per panel.
        verbose: when True, print progress messages.
        save: optional path; when given the figure is written there.

    Raises:
        ValueError: if ``list_of_lists`` is empty.
    '''
    panel_ct = len(list_of_lists)
    if verbose:
        print(f'Length of list of lists: {panel_ct}')
    if panel_ct == 0:
        raise ValueError('list_of_lists must contain at least one data list')

    # Size the figure that actually holds the panels. Creating a separate
    # plt.figure() first would leave an empty figure behind and the requested
    # aspect ratio would never reach the chart.
    w, h = figaspect(aspect_ratio)
    fig, axs = plt.subplots(ncols=panel_ct, nrows=1, sharey=share_y,
                            figsize=(w, h), squeeze=False)
    axes = axs[0]  # squeeze=False always yields a 2-D array, even for one panel

    ind = range(1, category_ct + 1)
    width = 0.4
    coloration = [color_mapping_dict[x] for x in ind]
    # The marker bar is drawn taller when several panels share the figure.
    marker_height = 3 if panel_ct > 1 else 1

    for i, (ax, data_list) in enumerate(zip(axes, list_of_lists)):
        base = np.asarray(data_list[0])
        upper = np.asarray(data_list[1])
        mean = np.asarray(data_list[2])
        middle = upper - base
        # Round the tallest upper value up to the next multiple of 100 so the
        # white filler bars end on a round number.
        top = int(np.ceil(upper.max() / 100.0)) * 100
        ceiling = top - upper

        if verbose and panel_ct > 1:
            print(f'Selecting subplot {i}')

        # White bars below and above leave only the lower-to-upper span coloured.
        ax.bar(ind, base, width, color='w')
        ax.bar(ind, middle, width, bottom=base, color=coloration)
        ax.bar(ind, ceiling, width, bottom=(base + middle), color='w')
        ax.bar(ind, marker_height, bottom=mean, color='g')

        if panel_ct > 1:
            title_key = i
        else:
            # Single-panel callers have historically keyed their title as 1.
            title_key = 1 if 1 in title_text_dict else 0
        ax.set_title(title_text_dict[title_key])

    fig.subplots_adjust(top=0.75)
    fig.suptitle(title_text_dict['figtitle'], fontsize=16)
    # The legend always uses the module-level cat_1..cat_7 patches and is anchored
    # in figure coordinates so it sits at the figure's top-right corner.
    lgd = axes[-1].legend(handles=[cat_1, cat_2, cat_3, cat_4, cat_5, cat_6, cat_7],
                          bbox_to_anchor=(1, 1),
                          bbox_transform=fig.transFigure)
    if save is not None:
        fig.savefig(save, bbox_extra_artists=(lgd,), bbox_inches='tight')
        if verbose:
            print(f'File saved to {save}')
    plt.show()