In [1]:
import pandas as pd
import numpy as np
import time
import copy
import pickle


from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score

In [30]:
class ShaoXing:

    def __init__(self):
        """Create an analyser: reset internal state via `_initialise_objects` and print a confirmation."""
        
        self._initialise_objects()

        print('ShaoXing Initialised')



    def _initialise_objects(self):
        
        self._seed = 18980305
        self.train_analyse_df = None
        self.val_analyse_df = None
        self.test_analyse_df = None



    def read_in_features_label(self, features, label):

        self.features = features
        self.label = label



    def read_in_full_train_test_data(self, full_data, train_data, val_data, test_data):
        """ Read in Full, Train, Validate, Test data along with features list and labels, auto transforms to x, y """

        if self.features == None or self.label == None:
            print("Missing features list and label. Please run .read_in_features_label before running this function")
            return

        for col in self.features:
            assert col in full_data.columns
            assert col in train_data.columns
            assert col in val_data.columns
            assert col in test_data.columns

        

        self.full_data = full_data
        print('Full Data read in successfully')
        self.train_data = train_data
        print('Train Data read in successfully')
        self.val_data = val_data
        print('Validation Data read in successfully')
        self.test_data = test_data
        print('Test Data read in successfully')


        self.train_x, self.train_y = self._separate_feature_label(train_data)
        self.val_x, self.val_y = self._separate_feature_label(val_data)
        self.test_x, self.test_y = self._separate_feature_label(test_data)
    


    def _separate_feature_label(self, data):
        return data[self.features], data[self.label]



    def read_in_future_data(self, future_data):
        """ Read in Train Test Split data """

        assert len(future_data.columns) in (len(self.features), len(self.features)+ 1)

        for col in self.features:
            assert col in future_data.columns
        
        self.future_pred = None # reset future predictions given new data
        self.future_data_and_pred = None

        self.future_data = future_data
        print('Future Data read in successfully')


    
    def read_in_untrained_model(self, model_class, best_params, type):
        """ Reads in underlying model object for tuning, and also read in what type of model it is """

        assert type == 'Classification' or type == 'Regression' # check

        # record
        self.model_fitted = 0
        self.model_class = model_class
        self.clf_type = type 
        self.parameters = best_params

        print(f'Successfully read in untrained model {self.model_class} with hyperparameters {self.parameters}, which is a {self.clf_type} model')
    

    
    def read_in_fitted_model(self, fitted_model, type):

        assert type == 'Classification' or type == 'Regression' # check
        
        self.model = fitted_model
        self.clf_type = type
        
        self.model_fitted = 1

        print(f'Successfully read in fitted model {self.model}, which is a {self.clf_type} model')


    
    def fit_model(self):

        if self.model_class == None or self.parameters == None:
            print('Missing model_class or parameters, please run .read_in_untrained_model()')

        start = time.time()
        self.model = self.model_class(**self.parameters)
        self.model.fit(self.train_x, self.train_y)
        end = time.time()

        self.model_fit_time = end - start

        self.model_fitted = 1
        print(f'Model fitted, taking {self.model_fit_time} seconds')



    def export_model(self, model_export_address):

        if self.model_fitted == 0:
            print('Please fit model using .fit_model() before exporting')
            return
        
        model_export_address_strip = model_export_address.split('.pickle')[0]

        with open(f'{model_export_address_strip}.pickle', 'wb') as f:
            pickle.dump(self.model, f)
    


    def predict_using_future_data(self, return_pred = False):

        if self.model_fitted == 0:
            print('Model not fitted, please use .read_in_fitted_model() to read in a fitted model or .fit_model() to fit model')
            return
        
        self.future_pred = self.model.predict(self.future_data)

        if return_pred:
            return self.future_pred
        

    
    def export_future_data_and_predictions(self, future_data_and_pred_saving_address):

        assert self.future_pred

        self.future_data_and_pred = copy.deepcopy(self.future_data)
        self.future_data_and_pred['Pred'] = self.future_pred

        future_data_and_pred_saving_address_strip = future_data_and_pred_saving_address.split('.csv')[0]
            
        self.tuning_result.to_csv(f'{future_data_and_pred_saving_address_strip}.csv', index=False)

    

    def view_future_data_and_predictions(self, return_df = False):

        if self.future_data_and_pred == None:
            self.future_data_and_pred = copy.deepcopy(self.future_data)
            self.future_data_and_pred['Pred'] = self.future_pred
        
        if return_df:
            return self.future_data_and_pred
        else:
            display(self.future_data_and_pred)
    
    

    def get_analysis(self):
        self.train_pred = self.model.predict(self.train_x)
        self.val_pred = self.model.predict(self.val_x)
        self.test_pred = self.model.predict(self.test_x)

        # Normal stats
        if self.clf_type == 'Regression':
            train_score = r2_score(self.train_y, self.train_pred)
            val_score = r2_score(self.val_y, self.val_pred)
            test_score = r2_score(self.test_y, self.test_pred)

            train_rmse = np.sqrt(mean_squared_error(self.train_y, self.train_pred))
            val_rmse = np.sqrt(mean_squared_error(self.val_y, self.val_pred))
            test_rmse = np.sqrt(mean_squared_error(self.test_y, self.test_pred))

            train_mape = mean_absolute_percentage_error(self.train_y, self.train_pred)
            val_mape = mean_absolute_percentage_error(self.val_y, self.val_pred)
            test_mape = mean_absolute_percentage_error(self.test_y, self.test_pred)


            # Quartile Stats:
            train_quantile4_n, train_quantile4_r2, train_quantile4_RMSE, \
                train_quantile4_MAPE, val_quantile4_n, val_quantile4_r2, \
                    val_quantile4_RMSE, val_quantile4_MAPE, test_quantile4_n, \
                        test_quantile4_r2, test_quantile4_RMSE, test_quantile4_MAPE = \
                            self._quantily_stats(4)
            
            train_quantile10_n, train_quantile10_r2, train_quantile10_RMSE, \
                train_quantile10_MAPE, val_quantile10_n, val_quantile10_r2, \
                    val_quantile10_RMSE, val_quantile10_MAPE, test_quantile10_n, \
                        test_quantile10_r2, test_quantile10_RMSE, test_quantile10_MAPE = \
                            self._quantily_stats(10)
            
            # CV
            cv_r2_score, cv_rmse_score, cv_mape_score = self._get_CV_stats()

            # Diagram
    
        elif self.clf_type == 'Classification':
            train_score = self.model.score(self.train_y, self.train_pred)
            val_score = self.model.score(self.val_y, self.val_pred)
            test_score = self.model.score(self.test_y, self.test_pred)

            train_bal_accu = balanced_accuracy_score(self.train_y, self.train_pred)
            val_bal_accu = balanced_accuracy_score(self.val_y, self.val_pred)
            test_bal_accu = balanced_accuracy_score(self.test_y, self.test_pred)

            train_f1 = f1_score(self.train_y, self.train_pred, average='weighted')
            val_f1 = f1_score(self.val_y, self.val_pred, average='weighted')
            test_f1 = f1_score(self.test_y, self.test_pred, average='weighted')

            train_precision = precision_score(self.train_y, self.train_pred, average='weighted')
            val_precision = precision_score(self.val_y, self.val_pred, average='weighted')
            test_precision = precision_score(self.test_y, self.test_pred, average='weighted')

            train_recall = recall_score(self.train_y, self.train_pred, average='weighted')
            val_recall = recall_score(self.val_y, self.val_pred, average='weighted')
            test_recall = recall_score(self.test_y, self.test_pred, average='weighted')

            #TODO: MORE STATS    

            # CV
            cv_accuracy_score, cv_bal_accuracy_score, cv_f1_score, cv_precision_score, cv_recall_score = self._get_CV_stats()

            # Diagrams



    def _quantily_stats(self, n_quantiles):
    
        if self.train_analyse_df is None:
            self.train_analyse_df = pd.DataFrame({'obs': self.train_y, 'pred': self.train_pred})
        
        if self.val_analyse_df is None:    
            self.val_analyse_df = pd.DataFrame({'obs': self.val_y, 'pred': self.val_pred})
        
        if self.test_analyse_df is None:
            self.test_analyse_df = pd.DataFrame({'obs': self.test_y, 'pred': self.test_pred})
        
        quantiles_p = [i/n_quantiles for i in range(n_quantiles+1)]

        train_quantiles = list(self.train_y.quantile(quantiles_p))
        val_quantiles = list(self.val_y.quantile(quantiles_p))
        test_quantiles = list(self.test_y.quantile(quantiles_p))

        train_quantile_n = list()
        val_quantile_n = list()
        test_quantile_n = list()

        train_quantile_r2 = list()
        val_quantile_r2 = list()
        test_quantile_r2 = list()

        train_quantile_RMSE = list()
        val_quantile_RMSE = list()
        test_quantile_RMSE = list()

        train_quantile_MAPE = list()
        val_quantile_MAPE = list()
        test_quantile_MAPE = list()

        for analyse_df, quantiles, stats in ((self.train_analyse_df, train_quantiles, (train_quantile_n, train_quantile_r2, train_quantile_RMSE, train_quantile_MAPE)), \
            (self.val_analyse_df, val_quantiles, (val_quantile_n, val_quantile_r2, val_quantile_RMSE, val_quantile_MAPE)), \
            (self.test_analyse_df, test_quantiles, (test_quantile_n, test_quantile_r2, test_quantile_RMSE, test_quantile_MAPE))):

            for i in range(n_quantiles):
                if i == 0:
                    q_df = analyse_df[(analyse_df['obs'] >= quantiles[i])
                        & (analyse_df['obs'] <= quantiles[i+1])]
                else:
                    q_df = analyse_df[(analyse_df['obs'] > quantiles[i])
                        & (analyse_df['obs'] <= quantiles[i+1])]
                
                stats[0].append(len(q_df))

                try:
                    q_r2 = r2_score(q_df['obs'], q_df['pred']) # when all values of obs is the same, r2 will always be 0
                except:
                    q_r2 = np.nan

                try:
                    q_rmse = np.sqrt(mean_squared_error(q_df['obs'], q_df['pred']))
                except:
                    q_rmse = np.nan

                try:
                    q_mape = mean_absolute_percentage_error(q_df['obs'], q_df['pred'])
                except:
                    q_mape = np.nan
                
                stats[1].append(q_r2)
                stats[2].append(q_rmse)
                stats[3].append(q_mape)

        return train_quantile_n, train_quantile_r2, train_quantile_RMSE, train_quantile_MAPE, val_quantile_n, val_quantile_r2, val_quantile_RMSE, val_quantile_MAPE, test_quantile_n, test_quantile_r2, test_quantile_RMSE, test_quantile_MAPE
    


    def _get_CV_stats(self):

        shuffled_full_data = self.full_data.sample(frac = 1, random_state = 18980305) # use sample to shuffle
        shuffled_full_data.index = range(len(shuffled_full_data))

        n = len(shuffled_full_data)
        shuffled_index = [0, int(n/5), int(2*n/5), int(3*n/5), int(4*n/5), n]
        
        if self.clf_type == 'Regression':
            r2_scores = list()
            rmse_scores = list()
            mape_scores = list()

        elif self.clf_type == 'Classification': #TODO
            accuracy_scores = list()
            bal_accuracy_scores = list()
            f1_scores = list()
            precision_scores = list()
            recall_scores = list()


        for i in range(5):
            cv_train1 = shuffled_full_data[0:shuffled_index[i]] 
            cv_train2 = shuffled_full_data[shuffled_index[i+1]:n]
            cv_train = cv_train1.append(cv_train2)
            cv_test = shuffled_full_data[shuffled_index[i]:shuffled_index[i+1]]

            cv_train_x = cv_train[self.features]
            cv_train_y = cv_train[self.label]
            cv_test_x = cv_test[self.features]
            cv_test_y = cv_test[self.label]

            cv_model = copy.deepcopy(self.model)
            cv_model.fit(cv_train_x, cv_train_y)

            cv_test_pred = cv_model.predict(cv_test_x)
            
            if self.clf_type == 'Regression':
                r2_scores.append(r2_score(cv_test_y, cv_test_pred))
                rmse_scores.append(np.sqrt(mean_squared_error(cv_test_y, cv_test_pred)))
                mape_scores.append(mean_absolute_percentage_error(cv_test_y, cv_test_pred))

            elif self.clf_type == 'Classification': #TODO
                accuracy_scores.append(cv_model.score(cv_test_y, cv_test_pred))
                bal_accuracy_scores.append(balanced_accuracy_score(cv_test_y, cv_test_pred))
                f1_scores.append(f1_score(cv_test_y, cv_test_pred))
                precision_scores.append(precision_score(cv_test_y, cv_test_pred))
                recall_scores.append(recall_score(cv_test_y, cv_test_pred))
            
        if self.clf_type == 'Regression':
            cv_r2_score = np.mean(r2_scores)
            cv_rmse_score = np.mean(rmse_scores)
            cv_mape_score = np.mean(mape_scores)

            return cv_r2_score, cv_rmse_score, cv_mape_score
            
        elif self.clf_type == 'Classification': #TODO
            cv_accuracy_score = np.mean(accuracy_scores)
            cv_bal_accuracy_score = np.mean(bal_accuracy_scores)
            cv_f1_score = np.mean(f1_scores)
            cv_precision_score = np.mean(precision_scores)
            cv_recall_score = np.mean(recall_scores)

            return cv_accuracy_score, cv_bal_accuracy_score, cv_f1_score, cv_precision_score, cv_recall_score

        

In [31]:
# Read the full / train / validate / test CSVs (the N_*_3 file set) into DataFrames.
full_data = pd.read_csv('../data/curated/modelling/N_Full_3.csv')
train_data = pd.read_csv('../data/curated/modelling/N_Train_3.csv')
val_data = pd.read_csv('../data/curated/modelling/N_Validate_3.csv')
test_data = pd.read_csv('../data/curated/modelling/N_Test_3.csv')


In [32]:
# Create the analysis helper; the constructor prints a confirmation message.
shaoxing = ShaoXing()

ShaoXing Initialised


In [33]:
from ZhongShan import *

In [34]:
# Load the pickled pipeline object to recover its selected feature lists.
# NOTE(review): pickle.load runs code embedded in the file - only load pickles from a trusted source.
# NOTE(review): the star import of ZhongShan in the previous cell presumably provides the
# class needed to unpickle this object - confirm.
with open(f'../models/AFL_pipeline_N.pickle', 'rb') as f:
    sanmin = pickle.load(f)
    
# Feature list stored under key '3'.
model3_COLS = sanmin.final_features['3']
# model2_COLS = sanmin.final_features['2']
# model1_COLS = sanmin.final_features['1']

# Drop the columns named '3', '2' and '1' from the feature list ('3' is passed as the label further down).
model3_COLS = [x for x in model3_COLS if x not in ['3', '2', '1']]
# model2_COLS = [x for x in model2_COLS if x not in ['3', '2', '1']]
# model1_COLS = [x for x in model1_COLS if x not in ['3', '2', '1']]

In [35]:
# Register the feature columns and the label column '3'; must run before the data is read in.
shaoxing.read_in_features_label(model3_COLS, '3')

In [36]:
# Hand the four DataFrames to the helper, which also splits train / validation / test into x and y.
shaoxing.read_in_full_train_test_data(full_data, train_data, val_data, test_data)

Full Data read in successfully
Train Data read in successfully
Validation Data read in successfully
Test Data read in successfully


In [37]:
from sklearn.ensemble import GradientBoostingRegressor as GBR

In [38]:
# Model class and hyperparameters for fitting from scratch. They are only used by the
# commented-out read_in_untrained_model / fit_model cells further down; the run below
# loads an already fitted model from disk instead.
gbr = GBR
params = {'learning_rate':0.01, 
    'n_estimators':200, 
    'subsample' : 0.5, 
    'max_features':0.5, 
    'ccp_alpha':0, 
    'max_depth':5, 
    'random_state' : 19260817}

In [39]:
# Load the previously fitted model from disk.
# NOTE(review): pickle.load runs code embedded in the file - only load pickles from a trusted source.
with open('../models/final_models/model3.pickle', 'rb') as f:
    model = pickle.load(f)

In [40]:
# Register the loaded model as a fitted Regression model, so no call to fit_model is needed.
shaoxing.read_in_fitted_model(model, 'Regression')

Successfully read in fitted model GradientBoostingRegressor(ccp_alpha=0, learning_rate=0.01, max_depth=5,
                          max_features=0.5, n_estimators=200,
                          random_state=19260817, subsample=0.5), which is a Regression model


In [41]:
# shaoxing.read_in_untrained_model(gbr, params, 'Regression')

In [42]:
# shaoxing.fit_model()

In [43]:
# Run the analysis: predictions on train / validation / test, then the regression statistics
# (overall, per-quantile and 5-fold cross validation).
shaoxing.get_analysis()

47328
11831
47327
11832
47327
11832
47327
11832
47327
11832
