In [1]:
####

# Code generated for "Bill of Lading" dataset registered under "Bill of Lading (PIERS)" package

# Documentation and Installations Instructions link: https://catalogue.datalake.ihsmarkit.com (please follow the links to "Documentation")

# This code is compatible with latest version of the Data Lake command line interface hosted on pypi.org: https://pypi.org/project/dli/

# To run with python interpreter (preferably using 3.x version)

####



# Import Libraries

import dli
import pandas as pd
import pyarrow.parquet as pq
import numpy as np
import matplotlib
from sklearn.linear_model import LinearRegression
import os, glob
from os import listdir
import gc
from datetime import datetime

print('#All Libaries Imported#')

#All Libaries Imported#


In [2]:
# Concat all trial output files to one dataframe
input_path = r"C:\Users\Thomas TH Chow\Desktop\Datalake\Credit Rating Modeling\Clean datasets\Export Data\Model Input Trial"
Trial_files = [file for file in listdir(input_path) if file.endswith('.xlsx')]
if not Trial_files:
    raise FileNotFoundError(f"No .xlsx trial files found in {input_path}")

# Columns in which a zero is treated as "missing" and imputed
Imput_list = ['harm4','foreign_company_country','Max_qty_us_company_name','Max_estimated_dollarvalue_us_company_name']

# Bill of lading data are not available after June 2020, so July..December 2020
# are removed (the previous range(7, 12) stopped at November).
# assumes the two index columns are (year, month) -- TODO confirm against the files
Cutoff_keys = [(2020, month) for month in range(7, 13)]

Trial_frames = []
for file_name in Trial_files:
    path = os.path.join(input_path, file_name)
    Temp_df = pd.read_excel(path, index_col=[0, 1])
    # Impute the columns with too many zeros: zeros become NaN, then every NaN is
    # filled with the column mean of this file (presumably one file per company).
    Temp_df[Imput_list] = Temp_df[Imput_list].replace(0, np.nan)
    Temp_df = Temp_df.fillna(Temp_df.mean(numeric_only=True))
    # errors='ignore' skips only the "key not present" case; any other failure surfaces
    for key in Cutoff_keys:
        Temp_df = Temp_df.drop(index=key, errors='ignore')
    Trial_frames.append(Temp_df.reset_index(drop=True))

# Single concat instead of repeated DataFrame.append (removed in pandas 2.0)
Trial_df = pd.concat(Trial_frames, sort=False).reset_index(drop=True)
Trial_df = Trial_df.drop(columns=['foreign_company_country'])
# Impute the remaining NaN with 0
Trial_df = Trial_df.fillna(0)
Trial_df.shape

(64435, 19)

In [3]:
# Shape of the subset of rows flagged as defaults
Trial_df.loc[Trial_df['Default'] == True].shape

(198, 19)

In [4]:
# Output correlation matrix of all columns with Default and PD to check linear correlation.
# Only numeric/boolean columns are correlated: current pandas raises on text columns
# instead of silently skipping them.
corr_matrix = Trial_df.select_dtypes(include=['number', 'bool']).corr()
corr_matrix[["Default","PD"]].sort_values(by=['Default'], ascending = False).to_excel('Correlation Matrix_Default.xlsx')

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

# Fixed seed so the random train/test split is the same on every run
SPLIT_SEED = 42
# Approximate share of rows sent to the training set
TRAIN_FRACTION = 0.8

# Columns passed through sklearn's Normalizer
Normal = ['days_since_last_qty',
          'estimated_dollarvalue_12Mvol',
          'estimated_dollarvalue_SMA12_log10', 'estimated_dollarvalue_SMA36_log10',
          'Unit_price_qty_SMA12_log10','Unit_price_qty_SMA36_log10',
          'harm4']

# Columns passed through sklearn's StandardScaler
Std = ['estimated_dollarvalue_SMA36','estimated_dollarvalue_SMA12','Unit_price_qty_SMA36',
      'Unit_price_qty_SMA12','Osci_estimated_dollarvalue_SMA12_estimated_dollarvalue_SMA36']

Trial_Normal = Trial_df[Normal]
Trial_Std = Trial_df[Std]
Trial_other = Trial_df.drop(Normal+Std+["Default","PD"],axis = 1)
Trial_labels = Trial_df[["Default","PD"]].copy()

#Normalise selected columns above
# NOTE(review): sklearn's Normalizer rescales each sample (row) to unit norm,
# not each column -- confirm that this is the intended transformation.
Normal_pipeline = Pipeline([
('Normal_scaler', Normalizer()),
])

#Standardise selected columns above
Std_pipeline = Pipeline([
('std_scaler', StandardScaler()),
])

# Implement normalisation and standardisation with pipeline.
# NOTE(review): both scalers are fitted on the full data set, before the split below.
# The transformed arrays are re-wrapped with Trial_df's index so the column-wise
# concat aligns rows explicitly rather than relying on a default RangeIndex.
normX = Normal_pipeline.fit_transform(Trial_Normal)
normX_df = pd.DataFrame(normX, columns=Normal, index=Trial_df.index)
stdX = Std_pipeline.fit_transform(Trial_Std)
stdX_df = pd.DataFrame(stdX, columns=Std, index=Trial_df.index)
Full_df = pd.concat([normX_df, stdX_df, Trial_other, Trial_labels], axis=1)

# Split the training sets and testing sets with a seeded random mask
split_rng = np.random.default_rng(SPLIT_SEED)
train_split = split_rng.random(len(Full_df)) < TRAIN_FRACTION
Trial_train = Full_df[train_split]
Trial_test = Full_df[~train_split]
print(f'Train size:{len(Trial_train)} Test size:{len(Trial_test)}')

Train size:51454 Test size:12981


In [6]:
# Write the training split to a gzip-compressed parquet file,
# then show the shape of its default-flagged rows.
output_path = r"C:\Users\Thomas TH Chow\Desktop\Datalake\Credit Rating Modeling\Clean datasets\Export Data\Trial training and testing datasets"
path = os.path.join(output_path, 'Trial Training Set.parquet.gzip')
Trial_train.to_parquet(path, compression='gzip')

Trial_train.loc[Trial_train["Default"] == True].shape

(156, 19)

In [7]:
# Write the testing split to a gzip-compressed parquet file,
# then show the shape of its default-flagged rows.
output_path = r"C:\Users\Thomas TH Chow\Desktop\Datalake\Credit Rating Modeling\Clean datasets\Export Data\Trial training and testing datasets"
path = os.path.join(output_path, 'Trial Testing Set.parquet.gzip')
Trial_test.to_parquet(path, compression='gzip')

Trial_test.loc[Trial_test["Default"] == True].shape

(42, 19)

In [8]:
# Shapes of the rows whose PD exceeds 0.1, training split first, then testing split
print(Trial_train.loc[Trial_train["PD"] > 0.1].shape)
print(Trial_test.loc[Trial_test["PD"] > 0.1].shape)

(960, 19)
(253, 19)


Feature engineering is finished; proceed to model fitting and testing.

In [9]:
# Linear regression model fitting
from sklearn.linear_model import LinearRegression

# Feature matrix: every column except the two targets and the ticker identifier
Train = Trial_train.drop(columns=["Default","PD","Ticker"]).to_numpy()
Train_labels = Trial_train["PD"].copy().to_numpy()

lin_reg = LinearRegression()
lin_reg.fit(Train, Train_labels)

# Sanity check: compare predictions with labels on the first five training rows
some_data = Train[:5]
some_labels = Trial_train["PD"].head(5)
print("Predictions:", lin_reg.predict(some_data))
print("Labels:", list(some_labels))

Predictions: [0.01294498 0.01371997 0.01335651 0.01330047 0.01381048]
Labels: [0.003348830118225732, 0.003527181495977083, 0.003774126680097245, 0.0037194821923342, 0.003477256957170827]


In [10]:
# Linear regression model testing and export to excel for discussion
Test = Trial_test.drop(columns=["Default","PD","Ticker"]).to_numpy()
Test_labels = Trial_test["PD"].copy().to_numpy()
PD_linprediction = lin_reg.predict(Test)

# Test rows (without the Default flag) side by side with the predicted PD
Lin_result_df = pd.concat(
    [
        Trial_test.drop(columns=["Default"]).reset_index(drop=True),
        pd.DataFrame(PD_linprediction, columns=["PD_linprediction"]).reset_index(drop=True),
    ],
    axis=1,
)
Lin_result_df.to_excel('Linregmodel Result.xlsx')

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\Public\Anaconda3\lib\site-packages\xlsxwriter\workbook.py", line 320, in close
    self._store_workbook()
  File "C:\Users\Public\Anaconda3\lib\site-packages\xlsxwriter\workbook.py", line 638, in _store_workbook
    raise e
  File "C:\Users\Public\Anaconda3\lib\site-packages\xlsxwriter\workbook.py", line 635, in _store_workbook
    xlsx_file = ZipFile(self.filename, "w", compression=ZIP_DEFLATED,
  File "C:\Users\Public\Anaconda3\lib\zipfile.py", line 1250, in __init__
    self.fp = io.open(file, filemode)
PermissionError: [Errno 13] Permission denied: 'Linregmodel Result.xlsx'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Public\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-62a6b68bf659>", line 6, in <module>
    Lin_result_df.to_excel('Linr

TypeError: object of type 'NoneType' has no len()

In [None]:
# Decision Tree regression fitting
from sklearn.tree import DecisionTreeRegressor

# First tree is fitted to PD directly
tree_reg = DecisionTreeRegressor(max_depth=10)
tree_reg.fit(Train, Train_labels)

# Second tree is fitted to the residuals the first tree leaves on the training set
Train_labels2 = Train_labels - tree_reg.predict(Train)
tree_reg2 = DecisionTreeRegressor(max_depth=10)
tree_reg2.fit(Train, Train_labels2)

In [None]:
# Decision Tree regression model testing and export to excel for discussion
# scipy.stats is imported here because this cell uses it before the later cell that
# imports it is run (a fresh Restart & Run All would otherwise hit a NameError).
from scipy import stats

Train = Trial_train.drop(["Default","PD","Ticker"],axis = 1).to_numpy()
Train_labels = Trial_train["PD"].copy().to_numpy()
# Combined prediction of the two trees (first tree + residual tree) on the training set
PD_treeprediction = sum(model.predict(Train) for model in (tree_reg, tree_reg2))

# Training rows (without the Default flag) side by side with the combined prediction
Tree_result_df = pd.concat([Trial_train.drop(["Default"],axis = 1).reset_index(drop=True), pd.DataFrame(PD_treeprediction, columns=["PD_treeprediction"]).reset_index(drop=True)], axis = 1)
Tree_result_df.to_excel('Treeregmodel Result.xlsx')

def Kendall_rank_Tree(Trainingdataset, Labels):
    """Kendall rank correlation between the two-tree prediction and the labels.

    Args:
        Trainingdataset: feature matrix accepted by tree_reg / tree_reg2 ``predict``.
        Labels: numpy array of target values (must support ``ravel``).

    Returns:
        (tau, p_value) as returned by ``scipy.stats.kendalltau``.
    """
    x1 = sum(model.predict(Trainingdataset) for model in (tree_reg, tree_reg2)).ravel()
    x2 = Labels.ravel()
    tau, p_value = stats.kendalltau(x1, x2)
    return tau, p_value

Kendall_tau, Kendall_p_value = Kendall_rank_Tree(Train, Train_labels)
print("-----Training Sets------")
print("Kendall tau: ", Kendall_tau)
print("Kendall p value: ", Kendall_p_value)
# Test / Test_labels are the notebook-level arrays built by an earlier cell
Kendall_test_tau, Kendall_test_p_value = Kendall_rank_Tree(Test, Test_labels)
print("-----Testing Sets------")
print("Kendall tau: ", Kendall_test_tau)
print("Kendall p value: ", Kendall_test_p_value)

In [None]:
# Print only the features whose importance in the first tree exceeds 0.1
for name, score in zip(Trial_train.drop(columns=["Default","PD","Ticker"]).columns,
                       tree_reg.feature_importances_):
    if not (score > 0.1):
        continue
    print(name, score)

In [None]:
from sklearn import tree
# Print the raw feature-importance vector of the first decision tree, then draw that tree
print(tree_reg.feature_importances_)
tree.plot_tree(tree_reg)

In [None]:
# Random Forest Forecast model fitting
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(Train, Train_labels)

# print feature importances, one "name score" line per feature column
for name, score in zip(Trial_train.drop(columns=["Default","PD","Ticker"]).columns,
                       forest_reg.feature_importances_):
    print(name, score)

In [None]:
# Random Forest Forecast model testing and export to excel for discussion
Test = Trial_test.drop(columns=["Default","PD","Ticker"]).to_numpy()
Test_labels = Trial_test["PD"].copy().to_numpy()
Test_labels.shape
PD_forestprediction = forest_reg.predict(Test)

# Test rows (without the Default flag) side by side with the forest prediction
Forest_result_df = pd.concat(
    [
        Trial_test.drop(columns=["Default"]).reset_index(drop=True),
        pd.DataFrame(PD_forestprediction, columns=["PD_forestprediction"]).reset_index(drop=True),
    ],
    axis=1,
)
Forest_result_df.to_excel('Forestmodel Result.xlsx')

In [None]:
from sklearn.model_selection import cross_val_score
from scipy import stats

# Kendall rank correlation coefficient
def Kendall_rank(model_function, Trainingdataset, Labels):
    """Kendall rank correlation between a model's predictions and the labels.

    Args:
        model_function: object exposing ``predict(features)`` that returns a numpy array.
        Trainingdataset: feature matrix handed to ``predict``.
        Labels: numpy array of target values.

    Returns:
        (tau, p_value) from ``scipy.stats.kendalltau`` on the flattened arrays.
    """
    predicted = model_function.predict(Trainingdataset).ravel()
    observed = Labels.ravel()
    tau, p_value = stats.kendalltau(predicted, observed)
    return tau, p_value

def display_scores(model_function):
    """Print cross-validated RMSE and the Kendall rank statistics for one model.

    Reads the notebook-level ``Train`` and ``Train_labels`` arrays: runs 10-fold
    ``cross_val_score`` with negative MSE scoring, converts the fold scores to
    RMSE, and reports Kendall's tau of ``model_function``'s predictions on the
    same training data.
    """
    neg_mse_per_fold = cross_val_score(
        model_function,
        Train,
        Train_labels,
        scoring="neg_mean_squared_error",
        cv=10,
    )
    Kendall_tau, Kendall_p_value = Kendall_rank(model_function, Train, Train_labels)
    # Scores are negated MSE values, so flip the sign before taking the root
    Rmse_scores = np.sqrt(-neg_mse_per_fold)

    print(f"----------{model_function}----------")
    print("Scores:", Rmse_scores)
    print("Mean:", Rmse_scores.mean())
    print("Standard deviation:", Rmse_scores.std())
    print("Kendall tau: ", Kendall_tau)
    print("Kendall p value: ", Kendall_p_value)

# Report training scores for the tree, linear and forest regressors, in that order
for fitted_model in (tree_reg, lin_reg, forest_reg):
    display_scores(fitted_model)

In [None]:
import joblib

# Saving the model parameters
def save_model(model_function):
    """Persist a model with joblib inside the "Model backup" directory.

    The file name is the model's string representation followed by ".pkl".
    The directory is created when it does not exist yet, so the first save on a
    fresh checkout no longer fails.

    Args:
        model_function: the (fitted) model object to dump.
    """
    model_path = r"Model backup"
    os.makedirs(model_path, exist_ok=True)
    path = os.path.join(model_path, f"{model_function}.pkl")
    joblib.dump(model_function, path)
    
def load_model(model_function):
    """Load the joblib dump whose file name matches ``model_function``'s string form.

    Args:
        model_function: object whose string representation names the ".pkl" file
            inside the "Model backup" directory.

    Returns:
        The object restored by ``joblib.load``.
    """
    pickle_name = f"{model_function}.pkl"
    # NOTE(review): joblib.load unpickles the file -- only load dumps from a trusted source
    return joblib.load(os.path.join(r"Model backup", pickle_name))

# Back up the tree, linear and forest regressors, in that order
for fitted_model in (tree_reg, lin_reg, forest_reg):
    save_model(fitted_model)

In [None]:
# Trial run the Logistic Regression model, should give you a failed message
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import KBinsDiscretizer
from sklearn import preprocessing
from sklearn import utils

# Label encoder is created but not used anywhere else in this cell
lab_enc = preprocessing.LabelEncoder()
# encoded = lab_enc.fit_transform(trainingScores)

# Discretise every feature column of Train into 5 equal-width ordinal bins
Binning = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
log_Train = Binning.fit_transform(Train)
print(utils.multiclass.type_of_target(log_Train))
log_reg = LogisticRegression(max_iter=1000)
# Classification target: the Default flag converted to integers
log_Train_labels = Trial_train["Default"].astype(int).copy().to_numpy()
print(utils.multiclass.type_of_target(log_Train_labels))

log_reg.fit(log_Train.astype('int'), log_Train_labels.astype('int'))

# NOTE(review): this reuses the name display_scores, so once this cell has run,
# the notebook-level display_scores refers to this version.
def display_scores(model_function):
    """Print 10-fold cross-validated RMSE of ``model_function`` on the binned features.

    Scores against the notebook-level ``Train_labels``, not ``log_Train_labels``.
    NOTE(review): Train_labels is built from the "PD" column in other cells; a
    classifier scored against those values is presumably what produces the
    failure announced at the top of this cell -- confirm.
    """
    scores = cross_val_score(model_function, log_Train, Train_labels,
    scoring="neg_mean_squared_error", cv=10)
    # Scores are negated MSE values, so flip the sign before taking the root
    Rmse_scores = np.sqrt(-scores)
    print(f"----------{model_function}----------")
    print("Scores:", Rmse_scores)
    print("Mean:", Rmse_scores.mean())
    print("Standard deviation:", Rmse_scores.std())
    
display_scores(log_reg)

The following is a step-by-step implementation of logistic regression trained with gradient descent.

In [None]:
# A full implementation of Logisticregression with class
class Logisticregression():
    """Binary logistic regression trained by batch gradient descent with random restarts.

    The model is ``sigmoid(features @ weights)``; no intercept column is added,
    so callers that need a bias term must append a constant feature themselves.
    """

    def __init__(self):
        # Learned weight vector; empty until fit() has run
        self.weight = []
        # Kept for backward compatibility; fit() steps with fit_lr, not this value
        self.lr = 0.02
        # Gradient-descent iterations per random restart
        self.iters = 10000
        # Number of random initialisations tried by fit()
        self.restarts = 10
        # Step size used by fit(); this is the value fit() previously hard-coded.
        # NOTE(review): 5 is a very large step -- confirm it is intended.
        self.fit_lr = 5
        # Iterations of the final refinement run in fit()
        self.final_iters = 100000

    def cost_function(self, features, labels, weights):
        """Mean binary cross-entropy (log loss) of the predictions.

        Cost = -(labels*log(p) + (1-labels)*log(1-p)).sum() / len(labels)

        Args:
            features: (N, F) feature matrix.
            labels: (N,) array of 0/1 targets.
            weights: (F,) weight vector.

        Returns:
            The average cost as a float.
        """
        observations = len(labels)
        predictions = self.prediction(features, weights)
        # Keep probabilities strictly inside (0, 1): a saturated sigmoid would
        # otherwise yield log(0) = -inf and an inf/nan cost.
        eps = np.finfo(float).eps
        predictions = np.clip(predictions, eps, 1 - eps)
        # Error contribution when label=1
        class1_cost = -labels * np.log(predictions)
        # Error contribution when label=0
        class2_cost = (1 - labels) * np.log(1 - predictions)
        cost = class1_cost - class2_cost
        return cost.sum() / observations

    def update_weights(self, features, labels, weights, lr):
        """One vectorised gradient-descent step.

        Args:
            features: (N, F) feature matrix.
            labels: (N,) array of 0/1 targets.
            weights: (F,) current weight vector (left unmodified).
            lr: learning rate.

        Returns:
            A new (F,) weight vector after the step.
        """
        N = len(features)
        predictions = self.prediction(features, weights)
        # (F, N) @ (N,) -> one averaged partial derivative per feature
        gradient = np.dot(features.T, predictions - labels) / N
        # Return a new array instead of updating the caller's weights in place
        return weights - lr * gradient

    def decision_boundary(self, prob):
        """Map one probability to a class: 1 when prob >= 0.5, otherwise 0."""
        return 1 if prob >= .5 else 0

    def classify(self, predictions):
        """Convert probabilities to 0/1 integer classes.

        Args:
            predictions: N-element array of probabilities between 0 and 1
                (left unmodified).

        Returns:
            N-element integer array of 0s and 1s, using the 0.5 threshold of
            decision_boundary.
        """
        return (np.asarray(predictions) >= .5).astype('int')

    def sigmoid(self, z):
        """Logistic function 1 / (1 + exp(-z)), applied element-wise."""
        return 1.0 / (1 + np.exp(-z))

    def prediction(self, features, weights):
        """Return the 1D array of probabilities that the class label == 1."""
        z = np.dot(features, weights)
        return self.sigmoid(z)

    def train(self, features, labels, weights, lr, iters):
        """Run ``iters`` gradient-descent steps starting from ``weights``.

        Prints the cost every 1000 iterations.

        Returns:
            (weights, cost_history), where cost_history holds the cost after
            every step.
        """
        cost_history = []
        for i in range(iters):
            weights = self.update_weights(features, labels, weights, lr)

            # Calculate error for auditing purposes
            cost = self.cost_function(features, labels, weights)
            cost_history.append(cost)

            # Log progress
            if i % 1000 == 0:
                print("iter: "+str(i) + " cost: "+str(cost))

        return weights, cost_history

    def fit(self, features, labels):
        """Fit the weights to the data and return ``self``.

        Tries ``self.restarts`` random initialisations drawn uniformly from
        [-5, 5), trains each for ``self.iters`` steps, keeps the one with the
        lowest cost (only if that cost is below 1), and then refines the kept
        weights for ``self.final_iters`` further steps.

        Args:
            features: (N, F) feature matrix; F is read from the data.
            labels: (N,) array of 0/1 targets.
        """
        features = np.asarray(features, dtype=float)
        labels = np.asarray(labels)
        # Size the weight vector from the data instead of assuming 16 columns
        n_features = features.shape[1]
        if len(self.weight) == 0:
            self.weight = np.zeros(n_features).astype('float')
        else:
            self.weight = np.asarray(self.weight, dtype=float)

        # A trial replaces the current weights only when its cost beats this bound
        cost = 1
        for _ in range(self.restarts):
            weight_trial = np.random.uniform(low=-5, high=5, size=(n_features,))
            weight_trial, _ = self.train(features, labels, weight_trial, self.fit_lr, self.iters)
            cost_trial = self.cost_function(features, labels, weight_trial)
            if cost > cost_trial:
                cost = cost_trial
                self.weight = weight_trial

        self.weight, _ = self.train(features, labels, self.weight, self.fit_lr, self.final_iters)
        return self

    def predict(self, features):
        """Return the 0/1 class predicted for every row of ``features``."""
        predictions = self.prediction(features, self.weight)
        return self.classify(predictions)

In [None]:
# Train the hand-written Logisticregression class on the Default flag.
# Target: Default converted to 0/1 integers; features: the notebook-level Train matrix.
log_Train_labels = Trial_train["Default"].astype(int).copy().to_numpy()
log_reg_1 = Logisticregression()
log_reg_1.fit(Train, log_Train_labels)

In [None]:
# Rebuild the test feature matrix and show the classes predicted by log_reg_1
Test = Trial_test.drop(columns=["Default","PD","Ticker"]).to_numpy()
log_reg_1.predict(Test)

In [None]:
# Logistic model testing and export to excel for discussion
Test = Trial_test.drop(columns=["Default","PD","Ticker"]).to_numpy()
# NOTE(review): this rebinds the notebook-level Test_labels to the Default flags;
# other cells assign it from the "PD" column, so any cell run after this one
# reads the Default flags instead.
Test_labels = Trial_test["Default"].astype(int).copy().to_numpy()
PD_logprediction = log_reg_1.predict(Test)

# Test rows (without PD) side by side with the predicted class as a boolean column
log_result_df = pd.concat(
    [
        Trial_test.drop(columns=["PD"]).reset_index(drop=True),
        pd.DataFrame(PD_logprediction, columns=["PD_logprediction"]).astype(bool).reset_index(drop=True),
    ],
    axis=1,
)
log_result_df.to_excel('Logregmodel Result.xlsx')

In [None]:
from sklearn.model_selection import cross_val_score
from scipy import stats

# Kendall rank correlation coefficient
def Kendall_rank(model_function, Trainingdataset, Labels):
    """Kendall rank correlation between a model's predictions and the labels.

    Args:
        model_function: object exposing ``predict(features)`` that returns a numpy array.
        Trainingdataset: feature matrix handed to ``predict``.
        Labels: numpy array of target values.

    Returns:
        (tau, p_value) from ``scipy.stats.kendalltau`` on the flattened arrays.
    """
    predicted = model_function.predict(Trainingdataset).ravel()
    observed = Labels.ravel()
    tau, p_value = stats.kendalltau(predicted, observed)
    return tau, p_value

# display and print the test scores of all models
def display_test_scores(model_function):
    """Print cross-validated RMSE and Kendall rank statistics on the test arrays.

    Reads the notebook-level ``Test`` and ``Test_labels``: runs 10-fold
    ``cross_val_score`` with negative MSE scoring on them, converts the fold
    scores to RMSE, and reports Kendall's tau of ``model_function``'s own
    predictions on ``Test``.
    NOTE(review): cross_val_score fits copies of the estimator on folds of the
    data it is given, so the RMSE figures come from models refitted on the test
    set, not from the already-fitted ``model_function`` -- confirm this is intended.
    """
    neg_mse_per_fold = cross_val_score(
        model_function,
        Test,
        Test_labels,
        scoring="neg_mean_squared_error",
        cv=10,
    )
    Kendall_tau, Kendall_p_value = Kendall_rank(model_function, Test, Test_labels)
    # Scores are negated MSE values, so flip the sign before taking the root
    Rmse_scores = np.sqrt(-neg_mse_per_fold)

    print(f"----------{model_function}----------")
    print("Scores:", Rmse_scores)
    print("Mean:", Rmse_scores.mean())
    print("Standard deviation:", Rmse_scores.std())
    print("Kendall tau: ", Kendall_tau)
    print("Kendall p value: ", Kendall_p_value)

# Report test scores for the tree, linear and forest regressors, in that order
for fitted_model in (tree_reg, lin_reg, forest_reg):
    display_test_scores(fitted_model)

In [None]:
# Not yet finished
import matplotlib.pyplot as plt

# Plot the results of each elements for linear regression

# Index of the Test column drawn on the x axis
FEATURE_COLUMN = 15
# Plain column slice; indexing Test with a list of two lists is no longer
# interpreted as (rows, columns) by current NumPy.
feature_values = Test[:, FEATURE_COLUMN]

# Predictions drawn as the line. PD_prediction was previously never assigned
# (NameError); it now defaults to the linear-regression predictions on Test.
# Swap in another prediction array of the same length as Test to get other plots,
# e.g. PD_forestprediction.
# NOTE(review): the title and legend label below still describe a decision tree;
# update them to match whichever model is plotted.
# NOTE(review): Test_labels is whatever the most recently run cell assigned
# (PD values or Default flags) -- confirm it matches the "PD" axis label.
PD_prediction = PD_linprediction

plt.figure()

plt.scatter(feature_values, Test_labels, s=20, edgecolor="black",
            c="darkorange", label="data")
plt.plot(feature_values, PD_prediction, color="cornflowerblue",
         label="max_depth=2", linewidth=0.1)
plt.xlabel("Features")
plt.ylabel("PD")
plt.title("Decision Tree Regression")
plt.legend()
plt.show()

In [18]:
import xgboost as xgb


def _xg_arrays(frame):
    """Return (feature matrix, PD labels) as numpy arrays for one data split."""
    features = frame.drop(columns=["Default","PD","Ticker"]).to_numpy()
    labels = frame["PD"].copy().to_numpy()
    return features, labels


# Random split of the training set into a fitting part and an evaluation part
train_split = np.random.rand(len(Trial_train)) < 0.8
Trial_train_1 = Trial_train[train_split]
Trial_train_2 = Trial_train[~train_split]
print(f'Train size:{len(Trial_train_1)} Train_1 size:{len(Trial_train_2)}')

xgTrain_1, xgTrain_labels_1 = _xg_arrays(Trial_train_1)
xgTrain_2, xgTrain_labels_2 = _xg_arrays(Trial_train_2)
xgTest, xgTest_labels = _xg_arrays(Trial_test)
# Alternative binary labels that were tried and left disabled:
#xgTrain_labels = np.where(Trial_train["PD"].copy().to_numpy()>0.1, 1, 0)
#xgTest_labels = np.where(Trial_test["PD"].copy().to_numpy()>0.1, 1, 0)

dtrain_1 = xgb.DMatrix(xgTrain_1, label=xgTrain_labels_1)
dtrain_2 = xgb.DMatrix(xgTrain_2, label=xgTrain_labels_2)

param = {'max_depth': 50, 'eta': 0.1, 'objective': 'rank:pairwise'}

# specify validations set to watch performance
watchlist = [(dtrain_1, 'train'), (dtrain_2, 'eval')]
num_round = 500
bst_2 = xgb.train(
    param,
    dtrain_1,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=100,
)

Train size:41247 Train_1 size:10207
[0]	train-map:0.00960	eval-map:0.00293
[1]	train-map:0.01149	eval-map:0.00255
[2]	train-map:0.01224	eval-map:0.00262
[3]	train-map:0.01247	eval-map:0.00262
[4]	train-map:0.01370	eval-map:0.00255
[5]	train-map:0.01434	eval-map:0.00266
[6]	train-map:0.01424	eval-map:0.00272
[7]	train-map:0.01512	eval-map:0.00286
[8]	train-map:0.01562	eval-map:0.00298
[9]	train-map:0.01580	eval-map:0.00301
[10]	train-map:0.01589	eval-map:0.00295
[11]	train-map:0.01625	eval-map:0.00299
[12]	train-map:0.01635	eval-map:0.00299
[13]	train-map:0.01664	eval-map:0.00309
[14]	train-map:0.01689	eval-map:0.00322
[15]	train-map:0.01692	eval-map:0.00317
[16]	train-map:0.01727	eval-map:0.00335
[17]	train-map:0.01749	eval-map:0.00345
[18]	train-map:0.01779	eval-map:0.00340
[19]	train-map:0.01789	eval-map:0.00353
[20]	train-map:0.01817	eval-map:0.00299
[21]	train-map:0.01846	eval-map:0.00302
[22]	train-map:0.01843	eval-map:0.00299
[23]	train-map:0.01856	eval-map:0.00302
[24]	train-map

[202]	train-map:0.03726	eval-map:0.00365
[203]	train-map:0.03738	eval-map:0.00366
[204]	train-map:0.03744	eval-map:0.00367
[205]	train-map:0.03780	eval-map:0.00368
[206]	train-map:0.03815	eval-map:0.00369
[207]	train-map:0.03817	eval-map:0.00369
[208]	train-map:0.03845	eval-map:0.00371
[209]	train-map:0.03853	eval-map:0.00371
[210]	train-map:0.03888	eval-map:0.00368
[211]	train-map:0.03923	eval-map:0.00369
[212]	train-map:0.03948	eval-map:0.00370
[213]	train-map:0.03966	eval-map:0.00370
[214]	train-map:0.03983	eval-map:0.00372
[215]	train-map:0.04055	eval-map:0.00371
[216]	train-map:0.04058	eval-map:0.00378
[217]	train-map:0.04056	eval-map:0.00378
[218]	train-map:0.04059	eval-map:0.00378
[219]	train-map:0.04070	eval-map:0.00377
[220]	train-map:0.04123	eval-map:0.00375
[221]	train-map:0.04097	eval-map:0.00375
[222]	train-map:0.04114	eval-map:0.00383
[223]	train-map:0.04095	eval-map:0.00388
[224]	train-map:0.04148	eval-map:0.00385
[225]	train-map:0.04216	eval-map:0.00387
[226]	train-map:

In [19]:
from sklearn.model_selection import cross_val_score
from scipy import stats
# this is prediction
def Kendall_rank_Tree(Trainingdataset, Labels):
    """Kendall rank correlation between the xgboost booster's scores and the labels.

    Predicts with the notebook-level ``bst_2`` booster, limited to its
    ``best_ntree_limit`` trees.

    Args:
        Trainingdataset: feature matrix wrapped in an ``xgb.DMatrix`` for prediction.
        Labels: numpy array of target values.

    Returns:
        (tau, p_value) from ``scipy.stats.kendalltau`` on the flattened arrays.
    """
    dmatrix = xgb.DMatrix(Trainingdataset, label=Labels)
    boosted_scores = bst_2.predict(dmatrix, ntree_limit=bst_2.best_ntree_limit).ravel()
    tau, p_value = stats.kendalltau(boosted_scores, Labels.ravel())
    return tau, p_value


# Part of the training set the booster was fitted on
Kendall_tau, Kendall_p_value = Kendall_rank_Tree(xgTrain_1, xgTrain_labels_1)
print("-----Training Sets------")
print("Kendall tau: ", Kendall_tau)
print("Kendall p value: ", Kendall_p_value)

# Held-out part of the training set used as the evaluation watchlist
Kendall_tau, Kendall_p_value = Kendall_rank_Tree(xgTrain_2, xgTrain_labels_2)
print("-----Training Sets_1------")
print("Kendall tau: ", Kendall_tau)
print("Kendall p value: ", Kendall_p_value)

# Testing set
Kendall_test_tau, Kendall_test_p_value = Kendall_rank_Tree(xgTest, xgTest_labels)
print("-----Testing Sets------")
print("Kendall tau: ", Kendall_test_tau)
print("Kendall p value: ", Kendall_test_p_value)

-----Training Sets------
Kendall tau:  0.8592945918471063
Kendall p value:  0.0
-----Training Sets_1------
Kendall tau:  0.6200395110964915
Kendall p value:  0.0
-----Testing Sets------
Kendall tau:  0.6163882832021066
Kendall p value:  0.0


In [None]:
# Plot the feature importances of the trained xgboost booster
xgb.plot_importance(bst_2)

In [21]:
# Xgboost model result and export to excel for discussion
# Scores are produced for the part of the training set the booster was fitted on
xg_fit_matrix = xgb.DMatrix(xgTrain_1, label = xgTrain_labels_1)
PD_xgprediction = bst_2.predict(xg_fit_matrix, ntree_limit=bst_2.best_ntree_limit)

# Fitting rows (without the Default flag) side by side with the booster's scores
Xg_result_df = pd.concat(
    [
        Trial_train_1.drop(columns=["Default"]).reset_index(drop=True),
        pd.DataFrame(PD_xgprediction, columns=["PD_XGprediction"]).reset_index(drop=True),
    ],
    axis=1,
)
Xg_result_df.to_excel('Xgboostmodel Result.xlsx')