## Load Training / Testing Data

In [1]:
import pickle
with open("A:\\Files\\Shares\\Downloads\\yelp_dataset\\yelp_dataset~\\TrainTestData.dat", "rb") as filePath:
    Train_X = pickle.load(file=filePath)
    Train_Y = pickle.load(file=filePath)
    Test_X = pickle.load(file=filePath)
    Test_Y = pickle.load(file=filePath)

In [6]:
print(Train_X.shape)
print(Train_Y.shape)
print(Test_X.shape)
print(Test_Y.shape)
print(Test_Y.unique())

(151483, 1743)
(151483,)
(37871, 1743)
(37871,)
[2.  4.  3.  2.5 3.5]


## Classification Models

In [6]:
# Run through multiple classifiers and rank results

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, precision_recall_fscore_support
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB, ComplementNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from xgboost import XGBClassifier
import pandas as pd
import numpy as np

def AssessClassifierModels(TrainingDataColumns, TrainingDataResults, TestingDataColumns, TestingDataResults, Algorithms_List):
    from datetime import datetime
    functionStartTime = datetime.now()
    print()
    
    # results container
    results_list = pd.DataFrame( columns = ["Name",
                                            "Precision",
                                            "Recall",
                                            "F1",
                                            "Support",
                                            "ModelData",
                                            "ExecutionTime",
                                           ]) # Set index later to avoid empty row
    
    # calculated metrics and append to list
    for algorithm in Algorithms_List:
        loopStartTime = datetime.now()
        print("Starting " + str(algorithm.__name__) + " at " + str(loopStartTime))

        algorithmObject = algorithm()
        
        if(str(algorithm.__name__) == "XGBClassifier"):
            algorithmObject = XGBClassifier(nthread=4)

        algorithmObject.fit(TrainingDataColumns, TrainingDataResults)
        algorithmPredictions = algorithmObject.predict(TestingDataColumns)
        (algorithmPrecision, algorithmRecall, algorithmF1, algorithmSupportList) = precision_recall_fscore_support(
            TestingDataResults, algorithmPredictions, labels = np.sort(TrainingDataResults.unique()))
        algorithmExecutionTime = str(datetime.now() - loopStartTime)
        
        results_list = results_list.append({"Name":  algorithm.__name__,
                                            "Precision": algorithmPrecision,
                                            "Recall": algorithmRecall,
                                            "F1": algorithmF1,
                                            "Support": algorithmSupportList,
#                                            "ConfusionMatrix": "",# confusion_matrix(TestingDataResults, algorithmPredictions),
                                            "ModelData" : algorithmObject,
                                            "ExecutionTime": algorithmExecutionTime, 
                                            }, ignore_index = True)
#         print("\tEnding " + str(algorithm.__name__) + " at " + str(datetime.now()) + "\n")
        
#         with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", 1000):
#             print(results_list.iloc[len(results_list)-1, :])

    # Set index to a meaningful value
    results_list.set_index("Name")
    print("Assessment Complete.")
    return results_list

In [8]:
#StarPolarity_Map = {1:-1.0, 2:-0.5, 3:0.0, 4:0.5, 5:1.0}
StarPolarity_Map = {1:2.0, 2:2.5, 3:3.0, 4:3.5, 5:4.0}
ClassifierResults_List = AssessClassifierModels(Train_X, Train_Y.map(StarPolarity_Map).apply(str).astype("category"), Test_X, Test_Y.map(StarPolarity_Map).apply(str).astype("category"), [
    XGBClassifier,
    MultinomialNB,
    GaussianNB,
    BernoulliNB,
    #KNeighborsClassifier,
    #DecisionTreeClassifier,
    ExtraTreeClassifier
])


Starting XGBClassifier at 2020-01-25 05:53:08.755046
Starting MultinomialNB at 2020-01-25 06:02:08.412566
Starting GaussianNB at 2020-01-25 06:02:11.383606
Starting BernoulliNB at 2020-01-25 06:02:36.073686
Starting DecisionTreeClassifier at 2020-01-25 06:03:25.876884
Starting ExtraTreeClassifier at 2020-01-25 06:06:50.863603
Assessment Complete.


In [9]:
with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", 1000):
    print(ClassifierResults_List[["Name", "Precision", "Recall", "F1", "Support", "ExecutionTime"]])

                     Name  \
0           XGBClassifier   
1           MultinomialNB   
2              GaussianNB   
3             BernoulliNB   
4  DecisionTreeClassifier   
5     ExtraTreeClassifier   

                                                                                                 Precision  \
0    [0.43410852713178294, 0.62510845045983, 0.46622542339887096, 0.45076060848678945, 0.5988313520048358]   
1    [0.32357043235704325, 0.6955237446034992, 0.4434848484848485, 0.5111710323574731, 0.6925350122060902]   
2    [0.2851963746223565, 0.6786147419485244, 0.3448133635801196, 0.39832775919732444, 0.4130954570535222]   
3  [0.30156537753222834, 0.6761139311900733, 0.37977315689981095, 0.45304172027200884, 0.3993250127356088]   
4    [0.21397849462365592, 0.5853964632059326, 0.2966114572253457, 0.3647869815798395, 0.5114114779525815]   
5    [0.19717376904393905, 0.4862628268785171, 0.2645865834633385, 0.3355742935278031, 0.4458374573848015]   

                         

In [10]:
import pickle
with open("Z:\\Downloads\\yelp_dataset\\yelp_dataset~\\ClassifierResults.dat", "wb") as filePath:
    pickle.dump(ClassifierResults_List, file=filePath, protocol = pickle.HIGHEST_PROTOCOL)

In [12]:
import gc
del ClassifierResults_List
gc.collect()
gc.collect()

0

In [5]:
import gc
#del TrainingData
#del WordCounts
gc.collect()
gc.collect()

0

## Regression Models

In [11]:
# Run through multiple Regression models and rank results
import pandas as pd

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

def AssessRegressionModels(x_train, y_train, x_test, y_test, algorithms_list):
    from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

    from datetime import datetime

    import warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)
    # results container
    results_list = pd.DataFrame()
    
    # calculated metrics and append to list
    functionStartTime = datetime.now()
    for algorithm in algorithms_list:
        algorithmObject = algorithm()
        if(str(algorithm.__name__) == "XGBRegressor"):
            algorithmObject = XGBRegressor(objective = "reg:squarederror", nthread=6)
            
        loopStartTime = datetime.now()
        print("Starting " + str(algorithm.__name__) + " at " + str(loopStartTime))

        algorithmObject.fit(x_train, y_train)
        y_predictor = algorithmObject.predict(x_test)
        
        algorithmExecutionTime = str(datetime.now() - loopStartTime)

        results_list = results_list.append({"Name" : algorithm.__name__, 
                                            "R2Score": r2_score(y_test, y_predictor),
                                            "RMSE": (mean_squared_error(y_test, y_predictor)),
                                            "MAE": mean_absolute_error(y_test, y_predictor),
                                            "ModelData" : algorithmObject,
                                            "RunTime" : algorithmExecutionTime,
                                            }, ignore_index = True)

        print("\tEnding " + str(algorithm.__name__) + " at " + str(datetime.now()))

    # sort list by r2 score
    results_list.sort_values(by=['R2Score'], inplace=True, ascending = False)

    return results_list

In [16]:
RegressorResults_List = AssessRegressionModels(Train_X, Train_Y, Test_X, Test_Y, [
    XGBRegressor,
    GradientBoostingRegressor,
    ExtraTreeRegressor,
    LinearRegression,
    Ridge,
    Lasso,
    ElasticNet,
])

with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", 1000):
    print(RegressorResults_List.drop("ModelData", axis = 1))

Starting XGBRegressor at 2020-01-26 14:33:46.632344
	Ending XGBRegressor at 2020-01-26 14:35:09.385311
Starting GradientBoostingRegressor at 2020-01-26 14:35:09.386310
	Ending GradientBoostingRegressor at 2020-01-26 14:43:50.886106
Starting ExtraTreeRegressor at 2020-01-26 14:43:50.887105
	Ending ExtraTreeRegressor at 2020-01-26 14:46:28.680589
Starting LinearRegression at 2020-01-26 14:46:28.680589
	Ending LinearRegression at 2020-01-26 14:46:45.443238
Starting Ridge at 2020-01-26 14:46:45.444238
	Ending Ridge at 2020-01-26 14:46:53.216283
Starting Lasso at 2020-01-26 14:46:53.216283
	Ending Lasso at 2020-01-26 14:46:57.712851
Starting ElasticNet at 2020-01-26 14:46:57.713851
	Ending ElasticNet at 2020-01-26 14:47:02.520330
        MAE                       Name   R2Score      RMSE         RunTime
4  0.408997                      Ridge  0.541973  0.252671  0:00:07.764048
3  0.408996           LinearRegression  0.541968  0.252674  0:00:16.757650
0  0.422029               XGBRegressor  

In [10]:
import pickle
with open("Z:\\Downloads\\yelp_dataset\\yelp_dataset~\\RegressorResults.dat", "wb") as filePath:
    pickle.dump(RegressorResults_List, file=filePath)

In [4]:
import pandas as pd
def AssessXGBRegression(x_train, y_train, x_test, y_test, Model):
    from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
    from xgboost import XGBRegressor
    from datetime import datetime
    import warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)

    loopStartTime = datetime.now()
    # results container
    results_list = pd.DataFrame( )#columns = ["Name", "R2 Score", "RMS Error", "Mean Absolute Error", "ModelData", "ExecutionTime"])


    print("Starting XGBRegressor at " + str(loopStartTime))

    Model.fit(x_train, y_train)
    xgbPredictor = Model.predict(x_test)

    loopEndTime = datetime.now()
    results_list = results_list.append({"Name" : "XGBRegressor",
                                        "R2Score": r2_score(y_test, xgbPredictor),
                                        "RMSE": (mean_squared_error(y_test, xgbPredictor)),
                                        "MAE": mean_absolute_error(y_test, xgbPredictor),
                                        "ModelData" : Model,
                                        "RunTime": str(loopEndTime - loopStartTime),
#                                         "LearningRate": str(LearningRate),
#                                         "MaxDepth":str(MaxDepth),
                                        }, ignore_index = True)

    print("\tEnding XGBRegressor at " + str(datetime.now()))

    return results_list

### Trialing different XGB parameters

In [3]:
#import pandas as pd
#XGBResults = pd.DataFrame()
from xgboost import XGBRegressor
with open("P:\\Temp\\XGBRegressorResults.dat", "wb") as filePath:
    # Instantiate model parameters to vary
    estimator_List = [10, 50, 100, 200, 400]
    learningRate_List = [0.01, 0.05, 0.1, 0.33, 0.5]
    maxDepth_List = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    
    # Model every combination
    for estimatorsCount in estimator_List:
        import warnings
        warnings.simplefilter(action='ignore', category=FutureWarning)
        for learningRate in learningRate_List:
            for maxDepth in maxDepth_List:

                xgbModel = XGBRegressor(objective = "reg:squarederror",
                                        colsample_bytree = 1,
                                        colsample_bylevel = 1,
                                        colsample_bynode = 1,
                                        learning_rate = learningRate,
                                        max_depth = maxDepth,
                                        tree_method = "hist",
                                        grow_policy = "lossguide",
                                        n_estimators = estimatorsCount,
                                        nthread = 10,
                                       )
                testTrainResults = AssessXGBRegression(Train_X, Train_Y, Test_X, Test_Y, Model = xgbModel)
                testTrainResults["LearnRate"] = learningRate
                testTrainResults["MaxDepth"] = maxDepth
                testTrainResults["Estimators"] = estimatorsCount

                with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", 1000):
                    display(testTrainResults.drop(["Name", "ModelData"], axis = 1))

                #XGBResults = XGBResults.append(testTrainResults, ignore_index = True)

                pickle.dump(testTrainResults, file=filePath)
                
print("Loops Complete.")

Starting XGBRegressor at 2020-01-26 10:03:53.907183
	Ending XGBRegressor at 2020-01-26 10:04:12.862094


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,2.313265,-9.665592,5.883688,0:00:18.948913,0.01,3,10


Starting XGBRegressor at 2020-01-26 10:04:12.892085
	Ending XGBRegressor at 2020-01-26 10:04:32.366828


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,2.313266,-9.658454,5.87975,0:00:19.469745,0.01,4,10


Starting XGBRegressor at 2020-01-26 10:04:32.737710
	Ending XGBRegressor at 2020-01-26 10:04:52.788269


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,2.313289,-9.652688,5.876569,0:00:20.044561,0.01,5,10


Starting XGBRegressor at 2020-01-26 10:04:53.189140
	Ending XGBRegressor at 2020-01-26 10:05:14.144408


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,2.313295,-9.647512,5.873714,0:00:20.949270,0.01,6,10


Starting XGBRegressor at 2020-01-26 10:05:14.513290
	Ending XGBRegressor at 2020-01-26 10:05:36.568206


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,2.313364,-9.643554,5.87153,0:00:22.049917,0.01,7,10


Starting XGBRegressor at 2020-01-26 10:05:36.951083
	Ending XGBRegressor at 2020-01-26 10:06:00.354566


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,2.313387,-9.639607,5.869353,0:00:23.397484,0.01,8,10


Starting XGBRegressor at 2020-01-26 10:06:00.740441
	Ending XGBRegressor at 2020-01-26 10:06:25.583461


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,2.313413,-9.636019,5.867373,0:00:24.837022,0.01,9,10


Starting XGBRegressor at 2020-01-26 10:06:25.974336
	Ending XGBRegressor at 2020-01-26 10:06:53.188593


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,2.31345,-9.632741,5.865565,0:00:27.208259,0.01,10,10


Starting XGBRegressor at 2020-01-26 10:06:53.573470
	Ending XGBRegressor at 2020-01-26 10:07:22.694116


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,2.313507,-9.629753,5.863917,0:00:29.115647,0.01,11,10


Starting XGBRegressor at 2020-01-26 10:07:23.092987
	Ending XGBRegressor at 2020-01-26 10:07:54.725826


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,2.313547,-9.62743,5.862635,0:00:31.626841,0.01,12,10


Starting XGBRegressor at 2020-01-26 10:07:55.139694
	Ending XGBRegressor at 2020-01-26 10:08:13.909664


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,1.532857,-4.126675,2.828137,0:00:18.763972,0.05,3,10


Starting XGBRegressor at 2020-01-26 10:08:14.316533
	Ending XGBRegressor at 2020-01-26 10:08:33.663319


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,1.532855,-4.101343,2.814163,0:00:19.341787,0.05,4,10


Starting XGBRegressor at 2020-01-26 10:08:34.051195
	Ending XGBRegressor at 2020-01-26 10:08:54.062766


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,1.53296,-4.080097,2.802442,0:00:20.006572,0.05,5,10


Starting XGBRegressor at 2020-01-26 10:08:54.460638
	Ending XGBRegressor at 2020-01-26 10:09:15.404911


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,1.532958,-4.060857,2.791828,0:00:20.938274,0.05,6,10


Starting XGBRegressor at 2020-01-26 10:09:15.787788
	Ending XGBRegressor at 2020-01-26 10:09:37.944670


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,1.533023,-4.044784,2.782962,0:00:22.150884,0.05,7,10


Starting XGBRegressor at 2020-01-26 10:09:38.336544
	Ending XGBRegressor at 2020-01-26 10:10:02.408812


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,1.533316,-4.032661,2.776274,0:00:24.066270,0.05,8,10


Starting XGBRegressor at 2020-01-26 10:10:02.800686
	Ending XGBRegressor at 2020-01-26 10:10:29.039764


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,1.533246,-4.01892,2.768693,0:00:26.233080,0.05,9,10


Starting XGBRegressor at 2020-01-26 10:10:29.458134
	Ending XGBRegressor at 2020-01-26 10:10:57.940984


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,1.533467,-4.008284,2.762826,0:00:28.476852,0.05,10,10


Starting XGBRegressor at 2020-01-26 10:10:58.335858
	Ending XGBRegressor at 2020-01-26 10:11:29.642801


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,1.533735,-3.99874,2.757561,0:00:31.300945,0.05,11,10


Starting XGBRegressor at 2020-01-26 10:11:30.042673
	Ending XGBRegressor at 2020-01-26 10:12:04.380643


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,1.533958,-3.99128,2.753446,0:00:34.332971,0.05,12,10


Starting XGBRegressor at 2020-01-26 10:12:04.788512
	Ending XGBRegressor at 2020-01-26 10:12:23.329555


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,0.927763,-1.2373,1.234209,0:00:18.535045,0.1,3,10


Starting XGBRegressor at 2020-01-26 10:12:23.752420
	Ending XGBRegressor at 2020-01-26 10:12:42.825293


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,0.924171,-1.200598,1.213963,0:00:19.067874,0.1,4,10


Starting XGBRegressor at 2020-01-26 10:12:43.209170
	Ending XGBRegressor at 2020-01-26 10:13:02.941831


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,0.921691,-1.169522,1.196819,0:00:19.726664,0.1,5,10


Starting XGBRegressor at 2020-01-26 10:13:03.346701
	Ending XGBRegressor at 2020-01-26 10:13:24.134529


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,0.920289,-1.145548,1.183594,0:00:20.782829,0.1,6,10


Starting XGBRegressor at 2020-01-26 10:13:24.519405
	Ending XGBRegressor at 2020-01-26 10:13:46.628303


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,0.917756,-1.120448,1.169748,0:00:22.103900,0.1,7,10


Starting XGBRegressor at 2020-01-26 10:13:47.020177
	Ending XGBRegressor at 2020-01-26 10:14:11.282384


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,0.916844,-1.102582,1.159892,0:00:24.257208,0.1,8,10


Starting XGBRegressor at 2020-01-26 10:14:11.685255
	Ending XGBRegressor at 2020-01-26 10:14:38.238725


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,0.916586,-1.087505,1.151575,0:00:26.547471,0.1,9,10


Starting XGBRegressor at 2020-01-26 10:14:38.631598
	Ending XGBRegressor at 2020-01-26 10:15:08.184105


Unnamed: 0,MAE,R2Score,RMSE,RunTime,LearnRate,MaxDepth,Estimators
0,0.916057,-1.073581,1.143894,0:00:29.547509,0.1,10,10


Starting XGBRegressor at 2020-01-26 10:15:08.586976


KeyboardInterrupt: 

In [None]:
XGBResults

In [None]:
testTrainResults

In [15]:
XGBResults = AssessXGBRegression(Train_X, Train_Y, Test_X, Test_Y,
                                 XGBRegressor(objective = "reg:squarederror",
                                              colsample_bytree = 1,
                                              colsample_bylevel = 1,
                                              colsample_bynode = 1,
                                              learning_rate = 0.0025,
                                              max_depth = 9,
                                              tree_method = "hist",
                                              grow_policy = "lossguide",
                                              n_estimators = 800,
                                              nthread = 6,
                                             )
                                )

with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", 1000):
    print(XGBResults.drop("ModelData", axis = 1))

Starting XGBRegressor at 2020-01-26 12:58:10.467368
	Ending XGBRegressor at 2020-01-26 13:10:05.905553
        MAE          Name   R2Score      RMSE         RunTime
0  0.530468  XGBRegressor  0.255024  0.410967  0:11:55.432187


In [14]:
import pickle
with open("XGBRegressorResults.dat", "wb") as filePath:
    pickle.dump(XGBResults, file=filePath)

In [6]:
import pandas as pd
XGBResults = pd.DataFrame()

for learningRate in [0.01, 0.05, 0.1, 0.33, 0.5]:
    from xgboost import XGBRegressor
    import warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)
    for treeDepth in [3, 6, 7, 8, 9, 10, 11, 12]:
        
        xgbModel = XGBRegressor(objective = "reg:squarederror",
                                colsample_bytree = 1,
                                colsample_bylevel = 1,
                                colsample_bynode = 1,
                                learning_rate = learningRate,
                                max_depth = treeDepth,
                                tree_method = "hist",
                                grow_policy = "lossguide",
                                n_estimators = 200,
                                nthread = 6,
                               )
        testTrainResults = AssessXGBRegression(Train_X, Train_Y, Test_X, Test_Y, Model = xgbModel)

        with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", 1000):
            print(testTrainResults.drop("ModelData", axis = 1))

        XGBResults.append(testTrainResults, ignore_index = True)

print("Loops Complete.")

Starting XGBRegressor at 2020-01-25 06:40:08.869883
	Learning Rate:  0.1 	Tree Depth:  6
	Ending XGBRegressor at 2020-01-25 06:42:26.360086
           Name  R2 Score  RMS Error  Mean Absolute Error   ExecutionTime  \
0  XGBRegressor  0.636572   0.801942             0.711581  0:02:17.474211   

  LearningRate MaxDepth  
0          0.1        6  
Starting XGBRegressor at 2020-01-25 06:42:26.882945
	Learning Rate:  0.1 	Tree Depth:  8
	Ending XGBRegressor at 2020-01-25 06:46:26.481334
           Name  R2 Score  RMS Error  Mean Absolute Error   ExecutionTime  \
0  XGBRegressor  0.654013   0.763456             0.688201  0:03:59.598389   

  LearningRate MaxDepth  
0          0.1        8  
Starting XGBRegressor at 2020-01-25 06:46:27.056147
	Learning Rate:  0.1 	Tree Depth:  12
	Ending XGBRegressor at 2020-01-25 06:59:06.268908
           Name  R2 Score  RMS Error  Mean Absolute Error   ExecutionTime  \
0  XGBRegressor  0.667487   0.733724             0.666632  0:12:39.197779   

  Learning

In [4]:
XGBResults.head()

In [None]:
import gc
#Del XGBResults
gc.collect()
gc.collect()


In [None]:
XGBResults = AssessXGBRegression(Train_X, Train_Y.apply(lambda x: x - 2.5), Test_X, Test_Y.apply(lambda x: x - 2.5), LearningRate = 0.2, MaxDepth = 9)

Starting XGBRegressor at 2020-01-25 08:26:04.262735
	Learning Rate:  0.2 	Tree Depth:  9


In [None]:
import pickle
with open("XGBRegressorResults.dat", "wb") as filePath:
    pickle.dump(XGBResults, file=filePath)

In [3]:
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
from xgboost import XGBRegressor
from datetime import datetime
import gc

def AssessXGBRegressionModels(x_train, y_train, x_test, y_test):
    import gc
    import warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)

    # results container
    results_list = pd.DataFrame( columns = ["Name", "R2 Score", "RMS Error", "Mean Absolute Error", "ModelData", "ExecutionTime"])

    for learningRate in [0.5, 0.1, 0.01]:
        for maxDepth in [3, 6, 9]:
            loopStartTime = datetime.now()

            xgbModel = XGBRegressor(objective = "reg:squarederror",
                                    colsample_bytree = 1,
                                    colsample_bylevel = 1,
                                    colsample_bynode = 1,
                                    learning_rate = learningRate,
                                    max_depth = maxDepth,
                                    tree_method = "hist",
                                    grow_policy = "lossguide",
                                    n_estimators = 200,
                                    nthread = 6,
                                   )

            print("Starting XGBRegressor at " + str(loopStartTime))
            print("\tLearning Rate: ", str(learningRate), "\tTree Depth: ", str(maxDepth))

            xgbModel.fit(x_train, y_train)
            xgbPredictor = xgbModel.predict(x_test)

            loopEndTime = datetime.now()
            results_list = results_list.append({"Name" : "XGBRegressor",
                                                "R2 Score": r2_score(y_test, xgbPredictor),
                                                "RMS Error": (mean_squared_error(y_test, xgbPredictor)),
                                                "Mean Absolute Error": mean_absolute_error(y_test, xgbPredictor),
                                                "ModelData" : xgbModel,
                                                "ExecutionTime": str(loopEndTime - loopStartTime),
                                                "LearningRate": str(learningRate),
                                                "MaxDepth":str(maxDepth),
                                                }, ignore_index = True)

            print("\tEnding XGBRegressor at " + str(datetime.now()))
            del xgbModel
            del xgbPredictor
            gc.collect()
    
    with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", 1000):
        print(results_list.drop("ModelData", axis = 1))

    gc.collect()
    gc.collect()
    print("Loops Complete.")

    return results_list