In [17]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (StandardScaler,
                                   RobustScaler,
                                   PolynomialFeatures)

from sklearn.pipeline import (make_pipeline,
                              make_union,
                              FeatureUnion)
from sklearn.compose import make_column_selector, make_column_transformer

from tpot.builtins import StackingEstimator, OneHotEncoder

from tpot.export_utils import set_param_recursive
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (VotingClassifier,
                              StackingClassifier,
                              BaggingClassifier,
                              RandomForestClassifier,
                              AdaBoostClassifier,
                              ExtraTreesClassifier,
                              GradientBoostingClassifier)

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from imblearn.over_sampling import SMOTE, SMOTENC, BorderlineSMOTE, SVMSMOTE
from sklearn.utils import shuffle
from sklearn.base import clone
from sklearn.decomposition import FastICA
from sklearn.metrics import (confusion_matrix,
                             ConfusionMatrixDisplay,
                             precision_score,
                             recall_score)
from scipy import stats

import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_columns', None)


In [None]:
    
def pay_method_models(pay_method):
    """Return the (unfitted) classifier configured for one payment method.

    Each supported payment-method code maps to its own estimator or
    pipeline with fixed hyper-parameters. ``StackingEstimator`` and
    ``OneHotEncoder`` are the ``tpot.builtins`` classes imported by this
    notebook.

    Parameters
    ----------
    pay_method : str
        One of ``'DDR'``, ``'DEO'``, ``'SDO'``, ``'DFB'`` or ``'VSO'``.

    Returns
    -------
    estimator
        An unfitted scikit-learn compatible classifier or pipeline.

    Raises
    ------
    ValueError
        If ``pay_method`` is not one of the supported codes. (Previously
        this surfaced as an ``UnboundLocalError`` at the ``return``.)
    """
    print("Payment Method: ", pay_method)

    if pay_method == 'DDR':
        clf = ExtraTreesClassifier(bootstrap=False,
                                   criterion="gini",
                                   max_features=0.1,
                                   min_samples_leaf=1,
                                   min_samples_split=3,
                                   n_estimators=100)

    elif pay_method == 'DEO':
        # Two stacked estimators feed their predictions forward as extra
        # features before the final gradient-boosting classifier.
        clf = make_pipeline(
            StackingEstimator(estimator=ExtraTreesClassifier(bootstrap=False,
                                                             criterion="entropy",
                                                             max_features=0.45,
                                                             min_samples_leaf=2,
                                                             min_samples_split=15,
                                                             n_estimators=100)),
            StackingEstimator(estimator=LinearSVC(C=20.0,
                                                  dual=False,
                                                  loss="squared_hinge",
                                                  penalty="l2",
                                                  tol=1e-05)),
            OneHotEncoder(minimum_fraction=0.1,
                          sparse=False,
                          threshold=10),
            GradientBoostingClassifier(learning_rate=0.1,
                                       max_depth=8,
                                       max_features=0.4,
                                       min_samples_leaf=16,
                                       min_samples_split=7,
                                       n_estimators=100,
                                       subsample=0.7000000000000001)
        )

    elif pay_method == 'SDO':
        clf = make_pipeline(
            PCA(iterated_power=7,
                svd_solver="randomized"),
            ExtraTreesClassifier(bootstrap=False,
                                 criterion="entropy",
                                 max_features=0.8,
                                 min_samples_leaf=1,
                                 min_samples_split=4,
                                 n_estimators=100)
        )

    elif pay_method == 'DFB':
        clf = make_pipeline(
            PCA(iterated_power=9,
                svd_solver="randomized"),
            GradientBoostingClassifier(learning_rate=0.5,
                                       max_depth=9,
                                       max_features=0.9500000000000001,
                                       min_samples_leaf=13,
                                       min_samples_split=7,
                                       n_estimators=100,
                                       subsample=0.7000000000000001)
        )

    elif pay_method == 'VSO':
        clf = make_pipeline(
            FastICA(tol=0.05),
            StackingEstimator(estimator=GradientBoostingClassifier(learning_rate=0.1,
                                                                   max_depth=9,
                                                                   max_features=0.2,
                                                                   min_samples_leaf=4,
                                                                   min_samples_split=20,
                                                                   n_estimators=100,
                                                                   subsample=1.0)),
            GradientBoostingClassifier(learning_rate=0.1,
                                       max_depth=9,
                                       max_features=0.1,
                                       min_samples_leaf=3,
                                       min_samples_split=15,
                                       n_estimators=100,
                                       subsample=0.5)
        )

    else:
        # Fail with an actionable message instead of falling through to
        # `return clf` with `clf` never assigned.
        raise ValueError(
            "Unknown payment method {!r}; expected one of "
            "'DDR', 'DEO', 'SDO', 'DFB', 'VSO'.".format(pay_method))

    return clf

In [19]:
def build_model(grid_search=False, _SMOTE=False):
    """Assemble the classifier pipeline and optionally tune it.

    A pool of candidate estimators, transformers and ensemble wrappers is
    instantiated so they can be swapped into the final pipeline by hand.
    As written, the returned pipeline contains only the random forest
    (``RFC``); the other objects are built but not used.

    Parameters
    ----------
    grid_search : bool, default False
        When True, run a ``RandomizedSearchCV`` over the distributions
        returned by ``get_gridsearch()`` and return the best pipeline found.
    _SMOTE : bool, default False
        Forwarded to ``get_data`` when loading training data for the search.
        Unused when ``grid_search`` is False. (The original body read an
        undefined ``_SMOTE`` name here.)

    Returns
    -------
    estimator
        The unfitted pipeline, or ``best_estimator_`` of the search when
        ``grid_search`` is True.
    """
    # ------------------------------------------------------------------
    # Candidate estimators
    # ------------------------------------------------------------------
    XGB = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                        colsample_bynode=1, colsample_bytree=1.0,
                        enable_categorical=False, eta=0.4, gamma=0.5, gpu_id=-1,
                        importance_type=None, interaction_constraints='',
                        learning_rate=0.400000006, max_delta_step=0, max_depth=6,
                        min_child_weight=1, monotone_constraints='()',
                        n_estimators=100, n_jobs=12, num_parallel_tree=4,
                        predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1,
                        #scale_pos_weight=1,
                        subsample=0.8, tree_method='exact',
                        validate_parameters=1, verbosity=None, num_class=1)

    ETC = ExtraTreesClassifier(bootstrap=True,
                               criterion="gini",
                               max_features=0.4,
                               min_samples_leaf=5,
                               min_samples_split=16,
                               n_estimators=100)

    ETC2 = ExtraTreesClassifier(bootstrap=True,
                                criterion="entropy",
                                max_features=0.8,
                                min_samples_leaf=2,
                                min_samples_split=2,
                                n_estimators=100)

    RFC = RandomForestClassifier(bootstrap=False,
                                 criterion="gini",
                                 max_depth=24,  # 24
                                 max_features=0.25,  # 0.25
                                 min_samples_leaf=1,
                                 min_samples_split=4,
                                 n_estimators=1000)  # 50

    GBC = GradientBoostingClassifier(learning_rate=0.1,
                                     max_depth=8,
                                     max_features=0.15000000000000002,
                                     min_samples_leaf=16,
                                     min_samples_split=10,
                                     n_estimators=100,
                                     subsample=0.25)

    svc = SVC()
    ABC = AdaBoostClassifier()
    DTC = DecisionTreeClassifier()
    KNC = KNeighborsClassifier()
    LR = LogisticRegression()
    MNB = MultinomialNB()
    CAT = CatBoostClassifier(silent=True)

    # ------------------------------------------------------------------
    # Transformers (currently not wired into the final pipeline)
    # ------------------------------------------------------------------
    # NOTE(review): `OneHotEncoder` resolves to the tpot.builtins class
    # imported by this notebook, not scikit-learn's - confirm it is the
    # intended encoder for object-dtype columns before enabling this step.
    Column_Transformer = make_column_transformer(
        (OneHotEncoder(),
         make_column_selector(dtype_include=object)))

    Transformers = [('col_trans', Column_Transformer),
                    #('ohe', OneHotEncoder()),
                    #('Scaler', RobustScaler()),
                    #('kernel_pca', KernelPCA()),
                    #('reduce_dim', PCA(.98)),
                    #('KBest', SelectKBest(chi2, k=35)), #.fit_transform(X, y)),
                    #('FEATURE', PolynomialFeatures(degree=2,include_bias=False))
                    ]
    # Combine transformed features
    Transformer_Union = FeatureUnion(Transformers)

    # ------------------------------------------------------------------
    # Ensembles (currently not wired into the final pipeline)
    # ------------------------------------------------------------------
    # Base learners for stacking/voting. XGB+RFC+CAT = 0.948
    base_learners = [
        ('XGB', XGB),  # 0.941 # 0.945
        #('ETC', ETC), # 0.930 # 0.938
        ('RFC', RFC),  # 0.943 # 0.947
        #('DTC', DTC), # 0.916 # 0.914
        #('GBC', GBC), # 0.932 # 0.938
        #('SVC', svc), # 0.830
        #('KNC', KNC), # 0.848
        #('ABC', ABC), # 0.906
        #('LR', LR),   # 0873
        #('MNB', MNB),  # 0.812
        ('CAT', CAT),  # ----- # 0.945
    ]

    # Stacking classifier with logistic regression as the meta learner.
    stack = StackingClassifier(estimators=base_learners,
                               final_estimator=LR,
                               verbose=2)  # Base=XGB=0.941. XGB+RFC+CAT = 0.950
    # Voting classifier
    vote = VotingClassifier(estimators=base_learners,  # XGB=0.941
                            voting='soft')
    # Bagging classifier
    bag = BaggingClassifier(base_estimator=CAT,  # XGB=0.940 RFC=0.941
                            n_estimators=100,  # maybe n_estimators is too small.
                            random_state=0)

    # ------------------------------------------------------------------
    # Final pipeline
    # ------------------------------------------------------------------
    # The leading None is an empty placeholder step; put `*Transformers`
    # in front of the estimator to unpack the list when not using the union.
    # The estimator's auto-generated step name, 'randomforestclassifier',
    # is what the keys in get_gridsearch() refer to.
    clf = make_pipeline(None, RFC)

    # ------------------------------------------------------------------
    # Optional randomised hyper-parameter search
    # ------------------------------------------------------------------
    if grid_search:
        # Only the training split is needed for the search.
        X_train, _, y_train, _, _ = get_data(_SMOTE=_SMOTE)
        search = RandomizedSearchCV(estimator=clf,
                                    param_distributions=get_gridsearch(),
                                    n_iter=100,  # 100
                                    cv=3,  # 3
                                    verbose=2,
                                    random_state=42,
                                    n_jobs=-1)
        search.fit(X_train, y_train)
        print("best_estimator_ \t\n:", search.best_estimator_)
        print("best_score_ \t\n", search.best_score_)
        print("best_params_ \t\n", search.best_params_)
        # Hand back the winning pipeline rather than the search wrapper.
        clf = search.best_estimator_
        print(clf)

    return clf

In [20]:
### Randomised Grid Search

def get_gridsearch():
    """Return the random-forest search space used by the randomised search.

    Keys carry the ``randomforestclassifier__`` prefix so they address the
    random-forest step of a pipeline built with ``make_pipeline``.

    Params example for nested classifiers.
    Use 'clf.get_params()' to get a list of the different parameters available.

        params = [{'votingclassifier__XGB__learning_rate':    [0.1, 0.01, 0.001, 0.0001],
                   'votingclassifier__XGB__min_child_weight': [1,2,3,4],
                   'votingclassifier__XGB__subsample':        [0.2, 0.4, 0.6, 0.8]}]

    Returns
    -------
    dict
        Maps each pipeline parameter name to the list of candidate values.
    """
    def _int_steps(low, high, count=10):
        # `count` evenly spaced values from low to high, truncated to ints.
        return [int(value) for value in np.linspace(low, high, num=count)]

    step_prefix = 'randomforestclassifier__'

    # Tree depth candidates, plus None for "no depth limit".
    depth_choices = _int_steps(1, 200)
    depth_choices.append(None)

    # NOTE(review): 'auto' is rejected by recent scikit-learn releases -
    # confirm against the installed version before changing the grid.
    search_space = {
        'n_estimators': _int_steps(1, 2000),   # number of trees in the forest
        'max_features': ['auto', 'sqrt'],      # features considered per split
        'max_depth': depth_choices,            # maximum number of levels per tree
        'min_samples_split': [2, 5, 10],       # min samples needed to split a node
        'min_samples_leaf': [1, 2, 4],         # min samples required at a leaf
        'bootstrap': [True, False],            # how samples are drawn for each tree
    }

    return {step_prefix + name: choices
            for name, choices in search_space.items()}


In [None]:
def train(grid_search=False, _SMOTE=False):
    """Fit the model from ``build_model`` and report train/test metrics.

    Parameters
    ----------
    grid_search : bool, default False
        Passed to ``build_model``; when True a randomised hyper-parameter
        search runs there before the final fit performed here.
    _SMOTE : bool, default False
        Forwarded to ``get_data``. ``build_model`` is called with
        ``grid_search`` only.

    Returns
    -------
    tuple
        ``(train_score, test_score, preds, clf, X_train, y_train, X_test,
        y_test, df)`` where the scores come from ``clf.score`` and ``preds``
        are the predictions for ``X_test``.
    """
    X_train, X_test, y_train, y_test, df = get_data(_SMOTE=_SMOTE)

    # Get model.
    clf = build_model(grid_search)

    # Train. Seed NumPy's global RNG immediately before fitting.
    np.random.seed(4)
    clf.fit(X_train, y_train)

    # Test.
    preds = clf.predict(X_test)
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    print()
    print("Train Score \t", train_score)
    print("Test Score \t", test_score)
    # Of all the cases predicted positive, how many were actually positive?
    print("precision_score ", precision_score(y_test, preds))
    # Of all the positive cases, how many did the model identify?
    print("recall_score \t", recall_score(y_test, preds))

    return train_score, test_score, preds, clf, X_train, y_train, X_test, y_test, df
    # 0.94644

In [None]:
# ---------------------------------------------------------------------------
# Load the training data and build the feature matrix.
# ---------------------------------------------------------------------------
data = pd.read_csv("../input/spaceship-titanic/train.csv")
# Rows with any missing value are discarded, so nothing below needs to
# handle NaNs.
data = data.dropna()

#display(data)
data2 = data.drop('Name', axis=1)

# Work on a copy and turn the boolean flags into 0/1 explicitly (this
# replaces the `data2 * 1` trick and the no-op `replace([0, 1], [0, 1])`).
data3 = data2.copy()
for flag_col in ['CryoSleep', 'VIP', 'Transported']:
    data3[flag_col] = data3[flag_col].astype(int)

# Split Cabin into deck (first character) and side (last character).
# Assigning the result back avoids chained `inplace=True`, which does not
# reliably update the parent frame; `astype(int)` fails loudly if a side
# other than 'P'/'S' appears.
data3['deck'] = data3['Cabin'].str[0]
data3['side'] = data3['Cabin'].str[-1].map({'P': 0, 'S': 1}).astype(int)
data3 = data3.drop('Cabin', axis=1)

# Drop the final character of the ID (original note: "remove individual
# identifier from end of ID"), strip underscores and parse as an integer.
data3['PassengerId'] = (data3['PassengerId'].astype(str)
                        .str[:-1]
                        .str.replace('_', '', regex=False)
                        .astype(int))

# Bucket the continuous columns. The top edge is open-ended (np.inf) so a
# value above the previous hard-coded maximum lands in the top bucket
# instead of becoming NaN and breaking `astype(int)`. Labels are unchanged.
data3['age_group'] = pd.cut(data3['Age'],
                            bins=[-1, 10, 19, 29, 49, np.inf],
                            labels=[10, 19, 29, 49, 80]).astype(int)

# Spend column -> label of its top bucket (order fixes the column order).
spend_top_labels = {'FoodCourt': 30000,
                    'RoomService': 10000,
                    'ShoppingMall': 15000,
                    'Spa': 30000,
                    'VRDeck': 21000}
for spend_col, top_label in spend_top_labels.items():
    data3[spend_col + '_group'] = pd.cut(data3[spend_col],
                                         bins=[-1, 0, 10, 100, 1000, np.inf],
                                         labels=[0, 10, 100, 1000, top_label]).astype(int)

# One-hot encode the nominal columns, then drop every raw column that has
# been replaced by an engineered one.
bob = pd.get_dummies(data3[['HomePlanet', 'Destination', 'deck']])

data3 = data3.drop(['HomePlanet', 'Destination', 'deck', 'Age',
                    'RoomService', 'FoodCourt', 'ShoppingMall',
                    'Spa', 'VRDeck'], axis=1)
data3 = data3.join(bob)

# Move the label to column 0 and reset the index so the positional join
# with the scaled frame below lines up row for row.
mid = data3.pop('Transported')
data3.insert(0, 'Transported', mid)
data3 = data3.reset_index(drop=True)

# Standardise the numeric/ordinal features.
# NOTE(review): the scaler is fitted on all rows before the train/test
# split below, so test-set statistics leak into the training features.
scaled_cols = ['PassengerId', 'age_group', 'FoodCourt_group',
               'RoomService_group', 'ShoppingMall_group', 'Spa_group',
               'VRDeck_group']
scaler = StandardScaler()
cat = pd.DataFrame(scaler.fit_transform(data3[scaled_cols]), columns=scaled_cols)
data3 = data3.drop(scaled_cols, axis=1)
data3 = data3.join(cat)

#display(data3.tail(50))

# The label is column 0; every remaining column is a feature. The previous
# slice `1:-1` also discarded the last feature column ('VRDeck_group').
X, y = data3.iloc[:, 1:], data3.iloc[:, 0]


X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.15,
                                                    shuffle=True,
                                                    stratify=y,
                                                    random_state=42)


In [14]:
# import pandas as pd
# print(len(iris))
# print(iris)
# # bob = pd.DataFrame(iris.data)
# bob

In [18]:
import sklearn
# train_test_split returns four objects for two inputs (train/test for X,
# then train/test for y); unpacking them into `X, y` raised a ValueError and
# would have overwritten the full feature matrix and labels.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.10,
                                                    shuffle=True,
                                                    stratify=y,
                                                    random_state=42,
                                                    )


TypeError: train_test_split() got an unexpected keyword argument 'seed'

In [22]:
import sys

# Environment check: list every entry on the module search path, one per line.
for search_entry in sys.path:
    print(search_entry)

/Users/dominic.mckean/Library/CloudStorage/OneDrive-SecureEngineering/notebooks
/opt/anaconda3/envs/test/lib/python37.zip
/opt/anaconda3/envs/test/lib/python3.7
/opt/anaconda3/envs/test/lib/python3.7/lib-dynload

/opt/anaconda3/envs/test/lib/python3.7/site-packages
/opt/anaconda3/envs/test/lib/python3.7/site-packages/IPython/extensions
/Users/dominic.mckean/.ipython
