In [1]:
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt

# models libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import VotingClassifier

# feature selection libraries
from boruta import BorutaPy
from scipy.stats import normaltest
from statsmodels.stats.outliers_influence import variance_inflation_factor  
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import VarianceThreshold

#### others
# additional, outlier detection?
from sklearn.ensemble import IsolationForest
# preprocessing
from sklearn.preprocessing import StandardScaler
# maybe in use - pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

# score
from sklearn.metrics import balanced_accuracy_score
import random

In [2]:
Xa_train = pd.read_csv("data/artificial_train.data", sep = " ", header = None).loc[:, 0:499]
ya_train = pd.read_csv("data/artificial_train.labels", sep = " ", header = None)
Xa_test = pd.read_csv("data/artificial_valid.data", sep = " ", header = None).loc[:, 0:499]

Xd_train = pd.read_csv("data/digits_train.data", sep = " ", header = None).loc[:, 0:4999]
yd_train = pd.read_csv("data/digits_train.labels", sep = " ", header = None)
Xd_test = pd.read_csv("data/digits_valid.data", sep = " ", header = None).loc[:, 0:4999]

# Variance Threshold 
Removing columns with 0 variance in test or train datasets.

In [55]:
np.where(Xd_train.var()== 0)

(array([ 111,  119,  196,  421,  479,  793,  876, 1037, 1040, 1179, 1267,
        1639, 1736, 1792, 1835, 1904, 2022, 2086, 2198, 2248, 2348, 2585,
        2604, 2686, 2691, 2811, 2901, 2909, 2952, 3007, 3026, 3157, 3193,
        3476, 3556, 3631, 3706, 3745, 3864, 4066, 4100, 4255, 4388, 4872,
        4964], dtype=int64),)

In [16]:
np.where(Xd_test.var()== 0)

(array([  18,   21,   29,   50,   63,  107,  119,  129,  137,  149,  157,
         175,  182,  242,  299,  339,  350,  421,  437,  460,  475,  479,
         484,  539,  565,  608,  620,  625,  628,  635,  647,  650,  664,
         674,  675,  705,  713,  737,  738,  744,  773,  775,  790,  793,
         809,  822,  850,  876,  899,  901,  913,  924,  945,  971,  972,
         975,  983,  994, 1004, 1010, 1076, 1107, 1117, 1123, 1136, 1140,
        1141, 1146, 1148, 1167, 1169, 1242, 1254, 1261, 1284, 1290, 1329,
        1397, 1412, 1434, 1455, 1483, 1489, 1505, 1510, 1514, 1527, 1538,
        1627, 1639, 1644, 1676, 1681, 1704, 1711, 1726, 1747, 1749, 1752,
        1774, 1792, 1797, 1808, 1825, 1827, 1832, 1835, 1888, 1994, 2000,
        2011, 2020, 2022, 2033, 2055, 2059, 2066, 2070, 2085, 2089, 2092,
        2093, 2097, 2108, 2115, 2166, 2182, 2193, 2198, 2204, 2216, 2236,
        2248, 2255, 2261, 2276, 2339, 2348, 2357, 2364, 2371, 2380, 2389,
        2396, 2423, 2425, 2428, 2432, 

In [56]:
a = np.where(Xd_train.var()== 0)
b = np.where(Xd_test.var()== 0)
x = np.append(a, b)
x = np.unique(x)

In [57]:
Xd_train = Xd_train.drop(x, axis = 1)
Xd_test = Xd_test.drop(x, axis = 1)

# Standardizing

In [58]:
scaler1 = StandardScaler(with_std = False)
scaler2 = StandardScaler(with_std = False)

scaler1.fit(Xa_train)
scaler2.fit(Xd_train)

columns1 = Xa_train.columns
columns2 = Xd_train.columns

Xa_train = pd.DataFrame(scaler1.transform(Xa_train))
Xa_test = pd.DataFrame(scaler1.transform(Xa_test))

Xd_train = pd.DataFrame(scaler2.transform(Xd_train))
Xd_test = pd.DataFrame(scaler2.transform(Xd_test))

Xa_train.columns = columns1
Xa_test.columns = columns1

Xd_train.columns = columns2
Xd_test.columns = columns2


# Feature selection methods - implementations

In [328]:
# removed up to 2*k features having lowest variance (union of 'up to k from train' and 'up to k from test')
def rm2KofLowestVariance(data_train, data_test, k):
    variances = data_train.var()
    sorted_variances = np.sort(variances)
    if(len(sorted_variances) < k + 1):
        return
    threshold = sorted_variances[k]
    aux = data_train.columns[np.where(variances < threshold)]

    variances = data_test.var()
    sorted_variances = np.sort(variances)
    if(len(sorted_variances) < k + 1):
        return
    threshold = sorted_variances[k]
    aux2 = data_test.columns[np.where(variances < threshold)]

    x = np.append(aux, aux2)
    x = np.unique(x)

    return data_train.drop(x,axis = 1), data_test.drop(x,axis = 1)

# removed up to 2*k features having lowest p_value of normality test (union of 'up to k from train' and 'up to k from test')
def rm2KofFeaturesFromNormalDistribution(data_train, data_test, k):
    tt = normaltest(data_train)
    p_values = tt[1]
    sorted_p = np.sort(p_values)
    if(len(sorted_p) < k + 1):
        return
    threshold = sorted_p[k]
    aux = data_train.columns[np.where(p_values < threshold)]

    tt = normaltest(data_test)
    p_values = tt[1]
    sorted_p = np.sort(p_values)
    if(len(sorted_p) < k + 1):
        return
    threshold = sorted_p[k]
    aux2 = data_test.columns[np.where(p_values < threshold)]

    x = np.append(aux, aux2)
    x = np.unique(x)

    return data_train.drop(x,axis = 1), data_test.drop(x,axis = 1)


In [342]:
def applyBorutaSelection(data_train, labels, data_test, classifModel):
    # define Boruta feature selection method
    feat_selector = BorutaPy(classifModel, n_estimators='auto', random_state=110)

    # find all relevant features - 5 features should be selected
    feat_selector.fit(data_train.values, labels.values.ravel())

    return data_train.iloc[:,feat_selector.support_], data_test.iloc[:,feat_selector.support_]

In [382]:
def leaveKusingVIF(data_train, data_test, k):
    while k < len(data_train.columns):
        ds=pd.Series([variance_inflation_factor(data_train.values, i)   
           for i in range(data_train.shape[1])],   
              index=data_train.columns)  
        ds2=pd.Series([variance_inflation_factor(data_test.values, i)   
               for i in range(data_test.shape[1])],   
              index=data_test.columns)

        if ds.values[np.argmax(ds)] >= ds2.values[np.argmax(ds2)]:
            col_to_be_deleted = data_train.columns[np.argmax(ds)]
        else:
            col_to_be_deleted = data_train.columns[np.argmax(ds2)]
        data_train = data_train.drop(col_to_be_deleted,axis=1)
        data_test = data_test.drop(col_to_be_deleted,axis=1)
    return data_train, data_test

def leaveKusingVIF_QUICK(data_train, data_test, k):
    while k < len(data_train.columns):
        ds=pd.Series([variance_inflation_factor(data_train.values, i)   
           for i in range(data_train.shape[1])],   
              index=data_train.columns)  
        col_to_be_deleted = data_train.columns[np.argmax(ds)]

        data_train = data_train.drop(col_to_be_deleted,axis=1)
        data_test = data_test.drop(col_to_be_deleted,axis=1)
    return data_train, data_test
    

In [259]:
dd1, dd2 = leaveKusingVIF(df1, df2, 5)

In [260]:
dd1

Unnamed: 0,48,204,318,336,433
0,-45.0005,-4.9565,-72.4,177.126,34.273
1,13.9995,6.0435,-17.4,-11.874,-56.727
2,-25.0005,-16.9565,19.6,-15.874,70.273
3,43.9995,-24.9565,-28.4,-82.874,66.273
4,-56.0005,19.0435,-24.4,-95.874,-36.727
...,...,...,...,...,...
1995,19.9995,39.0435,-65.4,-17.874,104.273
1996,-37.0005,18.0435,57.6,-238.874,-23.727
1997,58.9995,34.0435,57.6,57.126,72.273
1998,4.9995,21.0435,-26.4,67.126,109.273


# Defining Scores

In [62]:

# m - number of features in use
def score1(y_pred, y_true, m):
    BA = balanced_accuracy_score(y_true, y_pred) # changement of order
    if m > 5:
        BA = BA - 0.01*(m/5 - 1)
    return BA
def score2(y_pred, y_true, m):
    BA = balanced_accuracy_score(y_true, y_pred) # changement of order
    if m > 50:
        BA = BA - 0.01*(m/200 - 0.25)
    return BA
    
        

In [11]:
np.mean(np.unique([1,1,1,1,1,-1,-1,-1], return_counts = True)[0] == [-1,1])

1.0

# Defining models

In [None]:
# repeated
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import VotingClassifier

In [None]:
# RANDOM SEEDS!!!

In [16]:
rf = RandomForestClassifier(n_estimators = 50, random_state = 110, n_jobs= -1, max_depth = 5)
xg = GradientBoostingClassifier(learning_rate = 0.1, n_estimators = 50, max_depth = 5, random_state = 110)
lr = LogisticRegression(C = 1.0, random_state = 110)
svc = CalibratedClassifierCV(LinearSVC(random_state = 110, C = 1.0, max_iter= 1000))



In [115]:
models_dict = {
    "model1_rf": RandomForestClassifier(n_estimators = 20, random_state = 110, n_jobs= -1, max_depth = 4),
    "model2_rf": RandomForestClassifier(n_estimators = 50, random_state = 110, n_jobs= -1, max_depth = 5),
    "model3_rf": RandomForestClassifier(n_estimators = 80, random_state = 110, n_jobs= -1, max_depth = 6),
    "model1_xg": GradientBoostingClassifier(learning_rate = 0.1, n_estimators = 20, max_depth = 4, random_state = 110),
    "model2_xg": GradientBoostingClassifier(learning_rate = 0.1, n_estimators = 50, max_depth = 5, random_state = 110),
    "model3_xg": GradientBoostingClassifier(learning_rate = 0.1, n_estimators = 80, max_depth = 6, random_state = 110),
    "model4_xg": GradientBoostingClassifier(learning_rate = 0.01, n_estimators = 50, max_depth = 5, random_state = 110),
    "model5_xg": GradientBoostingClassifier(learning_rate = 0.3, n_estimators = 50, max_depth = 5, random_state = 110),
    "model1_lr": LogisticRegression(C = 0.4, random_state = 110),
    "model2_lr": LogisticRegression(C = 1.0, random_state = 110),
    "model3_lr": LogisticRegression(C = 2.5, random_state = 110),
    "model1_svc": CalibratedClassifierCV(LinearSVC(random_state = 110, C = 0.4, max_iter= 1000)),
    "model2_svc": CalibratedClassifierCV(LinearSVC(random_state = 110, C = 1.0, max_iter= 1000)),
    "model3_svc": CalibratedClassifierCV(LinearSVC(random_state = 110, C = 2.5, max_iter= 1000)),
    "model1_ensemble_rfxglr": VotingClassifier([("m1",RandomForestClassifier(n_estimators = 80, random_state = 110, n_jobs= -1, max_depth = 5)),
                                         ("m2",GradientBoostingClassifier(learning_rate = 0.1, n_estimators = 50, max_depth = 5, random_state = 110)),
                                               ("m3", LogisticRegression(C = 1.0, random_state = 110))], 
                                        voting='soft'),
    "model2_ensemble_rflrsvc": VotingClassifier([("m1",RandomForestClassifier(n_estimators = 80, random_state = 110, n_jobs= -1, max_depth = 6)),
                                         ("m2",CalibratedClassifierCV(LinearSVC(random_state = 110, C = 1.0, max_iter= 1000))),
                                               ("m3",LogisticRegression(C = 1.0, random_state = 110))], 
                                        voting='soft'),
    "model3_ensemble_xglrsvc": VotingClassifier([("m1",GradientBoostingClassifier(learning_rate = 0.1, n_estimators = 80, max_depth = 6, random_state = 110)),
                                         ("m2",CalibratedClassifierCV(LinearSVC(random_state = 110, C = 1.0, max_iter= 1000))),
                                               ("m3",LogisticRegression(C = 1.0, random_state = 110))], 
                                        voting='soft'),
    "model4_ensemble_rfxgsvc": VotingClassifier([("m1",GradientBoostingClassifier(learning_rate = 0.1, n_estimators = 100, max_depth = 5, random_state = 110)),
                                         ("m2",CalibratedClassifierCV(LinearSVC(random_state = 110, C = 1.0, max_iter= 1000))),
                                               ("m3",RandomForestClassifier(n_estimators = 100, random_state = 110, n_jobs= -1, max_depth = 5))], 
                                        voting='soft')
}

In [139]:

variance_ks = [0,5,15,25,35,50,100]
normality_ks = [0,5,15,25,35,50,100]
boruta_flags = [True] # always use boruta
vif_ks = [5,6,7,8,9,10,12,15]

feature_selection_cases = np.array(np.meshgrid(variance_ks, normality_ks, boruta_flags, vif_ks)).T.reshape(-1,4)





In [140]:
feature_selection_cases.shape

(392, 4)

# Train/test split - 75%/25%

In [67]:
X1_train, X1_val, y1_train, y1_val =  train_test_split(Xa_train, ya_train,train_size = 0.75, random_state = 110)

In [68]:
X1_train.shape

(1500, 500)

In [69]:
y1_train.shape

(1500, 1)

In [70]:
X1_val.shape

(500, 500)

In [71]:
y1_val.shape

(500, 1)

# Learning

In [87]:
df = pd.DataFrame({"model_name": {},"variance_k":{}, "normality_k":{}, "boruta_flag": {}, "vif_k":{}, "score": {}, "chosen_columns":{}})

In [95]:
df.append({"model_name": 0, "variance_k": 0, "normality_k": 0, "boruta_flag": 0, "vif_k":0, "score": 0, "chosen_columns": 0}, ignore_index = True)

Unnamed: 0,model_name,variance_k,normality_k,boruta_flag,vif_k,score,chosen_columns
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
res = pd.DataFrame({"model_name": {},"variance_k":{}, "normality_k":{}, "boruta_flag": {}, "vif_k":{}, "score": {}, "chosen_columns": {}})

In [141]:
import warnings
warnings.filterwarnings("ignore")

random.seed(110)

data_train = X1_train
data_val = X1_val

# initial model for boruta
boruta_model = RandomForestClassifier(n_estimators = 50, random_state = 110, n_jobs= -1, max_depth = 5)

counter = 1
res = pd.DataFrame({"model_name": {},"variance_k":{}, "normality_k":{}, "boruta_flag": {}, "vif_k":{}, "score": {}, "chosen_columns": {}})


for case in feature_selection_cases:
    print("CASE: " + str(counter) + "/392")
    counter += 1
    
    # new data for each feature selection case:
    data_train = X1_train
    data_val = X1_val
    
    # FEATURE SELECTION METHODS:
    data_train, data_val = rm2KofLowestVariance(data_train, data_val, case[0])
    data_train, data_val = rm2KofFeaturesFromNormalDistribution(data_train, data_val, case[1])
    if case[2]:
        data_train, data_val = applyBorutaSelection(data_train, y1_train, data_val, boruta_model)
    data_train, data_val = leaveKusingVIF(data_train, data_val, case[3])
    
    chosen_columns = str(data_train.columns.values)[1:-1]
    
    m = data_train.shape[1]
    
    # STANDARDIZING
    scaler1 = StandardScaler()
    scaler1.fit(data_train)
    columns1 = data_train.columns
    data_train = pd.DataFrame(scaler1.transform(data_train))
    data_val = pd.DataFrame(scaler1.transform(data_val))
    data_train.columns = columns1
    data_val.columns = columns1    
    
    # ML learning
    print("Feature selection finished! m = ", m)
    
    for key, model in models_dict.items():
        model.fit(data_train, y1_train)
        predictions = model.predict(data_val)
        score = score1(predictions, y1_val, m)

        res = res.append({"model_name": key, "variance_k": case[0], "normality_k": case[1], "boruta_flag": case[2], "vif_k":case[3], "score": score, "chosen_columns": chosen_columns}, ignore_index = True)





CASE: 1/392
Feature selection finished! m =  5
CASE: 2/392
Feature selection finished! m =  5
CASE: 3/392
Feature selection finished! m =  5
CASE: 4/392
Feature selection finished! m =  5
CASE: 5/392
Feature selection finished! m =  5
CASE: 6/392
Feature selection finished! m =  5
CASE: 7/392
Feature selection finished! m =  4
CASE: 8/392
Feature selection finished! m =  5
CASE: 9/392
Feature selection finished! m =  5
CASE: 10/392
Feature selection finished! m =  5
CASE: 11/392
Feature selection finished! m =  5
CASE: 12/392
Feature selection finished! m =  5
CASE: 13/392
Feature selection finished! m =  5
CASE: 14/392
Feature selection finished! m =  5
CASE: 15/392
Feature selection finished! m =  5
CASE: 16/392
Feature selection finished! m =  5
CASE: 17/392
Feature selection finished! m =  5
CASE: 18/392
Feature selection finished! m =  5
CASE: 19/392
Feature selection finished! m =  5
CASE: 20/392
Feature selection finished! m =  5
CASE: 21/392
Feature selection finished! m =  5
C

CASE: 171/392
Feature selection finished! m =  8
CASE: 172/392
Feature selection finished! m =  8
CASE: 173/392
Feature selection finished! m =  8
CASE: 174/392
Feature selection finished! m =  6
CASE: 175/392
Feature selection finished! m =  6
CASE: 176/392
Feature selection finished! m =  8
CASE: 177/392
Feature selection finished! m =  8
CASE: 178/392
Feature selection finished! m =  8
CASE: 179/392
Feature selection finished! m =  8
CASE: 180/392
Feature selection finished! m =  8
CASE: 181/392
Feature selection finished! m =  7
CASE: 182/392
Feature selection finished! m =  5
CASE: 183/392
Feature selection finished! m =  8
CASE: 184/392
Feature selection finished! m =  8
CASE: 185/392
Feature selection finished! m =  8
CASE: 186/392
Feature selection finished! m =  8
CASE: 187/392
Feature selection finished! m =  8
CASE: 188/392
Feature selection finished! m =  7
CASE: 189/392
Feature selection finished! m =  5
CASE: 190/392
Feature selection finished! m =  8
CASE: 191/392
Featur

Feature selection finished! m =  12
CASE: 338/392
Feature selection finished! m =  12
CASE: 339/392
Feature selection finished! m =  11
CASE: 340/392
Feature selection finished! m =  9
CASE: 341/392
Feature selection finished! m =  8
CASE: 342/392
Feature selection finished! m =  7
CASE: 343/392
Feature selection finished! m =  4
CASE: 344/392
Feature selection finished! m =  15
CASE: 345/392
Feature selection finished! m =  13
CASE: 346/392
Feature selection finished! m =  11
CASE: 347/392
Feature selection finished! m =  10
CASE: 348/392
Feature selection finished! m =  8
CASE: 349/392
Feature selection finished! m =  8
CASE: 350/392
Feature selection finished! m =  4
CASE: 351/392
Feature selection finished! m =  15
CASE: 352/392
Feature selection finished! m =  12
CASE: 353/392
Feature selection finished! m =  11
CASE: 354/392
Feature selection finished! m =  9
CASE: 355/392
Feature selection finished! m =  9
CASE: 356/392
Feature selection finished! m =  7
CASE: 357/392
Feature se

In [145]:
res.sort_values("score", ascending = False).head(30) # with last step standardization

Unnamed: 0,model_name,variance_k,normality_k,boruta_flag,vif_k,score,chosen_columns
5171,model3_xg,100.0,0.0,1.0,10.0,0.847848,10 64 105 204 241 281 318 378 453 481
3533,model3_xg,0.0,0.0,1.0,9.0,0.845151,10 48 64 204 241 281 318 378 493
4163,model3_xg,50.0,0.0,1.0,9.0,0.845151,10 48 64 204 241 281 318 378 493
3911,model3_xg,25.0,0.0,1.0,9.0,0.845151,10 48 64 204 241 281 318 378 493
3785,model3_xg,15.0,0.0,1.0,9.0,0.845151,10 48 64 204 241 281 318 378 493
3659,model3_xg,5.0,0.0,1.0,9.0,0.845151,10 48 64 204 241 281 318 378 493
4037,model3_xg,35.0,0.0,1.0,9.0,0.845151,10 48 64 204 241 281 318 378 493
4813,model5_xg,25.0,5.0,1.0,10.0,0.84287,10 28 48 64 105 204 338 442 453 493
3155,model3_xg,35.0,0.0,1.0,8.0,0.840527,10 48 64 204 241 281 318 493
3281,model3_xg,50.0,0.0,1.0,8.0,0.840527,10 48 64 204 241 281 318 493


In [147]:
res.to_csv("results-artificial.csv")

# Calculating feature counts

In [163]:
aux = []

splitted_columns = res.loc[:,"chosen_columns"].map(lambda x: x.split(" "))

for sc in splitted_columns:
    aux = np.append(aux, sc)
    
aux2 = np.unique(aux, return_counts = True)

In [166]:
aux2

(array(['', '10', '105', '128', '153', '204', '241', '28', '281', '298',
        '318', '336', '338', '378', '410', '433', '442', '451', '453',
        '472', '48', '481', '493', '64'], dtype='<U32'),
 array([15138,  7056,  5130,  4770,   234,  7056,   612,   162,   738,
          288,  1008,  2232,  3906,   576,   144,   126,  1674,   324,
         1962,  2826,  1890,  3168,  1656,  6030], dtype=int64))

In [167]:
aux2[1]

array([15138,  7056,  5130,  4770,   234,  7056,   612,   162,   738,
         288,  1008,  2232,  3906,   576,   144,   126,  1674,   324,
        1962,  2826,  1890,  3168,  1656,  6030], dtype=int64)

In [173]:
u, count = aux2

count_sort_ind = np.argsort(-count)

best_columns = u[count_sort_ind][1:]

best_columns

array(['10', '204', '64', '105', '128', '338', '481', '472', '336', '453',
       '48', '442', '493', '318', '281', '241', '378', '451', '298',
       '153', '28', '410', '433'], dtype='<U32')

In [232]:
best_columns.astype(int)

array([ 10, 204,  64, 105, 128, 338, 481, 472, 336, 453,  48, 442, 493,
       318, 281, 241, 378, 451, 298, 153,  28, 410, 433])

# Hyperparameter tuning

In [233]:
# grid search:

learning_rates = [0.001,0.005, 0.01, 0.05, 0.1, 0.5]
n_ests = [10,30,50,70,90,100,120,150]
max_depths = [3,4,5,6,7,8]

ht_cases = np.array(np.meshgrid(learning_rates, n_ests, max_depths)).T.reshape(-1,3)


In [240]:
len(ht_cases)



0

In [241]:
import warnings
warnings.filterwarnings("ignore")

random.seed(110)

data_train = X1_train
data_val = X1_val

counter = 1
res2 = pd.DataFrame({"lr": {},"n_est":{}, "max_depth":{}, "m": {}, "score": {}})


for m in range(5,13):
    print("m: " + str(m) + "/12")
    counter += 1
    
    # new data for each feature selection case:
    data_train = X1_train
    data_val = X1_val
    
    # pick m best columns
    
    data_train = data_train.iloc[:, best_columns[0:m].astype(int)]
    data_val = data_val.iloc[:, best_columns[0:m].astype(int)]

    # STANDARDIZING
    scaler1 = StandardScaler()
    scaler1.fit(data_train)
    columns1 = data_train.columns
    data_train = pd.DataFrame(scaler1.transform(data_train))
    data_val = pd.DataFrame(scaler1.transform(data_val))
    data_train.columns = columns1
    data_val.columns = columns1    
    
    # hyperparam tuning
    happy_counter = 0
    for case in ht_cases:
        
        if happy_counter % 50 == 0:
            print("PROGRESS: " + str(100*happy_counter/len(ht_cases)) + " %")
        happy_counter += 1
        
        # getting best model from feature selection previous step
        xgboost = GradientBoostingClassifier(learning_rate = case[0], n_estimators = case[1].astype(int), max_depth = case[2].astype(int), random_state = 110)
        xgboost.fit(data_train, y1_train)
        predictions = xgboost.predict(data_val)
        score = score1(predictions, y1_val, m)

        res2 = res2.append({"lr": case[0], "n_est": case[1].astype(int), "max_depth": case[2].astype(int), "m": m, "score": score}, ignore_index = True)





m: 5/12
PROGRESS: 0.0 %
PROGRESS: 17.36111111111111 %
PROGRESS: 34.72222222222222 %
PROGRESS: 52.083333333333336 %
PROGRESS: 69.44444444444444 %
PROGRESS: 86.80555555555556 %
m: 6/12
PROGRESS: 0.0 %
PROGRESS: 17.36111111111111 %
PROGRESS: 34.72222222222222 %
PROGRESS: 52.083333333333336 %
PROGRESS: 69.44444444444444 %
PROGRESS: 86.80555555555556 %
m: 7/12
PROGRESS: 0.0 %
PROGRESS: 17.36111111111111 %
PROGRESS: 34.72222222222222 %
PROGRESS: 52.083333333333336 %
PROGRESS: 69.44444444444444 %
PROGRESS: 86.80555555555556 %
m: 8/12
PROGRESS: 0.0 %
PROGRESS: 17.36111111111111 %
PROGRESS: 34.72222222222222 %
PROGRESS: 52.083333333333336 %
PROGRESS: 69.44444444444444 %
PROGRESS: 86.80555555555556 %
m: 9/12
PROGRESS: 0.0 %
PROGRESS: 17.36111111111111 %
PROGRESS: 34.72222222222222 %
PROGRESS: 52.083333333333336 %
PROGRESS: 69.44444444444444 %
PROGRESS: 86.80555555555556 %
m: 10/12
PROGRESS: 0.0 %
PROGRESS: 17.36111111111111 %
PROGRESS: 34.72222222222222 %
PROGRESS: 52.083333333333336 %
PROGRESS:

In [243]:
res2.sort_values("score", ascending = False).head(20)

Unnamed: 0,lr,n_est,max_depth,m,score
2295,0.1,150.0,8.0,12.0,0.845012
2010,0.5,50.0,8.0,11.0,0.844644
2009,0.5,30.0,8.0,11.0,0.839224
2013,0.5,100.0,8.0,11.0,0.837097
2287,0.05,150.0,8.0,12.0,0.835779
2294,0.1,120.0,8.0,12.0,0.835097
2291,0.1,70.0,8.0,12.0,0.834856
2303,0.5,150.0,8.0,12.0,0.834615
2012,0.5,90.0,8.0,11.0,0.833082
2003,0.1,70.0,8.0,11.0,0.833082


In [250]:
res2.to_csv("results2-artificial.csv")

# Step 2 - hyperparameter tuning

In [245]:
# grid search:

learning_rates = [0.1, 0.3, 0.5, 0.7, 0.9]
n_ests = [100, 150, 200, 300, 400]
max_depths = [7,9,11,13]

ht_cases = np.array(np.meshgrid(learning_rates, n_ests, max_depths)).T.reshape(-1,3)

In [246]:
len(ht_cases)

100

In [247]:
column_cases = {"case1": best_columns[0:5].astype(int),
               "case2": best_columns[0:6].astype(int),
               "case3": best_columns[0:7].astype(int),
               "case4": best_columns[0:8].astype(int),
               "case5": best_columns[0:9].astype(int),
               "case6": best_columns[0:10].astype(int),
               "case7": best_columns[0:11].astype(int),
               "case8": best_columns[0:12].astype(int),
               "case9": best_columns[0:13].astype(int),
               "case10": best_columns[0:14].astype(int),
               "case11": best_columns[0:15].astype(int),
               "case12": [10, 48 ,64, 204, 241, 281, 318, 378, 493],
               "case13": [10, 64, 105, 204 ,241 ,281 ,318, 378, 453, 481],
               "case14": [10, 28, 48, 64, 105, 204, 338, 442, 453, 493]}

In [251]:
import warnings
warnings.filterwarnings("ignore")

random.seed(110)

data_train = X1_train
data_val = X1_val

res3 = pd.DataFrame({"lr": {},"n_est":{}, "max_depth":{}, "m": {}, "score": {}, "case": {}, "standardize": {}})


for case_col, best_cols in column_cases.items():
    print("case: " + case_col)
    
    # new data for each feature selection case:
    data_train = X1_train
    data_val = X1_val
    
    # pick m best columns
    
    data_train = data_train.iloc[:, best_cols]
    data_val = data_val.iloc[:, best_cols]

    # STANDARDIZING
    scaler1 = StandardScaler()
    scaler1.fit(data_train)
    columns1 = data_train.columns
    data_trains = pd.DataFrame(scaler1.transform(data_train))
    data_vals = pd.DataFrame(scaler1.transform(data_val))
    data_trains.columns = columns1
    data_vals.columns = columns1    
    
    # hyperparam tuning
    happy_counter = 0
    for case in ht_cases:
        
        if happy_counter % 50 == 0:
            print("PROGRESS: " + str(100*happy_counter/len(ht_cases)) + " %")
        happy_counter += 1
        
        # getting best model from feature selection previous step
        xgboost = GradientBoostingClassifier(learning_rate = case[0], n_estimators = case[1].astype(int), max_depth = case[2].astype(int), random_state = 110)
        xgboost.fit(data_train, y1_train)
        predictions = xgboost.predict(data_val)
        score = score1(predictions, y1_val, m)

        res3 = res3.append({"lr": case[0], "n_est": case[1].astype(int), "max_depth": case[2].astype(int), "m": m, "score": score, "case": case_col, "standardize": False}, ignore_index = True)

        xgboost.fit(data_trains, y1_train)
        predictions = xgboost.predict(data_vals)
        score = score1(predictions, y1_val, m)

        res3 = res3.append({"lr": case[0], "n_est": case[1].astype(int), "max_depth": case[2].astype(int), "m": m, "score": score, "case": case_col, "standardize": True}, ignore_index = True)





case: case1
PROGRESS: 0.0 %
PROGRESS: 50.0 %
case: case2
PROGRESS: 0.0 %
PROGRESS: 50.0 %
case: case3
PROGRESS: 0.0 %
PROGRESS: 50.0 %
case: case4
PROGRESS: 0.0 %
PROGRESS: 50.0 %
case: case5
PROGRESS: 0.0 %
PROGRESS: 50.0 %
case: case6
PROGRESS: 0.0 %
PROGRESS: 50.0 %
case: case7
PROGRESS: 0.0 %
PROGRESS: 50.0 %
case: case8
PROGRESS: 0.0 %
PROGRESS: 50.0 %
case: case9
PROGRESS: 0.0 %
PROGRESS: 50.0 %
case: case10
PROGRESS: 0.0 %
PROGRESS: 50.0 %
case: case11
PROGRESS: 0.0 %
PROGRESS: 50.0 %
case: case12
PROGRESS: 0.0 %
PROGRESS: 50.0 %
case: case13
PROGRESS: 0.0 %
PROGRESS: 50.0 %
case: case14
PROGRESS: 0.0 %
PROGRESS: 50.0 %


In [253]:
res3.to_csv("results3-artificial.csv")

In [272]:
res3.sort_values("score", ascending = False).head(20)

Unnamed: 0,lr,n_est,max_depth,m,score,case,standardize
2763,0.3,150.0,13.0,12.0,0.882788,case14,1.0
2762,0.3,150.0,13.0,12.0,0.880661,case14,0.0
2713,0.3,150.0,11.0,12.0,0.875723,case14,1.0
2712,0.3,150.0,11.0,12.0,0.873595,case14,0.0
2130,0.7,100.0,11.0,12.0,0.873114,case11,0.0
2707,0.1,300.0,11.0,12.0,0.873114,case14,1.0
2706,0.1,300.0,11.0,12.0,0.873114,case14,0.0
2704,0.1,200.0,11.0,12.0,0.873114,case14,0.0
2042,0.9,150.0,7.0,12.0,0.873114,case11,0.0
2043,0.9,150.0,7.0,12.0,0.873114,case11,1.0


# Hyperparameter Tuning 3

In [261]:
# grid search:

learning_rates = [0.23, 0.25, 0.28, 0.3, 0.32, 0.35, 0.38]
n_ests = [145, 150, 155]
max_depths = [13,14,15,16,17,18,19,20,21,22]

ht_cases = np.array(np.meshgrid(learning_rates, n_ests, max_depths)).T.reshape(-1,3)

column_cases = {"case14": [10, 28, 48, 64, 105, 204, 338, 442, 453, 493]}


In [260]:
len(ht_cases)

231

In [262]:
import warnings
warnings.filterwarnings("ignore")

random.seed(110)

data_train = X1_train
data_val = X1_val

res3_2 = pd.DataFrame({"lr": {},"n_est":{}, "max_depth":{}, "m": {}, "score": {}, "case": {}, "standardize": {}})


for case_col, best_cols in column_cases.items():
    print("case: " + case_col)
    
    # new data for each feature selection case:
    data_train = X1_train
    data_val = X1_val
    
    # pick m best columns
    
    data_train = data_train.iloc[:, best_cols]
    data_val = data_val.iloc[:, best_cols]

    # STANDARDIZING
    scaler1 = StandardScaler()
    scaler1.fit(data_train)
    columns1 = data_train.columns
    data_trains = pd.DataFrame(scaler1.transform(data_train))
    data_vals = pd.DataFrame(scaler1.transform(data_val))
    data_trains.columns = columns1
    data_vals.columns = columns1    
    
    # hyperparam tuning
    happy_counter = 0
    for case in ht_cases:
        
        if happy_counter % 50 == 0:
            print("PROGRESS: " + str(100*happy_counter/len(ht_cases)) + " %")
        happy_counter += 1
        
        # getting best model from feature selection previous step
        xgboost = GradientBoostingClassifier(learning_rate = case[0], n_estimators = case[1].astype(int), max_depth = case[2].astype(int), random_state = 110)
        xgboost.fit(data_train, y1_train)
        predictions = xgboost.predict(data_val)
        score = score1(predictions, y1_val, m)

        res3_2 = res3_2.append({"lr": case[0], "n_est": case[1].astype(int), "max_depth": case[2].astype(int), "m": m, "score": score, "case": case_col, "standardize": False}, ignore_index = True)

        xgboost.fit(data_trains, y1_train)
        predictions = xgboost.predict(data_vals)
        score = score1(predictions, y1_val, m)

        res3_2 = res3_2.append({"lr": case[0], "n_est": case[1].astype(int), "max_depth": case[2].astype(int), "m": m, "score": score, "case": case_col, "standardize": True}, ignore_index = True)



case: case14
PROGRESS: 0.0 %
PROGRESS: 23.80952380952381 %
PROGRESS: 47.61904761904762 %
PROGRESS: 71.42857142857143 %
PROGRESS: 95.23809523809524 %


In [264]:
res3_2.sort_values("score", ascending = False).head(10)

Unnamed: 0,lr,n_est,max_depth,m,score,case,standardize
21,0.3,150.0,13.0,12.0,0.882788,case14,1.0
23,0.3,155.0,13.0,12.0,0.880661,case14,1.0
20,0.3,150.0,13.0,12.0,0.880661,case14,0.0
19,0.3,145.0,13.0,12.0,0.880661,case14,1.0
22,0.3,155.0,13.0,12.0,0.878533,case14,0.0
18,0.3,145.0,13.0,12.0,0.878533,case14,0.0
41,0.38,155.0,13.0,12.0,0.827068,case14,1.0
83,0.38,155.0,14.0,12.0,0.826345,case14,1.0
39,0.38,150.0,13.0,12.0,0.825181,case14,1.0
40,0.38,155.0,13.0,12.0,0.82494,case14,0.0


# Optimal train_set size

In [299]:
training_sizes = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]


training_sizes = [0.7, 0.8, 0.9,0.95]

In [277]:
import warnings
warnings.filterwarnings("ignore")

#random.seed(110)

best_cols = [10, 28, 48, 64, 105, 204, 338, 442, 453, 493]

res4 = pd.DataFrame({"train_size":{}, "score": {}, "standardize": {}})
m = len(best_cols)
scaler1 = StandardScaler()

for ts in training_sizes:
    
    print("train_size: " + str(ts))
    
    for i in range(30):
        data_train, data_val, y1_train, y1_val =  train_test_split(Xa_train, ya_train,train_size = ts)

        # pick m best columns
        data_train = data_train.iloc[:, best_cols]
        data_val = data_val.iloc[:, best_cols]

        # STANDARDIZING

        scaler1.fit(data_train)
        columns1 = data_train.columns
        data_trains = pd.DataFrame(scaler1.transform(data_train))
        data_vals = pd.DataFrame(scaler1.transform(data_val))
        data_trains.columns = columns1
        data_vals.columns = columns1    


        # getting best model from feature selection previous step
        xgboost = GradientBoostingClassifier(learning_rate = 0.2, n_estimators = 50, max_depth = 10)
        xgboost.fit(data_train, y1_train)
        predictions = xgboost.predict(data_val)
        score = score1(predictions, y1_val, m)

        res4 = res4.append({"train_size": ts,  "score": score, "standardize": False}, ignore_index = True)

        xgboost.fit(data_trains, y1_train)
        predictions = xgboost.predict(data_vals)
        score = score1(predictions, y1_val, m)

        res4 = res4.append({"train_size": ts,  "score": score, "standardize": True}, ignore_index = True)




train_size: 0.05
train_size: 0.1
train_size: 0.15
train_size: 0.2
train_size: 0.25
train_size: 0.3
train_size: 0.35
train_size: 0.4
train_size: 0.45
train_size: 0.5
train_size: 0.55
train_size: 0.6
train_size: 0.65
train_size: 0.7
train_size: 0.75
train_size: 0.8
train_size: 0.85
train_size: 0.9
train_size: 0.95


In [300]:
import warnings
warnings.filterwarnings("ignore")

#random.seed(110)

best_cols = [10, 28, 48, 64, 105, 204, 338, 442, 453, 493]

res5 = pd.DataFrame({"train_size":{}, "score": {}, "standardize": {}})
m = len(best_cols)
scaler1 = StandardScaler()

for ts in training_sizes:
    
    print("train_size: " + str(ts))
    
    for i in range(30):
        data_train, data_val, y1_train, y1_val =  train_test_split(Xa_train, ya_train,train_size = ts)

        # pick m best columns
        data_train = data_train.iloc[:, best_cols]
        data_val = data_val.iloc[:, best_cols]

        # STANDARDIZING

        scaler1.fit(data_train)
        columns1 = data_train.columns
        data_trains = pd.DataFrame(scaler1.transform(data_train))
        data_vals = pd.DataFrame(scaler1.transform(data_val))
        data_trains.columns = columns1
        data_vals.columns = columns1    


        # getting best model from feature selection previous step
        xgboost = GradientBoostingClassifier(learning_rate = 0.2, n_estimators = 50, max_depth = 10)
        xgboost.fit(data_train, y1_train)
        predictions = xgboost.predict(data_val)
        score = score1(predictions, y1_val, m)

        res5 = res5.append({"train_size": ts,  "score": score, "standardize": False}, ignore_index = True)

        xgboost.fit(data_trains, y1_train)
        predictions = xgboost.predict(data_vals)
        score = score1(predictions, y1_val, m)

        res5 = res5.append({"train_size": ts,  "score": score, "standardize": True}, ignore_index = True)




train_size: 0.7
train_size: 0.8
train_size: 0.9
train_size: 0.95


In [303]:
df = res5.groupby(["train_size", "standardize"])["score"].agg(np.mean)

pd.DataFrame(df).sort_values("score", ascending = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,score
train_size,standardize,Unnamed: 2_level_1
0.95,0.0,0.864044
0.95,1.0,0.858357
0.9,0.0,0.851485
0.9,1.0,0.850095
0.8,1.0,0.849704
0.8,0.0,0.84874
0.7,1.0,0.842715
0.7,0.0,0.841353


In [298]:
res4.to_csv("results4-artificial.csv")

In [288]:
df = res4.groupby(["train_size", "standardize"])["score"].agg(np.mean)

In [290]:
df = pd.DataFrame(df).sort_values("score", ascending = False)

In [295]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,score
train_size,standardize,Unnamed: 2_level_1
0.95,0.0,0.832304
0.95,1.0,0.824567
0.9,0.0,0.817705
0.8,1.0,0.808058
0.9,1.0,0.80759
0.85,1.0,0.804915
0.85,0.0,0.801299
0.8,0.0,0.79742
0.7,0.0,0.795908
0.7,1.0,0.795809


In [276]:
res4.sort_values("score", ascending = False)

Unnamed: 0,train_size,score,standardize
85,0.75,0.886788,1.0
84,0.75,0.884661,0.0
96,0.85,0.872870,0.0
97,0.85,0.872870,1.0
99,0.85,0.862747,1.0
...,...,...,...
10,0.10,0.595412,0.0
7,0.10,0.595368,1.0
6,0.10,0.595368,0.0
8,0.10,0.591056,0.0


Model seems to be overfitting - Score of 0.88 is too optimistic, and max_depth too high to be used in test dataset.

# Hyperparameter tuning - omitting random seed (bacause of overfitting on validation dataset), replacing it with average of scores

In [336]:
# grid search:

learning_rates = [0.05, 0.15, 0.3, 0.5]
n_ests = [10,20,30,40,50,60,70]
max_depths = [3,4,5,6,7]

ht_cases = np.array(np.meshgrid(learning_rates, n_ests, max_depths)).T.reshape(-1,3)

In [310]:
column_cases = {"case1": best_columns[0:5].astype(int),
               "case2": best_columns[0:6].astype(int),
               "case3": best_columns[0:7].astype(int),
               "case4": best_columns[0:8].astype(int),
               "case5": best_columns[0:9].astype(int),
               "case6": best_columns[0:10].astype(int),
               "case7": best_columns[0:11].astype(int),
               "case8": best_columns[0:12].astype(int),
               "case9": best_columns[0:13].astype(int),
               "case10": best_columns[0:14].astype(int),
               "case11": best_columns[0:15].astype(int),
               "case12": [10, 48 ,64, 204, 241, 281, 318, 378, 493],
               "case13": [10, 64, 105, 204 ,241 ,281 ,318, 378, 453, 481],
               "case14": [10, 28, 48, 64, 105, 204, 338, 442, 453, 493]}

In [385]:

# grid search:

learning_rates = [0.05, 0.15, 0.3, 0.5]
n_ests = [10,20,30,40,50,60,70]
max_depths = [3,4,5,6,7]

ht_cases = np.array(np.meshgrid(learning_rates, n_ests, max_depths)).T.reshape(-1,3)

column_cases = {"case1": best_columns[0:5].astype(int),
               "case2": best_columns[0:6].astype(int),
               "case3": best_columns[0:7].astype(int),
               "case4": best_columns[0:8].astype(int),
               "case5": best_columns[0:9].astype(int),
               "case6": best_columns[0:10].astype(int),
               "case7": best_columns[0:11].astype(int),
               "case8": best_columns[0:12].astype(int),
               "case9": best_columns[0:13].astype(int),
               "case10": best_columns[0:14].astype(int),
               "case11": best_columns[0:15].astype(int),
               "case12": [10, 48 ,64, 204, 241, 281, 318, 378, 493],
               "case13": [10, 64, 105, 204 ,241 ,281 ,318, 378, 453, 481],
               "case14": [10, 28, 48, 64, 105, 204, 338, 442, 453, 493]}


import warnings
warnings.filterwarnings("ignore")

# random.seed(110)

res5_2 = pd.DataFrame({"lr": {},"n_est":{}, "max_depth":{}, "m": {}, "score": {}, "case": {}, "standardize": {}})

scaler1 = StandardScaler()

for case_col, best_cols in column_cases.items():
    print("case: " + case_col)
    
    m = len(best_cols)

    # STANDARDIZING
    
    #     scaler1.fit(data_train)
    #     columns1 = data_train.columns
    #     data_trains = pd.DataFrame(scaler1.transform(data_train))
    #     data_vals = pd.DataFrame(scaler1.transform(data_val))
    #     data_trains.columns = columns1
    #     data_vals.columns = columns1    
    
    # hyperparam tuning
    happy_counter = 0
    for case in ht_cases:
        
        if happy_counter % 50 == 0:
            print("PROGRESS: " + str(100*happy_counter/len(ht_cases)) + " %")
        happy_counter += 1

        scores = []
        scoress = []

        for i in range(30):
            
            data_train_, data_val_, y1_train, y1_val =  train_test_split(Xa_train, ya_train,train_size = ts)

            # pick m best columns
            data_train_ = data_train_.iloc[:, best_cols]
            data_val_ = data_val_.iloc[:, best_cols]
            
            # STANDARDIZING

            scaler1.fit(data_train_)
            columns1 = data_train_.columns
            data_trains = pd.DataFrame(scaler1.transform(data_train_))
            data_vals = pd.DataFrame(scaler1.transform(data_val_))
            data_trains.columns = columns1
            data_vals.columns = columns1   

            

            # getting best model from feature selection previous step
            xgboost = GradientBoostingClassifier(learning_rate = case[0], n_estimators = case[1].astype(int), max_depth = case[2].astype(int))
            xgboost.fit(data_train_, y1_train)
            predictions = xgboost.predict(data_val_)
            score = score1(predictions, y1_val, m)
            scores.append(score)


            xgboost.fit(data_trains, y1_train)
            predictions = xgboost.predict(data_vals)
            score = score1(predictions, y1_val, m)
            scoress.append(score)
            
        res5_2 = res5_2.append({"lr": case[0], "n_est": case[1].astype(int), "max_depth": case[2].astype(int), "m": m, "score": np.mean(scores), "case": case_col, "standardize": False}, ignore_index = True)
        res5_2 = res5_2.append({"lr": case[0], "n_est": case[1].astype(int), "max_depth": case[2].astype(int), "m": m, "score": np.mean(scoress), "case": case_col, "standardize": True}, ignore_index = True)


res5_2.to_csv("results5_2-artificial.csv")


case: case1
PROGRESS: 0.0 %
PROGRESS: 35.714285714285715 %
PROGRESS: 71.42857142857143 %
case: case2
PROGRESS: 0.0 %
PROGRESS: 35.714285714285715 %
PROGRESS: 71.42857142857143 %
case: case3
PROGRESS: 0.0 %
PROGRESS: 35.714285714285715 %
PROGRESS: 71.42857142857143 %
case: case4
PROGRESS: 0.0 %
PROGRESS: 35.714285714285715 %
PROGRESS: 71.42857142857143 %
case: case5
PROGRESS: 0.0 %
PROGRESS: 35.714285714285715 %
PROGRESS: 71.42857142857143 %
case: case6
PROGRESS: 0.0 %
PROGRESS: 35.714285714285715 %
PROGRESS: 71.42857142857143 %
case: case7
PROGRESS: 0.0 %
PROGRESS: 35.714285714285715 %
PROGRESS: 71.42857142857143 %
case: case8
PROGRESS: 0.0 %
PROGRESS: 35.714285714285715 %
PROGRESS: 71.42857142857143 %
case: case9
PROGRESS: 0.0 %
PROGRESS: 35.714285714285715 %
PROGRESS: 71.42857142857143 %
case: case10
PROGRESS: 0.0 %
PROGRESS: 35.714285714285715 %
PROGRESS: 71.42857142857143 %
case: case11
PROGRESS: 0.0 %
PROGRESS: 35.714285714285715 %
PROGRESS: 71.42857142857143 %
case: case12
PROGRE

In [426]:
res5_2.sort_values("score", ascending = False)

Unnamed: 0,lr,n_est,max_depth,m,score,case,standardize
3891,0.15,70.0,7.0,10.0,0.863337,case14,1.0
2765,0.15,40.0,7.0,14.0,0.861726,case10,1.0
3888,0.15,60.0,7.0,10.0,0.859985,case14,0.0
3621,0.30,50.0,7.0,10.0,0.859855,case13,1.0
3890,0.15,70.0,7.0,10.0,0.858655,case14,0.0
...,...,...,...,...,...,...,...
0,0.05,10.0,3.0,5.0,0.582403,case1,0.0
273,0.50,40.0,7.0,5.0,0.581454,case1,1.0
28,0.30,10.0,3.0,5.0,0.579912,case1,0.0
29,0.30,10.0,3.0,5.0,0.579570,case1,1.0


In [329]:
# special version for digits dataset - removes features with p value = 0 (there was plenty of them)
def rm2KofFeaturesFromNormalDistribution2(data_train, data_test):
    tt = normaltest(data_train)
    p_values = tt[1]

    aux = data_train.columns[np.where(p_values == 0)]

    tt = normaltest(data_test)
    p_values = tt[1]

    aux2 = data_test.columns[np.where(p_values == 0)]

    x = np.append(aux, aux2)
    x = np.unique(x)

    return data_train.drop(x,axis = 1), data_test.drop(x,axis = 1)

In [335]:

variance_ks = [0,10,20,40,80,150,300, 500]
normality_ks = [True, False]
boruta_flags = [True] # always use boruta
vif_ks = [50,52,54,56,58,60, 65, 70, 80, 100]

feature_selection_cases = np.array(np.meshgrid(variance_ks, normality_ks, boruta_flags, vif_ks)).T.reshape(-1,4)


160

In [345]:
# do not use - very slow - skip it

import warnings
warnings.filterwarnings("ignore")

#random.seed(110)


# initial model for boruta
boruta_model = RandomForestClassifier(n_estimators = 40, random_state = 110, n_jobs= -1, max_depth = 5)

counter = 1
res6 = pd.DataFrame({"m": {}, "model_name": {},"variance_k":{}, "normality_k":{}, "boruta_flag": {}, "vif_k":{}, "score": {}, "chosen_columns": {}, "standardize": {}})
scaler1 = StandardScaler()

for case in feature_selection_cases:
    print("CASE: " + str(counter) + "/160")
    counter += 1
    
    # new data for each feature selection case:
    data_train = Xd_train
    data_val = Xd_test
    
    # FEATURE SELECTION METHODS:
    data_train, data_val = rm2KofLowestVariance(data_train, data_val, case[0])
    if case[1]:
        data_train, data_val = rm2KofFeaturesFromNormalDistribution2(data_train, data_val )
    if case[2]:
        data_train, data_val = applyBorutaSelection(data_train, yd_train, data_val, boruta_model)
    data_train, data_val = leaveKusingVIF(data_train, data_val, case[3])
    
    chosen_columns = str(data_train.columns.values)[1:-1]
    best_cols = data_train.columns.values
    
    m = data_train.shape[1]
    
    scores = []
    scoress = []
    
    print("Started ML phase! (30 iterations)")

    for i in range(30):

        data_train, data_val, y1_train, y1_val =  train_test_split(Xd_train, yd_train,train_size = 0.8)

        # pick m best columns
        data_train = data_train.iloc[:, best_cols]
        data_val = data_val.iloc[:, best_cols]

        # STANDARDIZING

        scaler1.fit(data_train)
        columns1 = data_train.columns
        data_trains = pd.DataFrame(scaler1.transform(data_train))
        data_vals = pd.DataFrame(scaler1.transform(data_val))
        data_trains.columns = columns1
        data_vals.columns = columns1   



        # getting best model from feature selection previous step
        xgboost = GradientBoostingClassifier(learning_rate = 0.2, n_estimators = 40, max_depth = 5)
        xgboost.fit(data_train, y1_train)
        predictions = xgboost.predict(data_val)
        score = score2(predictions, y1_val, m)
        scores.append(score)


        xgboost.fit(data_trains, y1_train)
        predictions = xgboost.predict(data_vals)
        score = score2(predictions, y1_val, m)
        scoress.append(score)

    res6 = res6.append({"m": m,"model_name": "xgb", "variance_k": case[0], "normality_k": case[1], "boruta_flag": case[2], "vif_k":case[3], "score": np.mean(scoress), "chosen_columns": chosen_columns, "standardize": True}, ignore_index = True)
    res6 = res6.append({"m": m,"model_name": "xgb", "variance_k": case[0], "normality_k": case[1], "boruta_flag": case[2], "vif_k":case[3], "score": np.mean(scores), "chosen_columns": chosen_columns, "standardize": False}, ignore_index = True)





CASE: 1/160


KeyboardInterrupt: 

In [360]:
Xd_train.shape

(6000, 4698)

In [361]:
data_val.shape

(1000, 606)

In [362]:
def applyBorutaSelection_(data_train, labels, data_test, classifModel):
    # define Boruta feature selection method
    feat_selector = BorutaPy(classifModel, n_estimators='auto', random_state=110, verbose=2)

    # find all relevant features - 5 features should be selected
    feat_selector.fit(data_train.values, labels.values.ravel())

    return data_train.iloc[:,feat_selector.support_], data_test.iloc[:,feat_selector.support_]

In [363]:
data_train, data_val = rm2KofLowestVariance(Xd_train, Xd_test, 500)
data_train, data_val = rm2KofFeaturesFromNormalDistribution2(data_train, data_val )
data_train, data_val = applyBorutaSelection_(data_train, yd_train, data_val, boruta_model)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	606
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	606
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	606
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	606
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	606
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	606
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	606
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	258
Tentative: 	33
Rejected: 	315
Iteration: 	9 / 100
Confirmed: 	258
Tentative: 	33
Rejected: 	315
Iteration: 	10 / 100
Confirmed: 	258
Tentative: 	33
Rejected: 	315
Iteration: 	11 / 100
Confirmed: 	258
Tentative: 	33
Rejected: 	315
Iteration: 	12 / 100
Confirmed: 	261
Tentative: 	30
Rejected: 	315
Iteration: 	13 / 100
Confirmed: 	261
Tentative: 	30
Rejected: 	315
Iteration: 	14 / 100
Confirmed: 	261
Tentative: 	30
Rejected: 	315
Iteration: 	15 / 100
Confirmed: 	261
Tentative: 	27
Rejected: 	318
Iteration: 	16 / 100
Conf

KeyboardInterrupt: 

In [384]:
import warnings
warnings.filterwarnings("ignore")

#random.seed(110)


res6 = pd.DataFrame({"m": {}, "model_name": {},"variance_k":{}, "normality_k":{}, "boruta_flag": {}, "vif_k":{}, "score": {}, "chosen_columns": {}, "standardize": {}})
scaler1 = StandardScaler()

for vif_k in range(260, 49, -5):
    print("vif_k: " + str(vif_k) )
    
    data_train, data_val = leaveKusingVIF_QUICK(data_train, data_val, vif_k)
    
    chosen_columns = str(data_train.columns.values)[1:-1]
    best_cols = data_train.columns.values
    
    m = data_train.shape[1]
    
    
    scores = []
    scoress = []
    
    print("Started ML phase! (10 iterations), m = " + str(m))

    for i in range(10):
        print(i)
        data_train_, data_val_, y1_train, y1_val =  train_test_split(data_train, yd_train,train_size = 0.8)


        # STANDARDIZING

        scaler1.fit(data_train_)
        columns1 = data_train_.columns
        data_trains = pd.DataFrame(scaler1.transform(data_train_))
        data_vals = pd.DataFrame(scaler1.transform(data_val_))
        data_trains.columns = columns1
        data_vals.columns = columns1   



        # getting best model from feature selection previous step
        xgboost = GradientBoostingClassifier(learning_rate = 0.2, n_estimators = 40, max_depth = 5)
        xgboost.fit(data_train_, y1_train)
        predictions = xgboost.predict(data_val_)
        score = score2(predictions, y1_val, m)
        scores.append(score)


        xgboost.fit(data_trains, y1_train)
        predictions = xgboost.predict(data_vals)
        score = score2(predictions, y1_val, m)
        scoress.append(score)

    res6 = res6.append({"m": m,"model_name": "xgb", "variance_k": 500, "normality_k": 1, "boruta_flag": 1, "vif_k":vif_k, "score": np.mean(scoress), "chosen_columns": chosen_columns, "standardize": True}, ignore_index = True)
    res6 = res6.append({"m": m,"model_name": "xgb", "variance_k": 500, "normality_k": 1, "boruta_flag": 1, "vif_k":vif_k, "score": np.mean(scores), "chosen_columns": chosen_columns, "standardize": False}, ignore_index = True)


res6.to_csv("results-digits.csv")


vif_k: 260
Started ML phase! (10 iterations), m = 260
0
1
2
3
4
5
6
7
8
9
vif_k: 255
Started ML phase! (10 iterations), m = 255
0
1
2
3
4
5
6
7
8
9
vif_k: 250
Started ML phase! (10 iterations), m = 250
0
1
2
3
4
5
6
7
8
9
vif_k: 245
Started ML phase! (10 iterations), m = 245
0
1
2
3
4
5
6
7
8
9
vif_k: 240
Started ML phase! (10 iterations), m = 240
0
1
2
3
4
5
6
7
8
9
vif_k: 235
Started ML phase! (10 iterations), m = 235
0
1
2
3
4
5
6
7
8
9
vif_k: 230
Started ML phase! (10 iterations), m = 230
0
1
2
3
4
5
6
7
8
9
vif_k: 225
Started ML phase! (10 iterations), m = 225
0
1
2
3
4
5
6
7
8
9
vif_k: 220
Started ML phase! (10 iterations), m = 220
0
1
2
3
4
5
6
7
8
9
vif_k: 215
Started ML phase! (10 iterations), m = 215
0
1
2
3
4
5
6
7
8
9
vif_k: 210
Started ML phase! (10 iterations), m = 210
0
1
2
3
4
5
6
7
8
9
vif_k: 205
Started ML phase! (10 iterations), m = 205
0
1
2
3
4
5
6
7
8
9
vif_k: 200
Started ML phase! (10 iterations), m = 200
0
1
2
3
4
5
6
7
8
9
vif_k: 195
Started ML phase! (10 itera

In [3]:
res6.to_csv("results-digits.csv")

#res6 = pd.read_csv("results-digits.csv", index_col = 0)

In [7]:
res6.sort_values("score", ascending= False).head(10)

Unnamed: 0,m,model_name,variance_k,normality_k,boruta_flag,vif_k,score,chosen_columns,standardize
9,240.0,xgb,500.0,1.0,1.0,240.0,0.964337,2 14 25 34 58 67 76 101 106 ...,0.0
8,240.0,xgb,500.0,1.0,1.0,240.0,0.963479,2 14 25 34 58 67 76 101 106 ...,1.0
36,170.0,xgb,500.0,1.0,1.0,170.0,0.962646,2 14 34 58 67 76 101 139 174 ...,1.0
37,170.0,xgb,500.0,1.0,1.0,170.0,0.962641,2 14 34 58 67 76 101 139 174 ...,0.0
33,180.0,xgb,500.0,1.0,1.0,180.0,0.962625,2 14 34 58 67 76 101 139 174 ...,0.0
14,225.0,xgb,500.0,1.0,1.0,225.0,0.962533,2 14 34 58 67 76 101 106 139 ...,1.0
32,180.0,xgb,500.0,1.0,1.0,180.0,0.962452,2 14 34 58 67 76 101 139 174 ...,1.0
54,125.0,xgb,500.0,1.0,1.0,125.0,0.961829,14 34 58 67 76 139 174 180 269 ...,1.0
15,225.0,xgb,500.0,1.0,1.0,225.0,0.961708,2 14 34 58 67 76 101 106 139 ...,0.0
20,210.0,xgb,500.0,1.0,1.0,210.0,0.961472,2 14 34 58 67 76 101 139 174 ...,1.0


# Hyperparameter tuning - digits dataset

In [454]:
# grid search:

learning_rates = [0.1, 0.3, 0.5]
n_ests = [30,40,50,60,70]
max_depths = [5,6,7]

ht_cases = np.array(np.meshgrid(learning_rates, n_ests, max_depths)).T.reshape(-1,3)

In [455]:
len(ht_cases)

45

In [386]:
aux = []

splitted_columns = res6.loc[:,"chosen_columns"].map(lambda x: x.split(" "))

for sc in splitted_columns:
    aux = np.append(aux, sc)
    
aux2 = np.unique(aux, return_counts = True)

In [387]:
u, count = aux2

count_sort_ind = np.argsort(-count)

best_columns = u[count_sort_ind][1:]

best_columns

array(['4779', '3051', '58', '14', '67', '180', '364', '4753', '2098',
       '1695', '3034', '2924', '2977', '2808', '3746', '3670', '3845',
       '2764', '493', '1870', '4301', '3550', '757', '3520', '1796',
       '1500', '1793', '2149', '4105', '3196', '2856', '2783', '2781',
       '1032', '3379', '4188', '1523', '4232', '76', '424', '1591', '278',
       '678', '3355', '487', '3097', '1618', '4672', '3725', '1922',
       '3638', '3897', '3774', '649', '2805', '1061', '1027', '558',
       '865', '3609', '269', '445', '2330', '3782', '2835', '1073', '576',
       '2801', '1296', '1716', '4615', '2982', '288', '4464', '1479',
       '1471', '1108', '1580', '3846', '174', '2051', '934', '2485',
       '4572', '2660', '2490', '2077', '3374', '34', '1983', '1654',
       '3847', '4651', '3700', '4315', '616', '2367', '2782', '1099',
       '2557', '4020', '1282', '2926', '4588', '2141', '1934', '139',
       '2048', '4791', '2057', '1377', '4573', '1495', '3650', '315',
       '2945

In [411]:
res6.sort_values("score", ascending = False).head(20)

Unnamed: 0,m,model_name,variance_k,normality_k,boruta_flag,vif_k,score,chosen_columns,standardize
9,240.0,xgb,500.0,1.0,1.0,240.0,0.964337,2 14 25 34 58 67 76 101 106 ...,0.0
8,240.0,xgb,500.0,1.0,1.0,240.0,0.963479,2 14 25 34 58 67 76 101 106 ...,1.0
36,170.0,xgb,500.0,1.0,1.0,170.0,0.962646,2 14 34 58 67 76 101 139 174 ...,1.0
37,170.0,xgb,500.0,1.0,1.0,170.0,0.962641,2 14 34 58 67 76 101 139 174 ...,0.0
33,180.0,xgb,500.0,1.0,1.0,180.0,0.962625,2 14 34 58 67 76 101 139 174 ...,0.0
14,225.0,xgb,500.0,1.0,1.0,225.0,0.962533,2 14 34 58 67 76 101 106 139 ...,1.0
32,180.0,xgb,500.0,1.0,1.0,180.0,0.962452,2 14 34 58 67 76 101 139 174 ...,1.0
54,125.0,xgb,500.0,1.0,1.0,125.0,0.961829,14 34 58 67 76 139 174 180 269 ...,1.0
15,225.0,xgb,500.0,1.0,1.0,225.0,0.961708,2 14 34 58 67 76 101 106 139 ...,0.0
20,210.0,xgb,500.0,1.0,1.0,210.0,0.961472,2 14 34 58 67 76 101 139 174 ...,1.0


In [430]:
case6 = np.unique(np.asarray(res6.sort_values("score", ascending = False).head(20)[["chosen_columns"]].iloc[0,0].replace("   ",",").replace("  ", ",").replace("\n", "").replace(" ", ",")[1:].split(",")).astype(int))
case7 = np.unique(np.asarray(res6.sort_values("score", ascending = False).head(20)[["chosen_columns"]].iloc[2,0].replace("   ",",").replace("  ", ",").replace("\n", "").replace(" ", ",")[1:].split(",")).astype(int))
case8 = np.unique(np.asarray(res6.sort_values("score", ascending = False).head(20)[["chosen_columns"]].iloc[4,0].replace("   ",",").replace("  ", ",").replace("\n", "").replace(" ", ",")[1:].split(",")).astype(int))
case9 = np.unique(np.asarray(res6.sort_values("score", ascending = False).head(20)[["chosen_columns"]].iloc[5,0].replace("   ",",").replace("  ", ",").replace("\n", "").replace(" ", ",")[1:].split(",")).astype(int))
case10 = np.unique(np.asarray(res6.sort_values("score", ascending = False).head(20)[["chosen_columns"]].iloc[7,0].replace("   ",",").replace("  ", ",").replace("\n", "").replace(" ", ",")[1:].split(",")).astype(int))


In [431]:
print(len(case6))
print(len(case7))
print(len(case8))
print(len(case9))
print(len(case10))

240
170
180
225
125


In [457]:
column_cases = {"case1": best_columns[0:50].astype(int),
               "case2": best_columns[0:60].astype(int),
               "case3": best_columns[0:70].astype(int),
               "case4": best_columns[0:80].astype(int),
               "case6": case6,
               "case7": case7,
               "case8": case8,
               "case9": case9,
               "case10": case10}

In [432]:
column_cases["case1"]

array([4779, 3051,   58,   14,   67,  180,  364, 4753, 2098, 1695, 3034,
       2924, 2977, 2808, 3746, 3670, 3845, 2764,  493, 1870, 4301, 3550,
        757, 3520, 1796, 1500, 1793, 2149, 4105, 3196, 2856, 2783, 2781,
       1032, 3379, 4188, 1523, 4232,   76,  424, 1591,  278,  678, 3355,
        487, 3097, 1618, 4672, 3725, 1922])

In [424]:
column_cases["case10"]

array([  14,   34,   58,   67,   76,  139,  174,  180,  269,  278,  288,
        295,  305,  315,  364,  424,  445,  467,  487,  493,  558,  576,
        616,  649,  678,  757,  803,  865,  934, 1027, 1032, 1061, 1073,
       1099, 1108, 1282, 1296, 1377, 1471, 1479, 1495, 1500, 1523, 1580,
       1588, 1591, 1618, 1654, 1695, 1716, 1793, 1796, 1870, 1922, 1934,
       1983, 2048, 2051, 2057, 2068, 2077, 2098, 2141, 2149, 2330, 2367,
       2485, 2490, 2557, 2660, 2764, 2781, 2782, 2783, 2801, 2805, 2808,
       2835, 2856, 2924, 2926, 2945, 2977, 2982, 3034, 3051, 3097, 3144,
       3196, 3355, 3374, 3379, 3468, 3520, 3550, 3609, 3638, 3650, 3670,
       3700, 3725, 3746, 3774, 3782, 3845, 3846, 3847, 3897, 4020, 4105,
       4188, 4232, 4301, 4315, 4464, 4572, 4573, 4588, 4615, 4651, 4672,
       4753, 4779, 4791, 4917])

In [None]:
Xd_train = pd.read_csv("data/digits_train.data", sep = " ", header = None).loc[:, 0:4999]
yd_train = pd.read_csv("data/digits_train.labels", sep = " ", header = None)
Xd_test = pd.read_csv("data/digits_valid.data", sep = " ", header = None).loc[:, 0:4999]

In [458]:
import warnings
warnings.filterwarnings("ignore")

# random.seed(110)

res6_2 = pd.DataFrame({"lr": {},"n_est":{}, "max_depth":{}, "m": {}, "score": {}, "case": {}, "standardize": {}})

scaler1 = StandardScaler()

for case_col, best_cols in column_cases.items():
    print("case: " + case_col)
    
    m = len(best_cols)

    # STANDARDIZING
    
    #     scaler1.fit(data_train)
    #     columns1 = data_train.columns
    #     data_trains = pd.DataFrame(scaler1.transform(data_train))
    #     data_vals = pd.DataFrame(scaler1.transform(data_val))
    #     data_trains.columns = columns1
    #     data_vals.columns = columns1    
    
    # hyperparam tuning
    happy_counter = 0
    for case in ht_cases:
        
        if happy_counter % 10 == 0:
            print("PROGRESS: " + str(100*happy_counter/len(ht_cases)) + " %")
        happy_counter += 1

        scores = []
        scoress = []

        for i in range(10):
            
            data_train, data_val, y1_train, y1_val =  train_test_split(Xd_train, yd_train,train_size = 0.8)

            # pick m best columns
            data_train = data_train.iloc[:, best_cols]
            data_val = data_val.iloc[:, best_cols]
            
            # STANDARDIZING

            scaler1.fit(data_train)
            columns1 = data_train.columns
            data_trains = pd.DataFrame(scaler1.transform(data_train))
            data_vals = pd.DataFrame(scaler1.transform(data_val))
            data_trains.columns = columns1
            data_vals.columns = columns1   

            

            # getting best model from feature selection previous step
            xgboost = GradientBoostingClassifier(learning_rate = case[0], n_estimators = case[1].astype(int), max_depth = case[2].astype(int))
            xgboost.fit(data_train, y1_train)
            predictions = xgboost.predict(data_val)
            score = score2(predictions, y1_val, m)
            scores.append(score)


            xgboost.fit(data_trains, y1_train)
            predictions = xgboost.predict(data_vals)
            score = score2(predictions, y1_val, m)
            scoress.append(score)
            
        res6_2 = res6_2.append({"lr": case[0], "n_est": case[1].astype(int), "max_depth": case[2].astype(int), "m": m, "score": np.mean(scores), "case": case_col, "standardize": False}, ignore_index = True)
        res6_2 = res6_2.append({"lr": case[0], "n_est": case[1].astype(int), "max_depth": case[2].astype(int), "m": m, "score": np.mean(scoress), "case": case_col, "standardize": True}, ignore_index = True)


res6_2.to_csv("results2-digits.csv")


case: case6
PROGRESS: 0.0 %
PROGRESS: 22.22222222222222 %
PROGRESS: 44.44444444444444 %
PROGRESS: 66.66666666666667 %
PROGRESS: 88.88888888888889 %
case: case7
PROGRESS: 0.0 %
PROGRESS: 22.22222222222222 %
PROGRESS: 44.44444444444444 %
PROGRESS: 66.66666666666667 %
PROGRESS: 88.88888888888889 %
case: case8
PROGRESS: 0.0 %
PROGRESS: 22.22222222222222 %
PROGRESS: 44.44444444444444 %
PROGRESS: 66.66666666666667 %
PROGRESS: 88.88888888888889 %
case: case9
PROGRESS: 0.0 %
PROGRESS: 22.22222222222222 %
PROGRESS: 44.44444444444444 %
PROGRESS: 66.66666666666667 %
PROGRESS: 88.88888888888889 %
case: case10
PROGRESS: 0.0 %
PROGRESS: 22.22222222222222 %
PROGRESS: 44.44444444444444 %
PROGRESS: 66.66666666666667 %
PROGRESS: 88.88888888888889 %


In [555]:
res6_2.sort_values("score", ascending = False).head(20)

Unnamed: 0,lr,n_est,max_depth,m,score,case,standardize
564,0.3,60.0,5.0,180.0,0.968909,case8,0.0
577,0.5,70.0,5.0,180.0,0.968569,case8,1.0
576,0.5,70.0,5.0,180.0,0.968042,case8,0.0
475,0.3,60.0,5.0,170.0,0.967635,case7,1.0
565,0.3,60.0,5.0,180.0,0.967593,case8,1.0
472,0.3,50.0,5.0,170.0,0.967586,case7,0.0
574,0.5,60.0,5.0,180.0,0.96747,case8,0.0
622,0.3,50.0,7.0,180.0,0.967338,case8,0.0
626,0.3,70.0,7.0,180.0,0.967255,case8,0.0
575,0.5,60.0,5.0,180.0,0.967223,case8,1.0


# Fitting best model - artificial

In [556]:
np.unique(ya_train,return_counts=True)

(array([-1,  1], dtype=int64), array([1000, 1000], dtype=int64))

In [601]:
best_columns_artificial = [10, 28, 48, 64, 105, 204, 338, 442, 453, 493]

random.seed(12948124)

while(True):
    xgboost = GradientBoostingClassifier(learning_rate = 0.15, n_estimators = 70, max_depth = 7)


    # pick m best columns
    data_train = Xa_train.iloc[:, best_columns_artificial]
    data_val = Xa_test.iloc[:, best_columns_artificial]

    # STANDARDIZING

    scaler1.fit(data_train)
    columns1 = data_train.columns
    data_trains = pd.DataFrame(scaler1.transform(data_train))
    data_vals = pd.DataFrame(scaler1.transform(data_val))
    data_trains.columns = columns1
    data_vals.columns = columns1 


    # getting best model from feature selection previous step
    xgboost.fit(data_trains, ya_train)
    final_pred1 = xgboost.predict(data_vals)
    if(all(np.unique(final_pred1, return_counts=True)[1] == [300,300])):
        break


In [603]:
np.unique(final_pred1, return_counts=True)

(array([-1,  1], dtype=int64), array([300, 300], dtype=int64))

In [602]:
final_pred1 = xgboost.predict(data_vals)
final_pred1_for_class1 = xgboost.predict_proba(data_vals)[:,1]

In [637]:
print(final_pred1_for_class1)
print(len(final_pred1_for_class1))
pd.DataFrame(final_pred1_for_class1).to_csv("PATWRO_artificial_prediction.txt", index=False)

[0.02550807 0.01352801 0.6400402  0.98950637 0.49279684 0.99193103
 0.11369741 0.00662171 0.38739941 0.98273366 0.97322846 0.96906869
 0.07652691 0.97503956 0.87012255 0.00961455 0.06520293 0.99274942
 0.09173147 0.01633236 0.56159898 0.0074684  0.09038846 0.87708801
 0.72365581 0.98818687 0.02245784 0.25943412 0.36996618 0.02203831
 0.04248064 0.9087953  0.85480717 0.14535989 0.99242469 0.03540018
 0.89898216 0.98224404 0.87281689 0.96780082 0.59825735 0.97813108
 0.87081635 0.94505914 0.09359547 0.99407662 0.01329385 0.63792277
 0.13561856 0.28127997 0.9713875  0.27015938 0.43475415 0.95376706
 0.97490824 0.02993033 0.98827186 0.6117387  0.809299   0.01624065
 0.96156504 0.02274247 0.60974324 0.78515598 0.361503   0.07183233
 0.37293392 0.94116034 0.61714622 0.70525897 0.95444175 0.76057486
 0.98288019 0.95801582 0.98882266 0.00690946 0.89502888 0.60113394
 0.98305917 0.1713756  0.08697661 0.95608753 0.69845186 0.9961858
 0.93946804 0.69038634 0.90664056 0.75423446 0.74239346 0.94658

In [639]:
pd.DataFrame(best_columns_artificial).to_csv("PATWRO_artificial_features.txt", index = False)

In [640]:
len(best_columns_artificial)

10

# Fitting best model - digits - did not change anything in predictions

In [460]:
# trying to make some improvements - Isolation forest

from sklearn.ensemble import IsolationForest

las = IsolationForest()
las.fit(Xd_test)





IsolationForest()

In [462]:
las.fit(Xd_test)
if_predictions = las.predict(Xd_test)



np.where(if_predictions == -1)

(array([ 80, 122, 129, 177, 214, 229, 331, 384, 652, 687, 787], dtype=int64),)

In [463]:
indexes = np.where(if_predictions == -1)

In [465]:
las.fit(Xd_train)
if_predictions = las.predict(Xd_train)
indexes_tr = np.where(if_predictions == -1)

In [466]:
indexes

(array([ 80, 122, 129, 177, 214, 229, 331, 384, 652, 687, 787], dtype=int64),)

In [467]:
indexes_tr

(array([   8,   46,   80,  214,  286,  295,  326,  439,  516,  544,  560,
         581,  603,  715,  718,  921,  962, 1259, 1378, 1499, 1579, 1605,
        1659, 1759, 1835, 1979, 2064, 2139, 2198, 2315, 2367, 2580, 2613,
        2692, 2757, 2790, 2845, 2919, 2955, 2966, 3205, 3252, 3357, 3397,
        3418, 3553, 3882, 3938, 3940, 4040, 4044, 4083, 4169, 4204, 4404,
        4487, 4589, 4627, 4803, 4847, 5043, 5215, 5223, 5236, 5274, 5279,
        5400, 5408, 5484, 5531, 5555, 5592, 5773, 5856, 5869, 5890, 5931,
        5952], dtype=int64),)

In [472]:
np.unique(yd_train.iloc[indexes_tr], return_counts = True)

(array([-1,  1], dtype=int64), array([63, 15], dtype=int64))

In [473]:
xgboost = GradientBoostingClassifier(learning_rate = 0.3, n_estimators = 60, max_depth = 5)

cc = column_cases["case8"]

# pick m best columns
data_train = Xd_train.iloc[:, cc]
data_test = Xd_test.iloc[:, cc]


# getting best model from feature selection previous step
xgboost.fit(data_train, yd_train)
final_pred2 = xgboost.predict(data_test)

In [474]:
np.unique(final_pred2[indexes],return_counts = True) # distribution 9:2 seems ok, the sae was in train dataset

(array([-1,  1], dtype=int64), array([9, 2], dtype=int64))

In [475]:
final_proba2 = xgboost.predict_proba(data_test)

In [478]:
final_proba2

array([[1.64523848e-01, 8.35476152e-01],
       [1.77551751e-04, 9.99822448e-01],
       [9.99635161e-01, 3.64838560e-04],
       ...,
       [3.25058829e-03, 9.96749412e-01],
       [9.99294331e-01, 7.05669456e-04],
       [9.99873286e-01, 1.26714048e-04]])

In [479]:
final_pred2

array([ 1,  1, -1,  1,  1, -1, -1, -1, -1, -1,  1,  1, -1,  1, -1,  1,  1,
        1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1, -1,  1,
        1,  1,  1,  1, -1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1, -1,
       -1, -1, -1,  1,  1,  1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1,
        1,  1, -1, -1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1,  1,
        1, -1,  1, -1,  1,  1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,
        1,  1,  1,  1, -1,  1,  1, -1,  1, -1,  1, -1,  1,  1, -1, -1, -1,
        1, -1,  1, -1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1, -1,  1,
        1,  1, -1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1, -1,  1, -1,  1,
        1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1,  1,
        1,  1, -1, -1,  1,  1,  1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1,
        1,  1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1,  1,  1, -1,  1,
       -1, -1,  1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1,
        1, -1,  1, -1, -1

In [480]:
final_pred2_for_class1 = final_proba2[:,1]

In [481]:
final_pred2_for_class1

array([8.35476152e-01, 9.99822448e-01, 3.64838560e-04, 9.98906915e-01,
       9.99752767e-01, 1.33785299e-04, 1.86762268e-04, 9.92905630e-05,
       3.12824418e-04, 4.25349261e-04, 9.92104818e-01, 9.97865414e-01,
       5.26128589e-03, 9.99913060e-01, 1.66168012e-03, 9.97646707e-01,
       9.99664162e-01, 9.99679394e-01, 9.72714327e-01, 9.99473807e-01,
       2.24588827e-03, 9.99211666e-01, 9.98461977e-01, 9.99643817e-01,
       9.97227692e-01, 7.11015498e-03, 9.99826743e-01, 9.99663178e-01,
       1.56720961e-02, 9.90996065e-01, 9.85119818e-01, 9.31977655e-01,
       1.25335216e-02, 9.99842750e-01, 9.55549589e-01, 9.99845725e-01,
       6.02265703e-01, 9.91710827e-01, 1.83752447e-02, 4.68202699e-04,
       9.99730023e-01, 6.62749444e-04, 9.82229934e-01, 9.99657717e-01,
       4.59192173e-04, 1.62844892e-04, 9.83932568e-01, 2.62438988e-02,
       9.68891741e-01, 1.09502117e-03, 5.52819533e-04, 5.78399520e-04,
       1.75258609e-04, 3.25616256e-04, 9.99065179e-01, 9.99929365e-01,
      

Trying to assign -1 predictions to observations where probability ~0.5 so that there will be a balance between predicted classes. Occurences of both will be 300. It could be done with consecutively training models and counting the classes' occurences just as in case of artificial data.

In [635]:
predd = xgboost.predict_proba(data_val)[:,1]

In [492]:
np.where(abs(predd-0.5) <= 0.25)

(array([2567, 2724, 5903], dtype=int64),)

In [636]:
np.unique(yd_train.iloc[np.where(abs(predd-0.5) <= 0.28)],return_counts = True)

(array([-1,  1], dtype=int64), array([297, 303], dtype=int64))

In [641]:
np.unique(final_pred2[np.where(abs(final_pred2_for_class1-0.5) <= 0.028)],return_counts = True)

(array([], dtype=int64), array([], dtype=int64))

In [None]:
yd_train

In [518]:
np.unique(xgboost.predict(data_train), return_counts = True)

(array([-1,  1], dtype=int64), array([3000, 3000], dtype=int64))

In [540]:
np.unique(xgboost.predict(data_test), return_counts = True)

(array([-1,  1], dtype=int64), array([498, 502], dtype=int64))

In [541]:
final_pred2_for_class1[np.where(abs(final_pred2_for_class1-0.5) <= 0.028)] = [0.4, 0.41]

array([], dtype=float64)

In [542]:
final_pred2_for_class1

array([8.35476152e-01, 9.99822448e-01, 3.64838560e-04, 9.98906915e-01,
       9.99752767e-01, 1.33785299e-04, 1.86762268e-04, 9.92905630e-05,
       3.12824418e-04, 4.25349261e-04, 9.92104818e-01, 9.97865414e-01,
       5.26128589e-03, 9.99913060e-01, 1.66168012e-03, 9.97646707e-01,
       9.99664162e-01, 9.99679394e-01, 9.72714327e-01, 9.99473807e-01,
       2.24588827e-03, 9.99211666e-01, 9.98461977e-01, 9.99643817e-01,
       9.97227692e-01, 7.11015498e-03, 9.99826743e-01, 9.99663178e-01,
       1.56720961e-02, 9.90996065e-01, 9.85119818e-01, 9.31977655e-01,
       1.25335216e-02, 9.99842750e-01, 9.55549589e-01, 9.99845725e-01,
       6.02265703e-01, 9.91710827e-01, 1.83752447e-02, 4.68202699e-04,
       9.99730023e-01, 6.62749444e-04, 9.82229934e-01, 9.99657717e-01,
       4.59192173e-04, 1.62844892e-04, 9.83932568e-01, 2.62438988e-02,
       9.68891741e-01, 1.09502117e-03, 5.52819533e-04, 5.78399520e-04,
       1.75258609e-04, 3.25616256e-04, 9.99065179e-01, 9.99929365e-01,
      

In [547]:
len(final_pred2_for_class1)

1000

In [549]:
pd.DataFrame(final_pred2_for_class1).to_csv("PATWRO_digits_prediction.txt", index=False)

In [551]:
cc = column_cases["case8"]

In [552]:
cc_df = pd.DataFrame(cc)

In [553]:
cc_df

Unnamed: 0,0
0,2
1,14
2,34
3,58
4,67
...,...
175,4779
176,4791
177,4885
178,4917


In [554]:
cc_df.to_csv("PATWRO_digits_features.txt", index=False)