# Build a RandomSearch with the CV-Pipe from scratch using parallel processing

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
import optuna
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from copy import deepcopy
from sklearn.metrics import confusion_matrix
import seaborn as sns
from tqdm.notebook import tqdm
import concurrent

In [2]:
folder = '/home/olli/Projects/Kaggle/ICR'

In [3]:
train_csv = 'train.csv'
greek_csv = 'greeks.csv'
test_csv = 'test.csv'

In [4]:
train_csv_path = os.path.join(folder, train_csv)
greek_csv_path = os.path.join(folder, greek_csv)

In [5]:
df = pd.read_csv(train_csv_path)
df_g = pd.read_csv(greek_csv_path)

# Preprocess Pipeline

In [6]:
cat_features = ['EJ']
num_features = list(df.columns)
for remove_value in ['Id', 'EJ', 'Class']:
    num_features.remove(remove_value)

In [7]:
imputer_num = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')

In [8]:
# use a standardscaler due to the outliers
scaler_num = StandardScaler()

In [9]:
encoder_cat = OneHotEncoder(sparse_output=False)

In [10]:
num_pipeline = Pipeline([
    ('Num_Imputer', imputer_num),
    ('Num_Scaler', scaler_num)
])

In [11]:
cat_pipeline = Pipeline([
    ('Cat_Imputer', imputer_cat),
    ('Cat_Encoder', encoder_cat)
])

In [12]:
preprocess_pipe = ColumnTransformer([
    ('Num_Pipe', num_pipeline, num_features),
    ('Cat_Pipe', cat_pipeline, cat_features)
])

In [13]:
# merge dfs
X_DF = df.copy()
X_DF['Alpha'] = df_g.Alpha
X_DF['Gamma'] = df_g.Gamma

In [14]:
# metric
def balanced_log_loss(y_true, y_pred):
    nc = np.bincount(y_true)  # [num_class_0, num_class_1]
    return log_loss(y_true, y_pred, sample_weight=1/nc[y_true], eps=1e-15)

In [15]:
import warnings
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Single Funtion for a set of hyperparameters to parallelize

In [16]:
# calculates the score for a single set of random parameters

def single_score(C, kernel, gamma, degree, coef, weight_1):

    scores = []

    seeds = list(range(0, 10))

    for seed in seeds:

        cv = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

        # use alpha+gamma to draw stratified samples
        for train_index, valid_index in cv.split(X_DF, X_DF[['Alpha', 'Gamma']].copy()):

            # take all columns for the features since the ColumnTransformer will only select the defined ones
            X_train, y_train = X_DF.iloc[train_index], X_DF.loc[train_index, 'Class']
            X_valid, y_valid = X_DF.iloc[valid_index], X_DF.loc[valid_index, 'Class']

            # use defined pipeline
            X_train = preprocess_pipe.fit_transform(X_train)
            X_valid = preprocess_pipe.transform(X_valid)  # no fit

            y_train = np.array(y_train)
            y_valid = np.array(y_valid)

            svm = SVC(C=C,
                      kernel=kernel,
                      gamma=gamma,
                      degree=degree,
                      coef0=coef,
                      probability=True,
                      class_weight={0: 1, 1: weight_1})
                      

            svm.fit(X_train, y_train)

            y_val_pred = svm.predict_proba(X_valid)

            score = balanced_log_loss(y_valid, y_val_pred)
            scores.append(score)

    final_score = np.array(scores).mean()
    
    return final_score, (C, kernel, gamma, degree, coef, weight_1)

# Define the parameters to try

In [17]:
C_values = np.arange(0.1, 5, 0.1)
kernel_values = ['linear', 'poly', 'rbf', 'sigmoid']
gamma_values = np.arange(0.1, 10, 0.1)
degree_values = [1, 2, 3, 4, 5]
coef_values = np.arange(-1, 1, 0.01)
weight_1_values = np.arange(4, 10, 0.1)

# Define the RandomSearch

In [18]:
# run defined number of random trials

def randomSearch(tries=100,
                 kernel_values=kernel_values,
                 C_values=C_values,
                 degree_values=degree_values,
                 gamma_values=gamma_values,
                 coef_values=coef_values, 
                 weight_1_values=weight_1_values):
    
    try_parameter = []  # append parameter tuples here
    
    # for the defined number of trials first get random parameter combinations to execute in parallel
    for _ in range(tries):
        c = C_values[np.random.randint(len(C_values))]
        k = kernel_values[np.random.randint(len(kernel_values))]
        g = gamma_values[np.random.randint(len(gamma_values))]
        d = degree_values[np.random.randint(len(degree_values))]
        co = coef_values[np.random.randint(len(coef_values))]
        w = weight_1_values[np.random.randint(len(weight_1_values))]
        
        # add the random parameters for a single run
        try_parameter.append((c, k, g, d, co, w))
    
    with concurrent.futures.ProcessPoolExecutor() as executor:
        results = list(tqdm(executor.map(single_score, *zip(*try_parameter)), total=len(try_parameter)))
        
    best_score = 999
    best_index = None
    
    for index, result in enumerate(results):
        if result[0] < best_score:
            best_score = result[0]
            best_index = index
    
    print(f'Best Score: {best_score} at index {best_index}')
    
    return results

In [19]:
try_1_results = randomSearch(tries=1000)

  0%|          | 0/1000 [00:00<?, ?it/s]

Best Score: 0.5777533108849062 at index 272


1000 * 10 * 5 * 5 = 250.000 tries in ~7 min with only cpu is not bad

In [20]:
try_1_results[272]

(0.5777533108849062, (4.5, 'rbf', 0.1, 3, -0.94, 6.999999999999989))

# Round 2

In [21]:
C_values_2 = np.arange(0.1, 30, 0.1)
kernel_values_2 = ['rbf']
gamma_values_2 = np.arange(0.01, 1, 0.01)
degree_values_2 = [1]  # now useless 
coef_values_2 = np.arange(-1, -0.5, 0.01)
weight_1_values_2 = np.arange(6, 9, 0.1)

In [23]:
try_2_results = randomSearch(tries=1000, kernel_values=kernel_values_2, C_values=C_values_2, \
        gamma_values=gamma_values_2, degree_values=degree_values_2, coef_values=coef_values_2, \
        weight_1_values=weight_1_values_2)

  0%|          | 0/1000 [00:00<?, ?it/s]

Best Score: 0.455895150196935 at index 886


In [25]:
try_2_results[886]

(0.455895150196935,
 (5.4, 'rbf', 0.02, 1, -0.8099999999999998, 7.299999999999995))

# Round 3

In [26]:
C_values_3 = np.arange(3, 7, 0.1)
kernel_values_3 = ['rbf']
gamma_values_3 = np.arange(0.001, 0.04, 0.001)
degree_values_3 = [1]  # now useless 
coef_values_3 = np.arange(-0.9, -0.7, 0.01)
weight_1_values_3 = np.arange(6.8, 7.8, 0.05)

In [27]:
try_3_results = randomSearch(tries=1000, kernel_values=kernel_values_3, C_values=C_values_3, \
        gamma_values=gamma_values_3, degree_values=degree_values_3, coef_values=coef_values_3, \
        weight_1_values=weight_1_values_3)

  0%|          | 0/1000 [00:00<?, ?it/s]

Best Score: 0.45122001210498125 at index 944


In [29]:
try_3_results[944]

(0.45122001210498125,
 (5.000000000000002, 'rbf', 0.017, 1, -0.7399999999999999, 7.599999999999997))