In [1]:
%load_ext autoreload
%autoreload 2

# get project dir
# import standard libs
from IPython.display import display
from IPython.core.debugger import set_trace as bp
from pathlib import PurePath, Path
import sys
import time
from collections import OrderedDict as od
import re
import os
import json

pp = PurePath(Path.cwd()).parts[:]
pdir = PurePath(*pp)
data_script_dir = pdir / 'src' / 'data'
bars_script_dir = pdir / 'src' / 'features'
sys.path.append(data_script_dir.as_posix())
sys.path.append(bars_script_dir.as_posix())
viz_dir = pdir / 'reports' / 'figures'
data_dir = pdir / 'data'

# import python scientific stack
import pandas as pd
pd.set_option('display.max_rows', 100)
from dask import dataframe as dd
from dask.diagnostics import ProgressBar
from multiprocessing import cpu_count
pbar = ProgressBar()
pbar.register()
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
from numba import jit
import math

# import visual tools
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline
import seaborn as sns

plt.style.use('seaborn-talk')
plt.style.use('bmh')
#plt.rcParams['font.family'] = 'DejaVu Sans Mono'
plt.rcParams['font.size'] = 9.5
plt.rcParams['font.weight'] = 'medium'
plt.rcParams['figure.figsize'] = 10,7
blue, green, red, purple, gold, teal = sns.color_palette('colorblind', 6)

# import util libs
# from tqdm import tqdm, tqdm_notebook
import warnings
warnings.filterwarnings("ignore")
from utils import *
from bars import *
from labelling import *
from mpEngine import *
from sampleWeights import *
from ffd import *
from cvFin import *
from featureImportance import *
RANDOM_STATE = 777


In [2]:
trnsX, cont = getTestData(n_features = 10, n_informative = 5, n_redundant = 0, n_samples = 10000)

In [3]:
from hyperParaTuning import *
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

param_grid = {'svc__C': [1e-2, 1e-1, 1, 10, 100],
              'svc__gamma': [1e-2, 1e-1, 1, 10, 100]}
scoring_func = 'accuracy'

pipe_svc = make_pipeline(StandardScaler(),
                         SVC(random_state = RANDOM_STATE))


In [4]:
gs = GridSearchCV(estimator = pipe_svc,
                  param_grid = param_grid,
                  scoring = 'f1',
                  cv = 10,
                  fit_params = {'svc__sample_weight': cont['w']},
                  n_jobs = -1)

In [5]:
gs = gs.fit(trnsX, cont[['bin']])



In [6]:
gs.best_score_

0.861047296460995

In [7]:
gs.best_params_

{'svc__C': 100, 'svc__gamma': 0.1}

In [8]:
param_grid_rand = {'svc__C': logUniform(a = 1e-2, b= 1e2),
                   'svc__gamma': logUniform(a = 1e-2, b = 1e2)}

gs_rnd = RandomizedSearchCV(estimator = pipe_svc, 
                        param_distributions = param_grid_rand, 
                        scoring = scoring_func, cv = 10, n_jobs = -1, 
                        iid = False, n_iter = 25)

In [9]:
gs_rnd = gs_rnd.fit(trnsX, cont[['bin']])

In [10]:
gs_rnd.best_params_

{'svc__C': 12.185551272281483, 'svc__gamma': 0.037441092198679506}

In [11]:
gs_rnd.best_score_

0.8924787551787553

In [12]:
clf_rnd_best = gs_rnd.best_estimator_
clf_rnd_best = clf_rnd_best.fit(trnsX, cont[['bin']])

In [13]:
rndy = clf_rnd_best.predict(trnsX)

In [14]:
(rndy - cont['bin']).mean()/(rndy - cont['bin']).std()

0.12525650134716312

In [15]:
clf_best = gs.best_estimator_
clf_best = clf_best.fit(trnsX, cont[['bin']])
yhat = clf_best.predict(trnsX)
(yhat - cont['bin']).mean()/(yhat - cont['bin']).std()

0.07633705355138667