# Initialization

In [None]:
%load_ext autoreload
%autoreload 2
import sys, warnings, time, numpy, yaml, pandas, logging, random
from pathlib import Path
sys.path.append("../src/") # go to parent dir
from data_access import get_X, get_y, get_train_test
from models.factory import ModelFactory
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# `import logging` alone does not load the `logging.config` submodule; import
# it explicitly so `dictConfig` does not depend on some other package having
# imported it first.
import logging.config

# Read the logging setup from YAML and apply it.
with open('../confs/logs.yaml', 'rt') as f:
    config = yaml.safe_load(f)
logging.config.dictConfig(config)

In [None]:
# Selector passed as `type` to the data-access helpers and used in output
# paths further down. NOTE: the name shadows the builtin `type`; later cells
# read it as a global, so it is kept as is.
type='mix'
# Train/test split with train_size=0.8 and a fixed random_state.
X_train, y_train, X_test, y_test, target = get_train_test(train_size=0.8, random_state=42, type=type)
# Handed to ModelFactory as its fourth argument by the helper functions below.
train_scores = get_y('train', type)
# Passed to model.save() by eval_model when save_model is true.
test_data = get_X('test', type)

In [None]:
def eval_model(model, save_model=False, X=None):
    """Fit ``model``, score it on ``X`` and report the score and elapsed time.

    Args:
        model: Object exposing ``fit()``, ``evaluate(X)`` and ``name`` and,
            when ``save_model`` is true, ``save(data)``.
        save_model: When true, call ``model.save(test_data)`` after scoring.
        X: Data passed to ``model.evaluate``. ``None`` (the default) means the
            notebook-level ``X_test`` as it is at call time.

    Returns:
        dict with keys ``'name'``, ``'score'`` and ``'time'`` (seconds spent
        in fit + evaluate, rounded to 2 decimals).
    """
    if X is None:
        # Looked up at call time so that re-running the data-loading cell is
        # picked up without having to re-define this function.
        X = X_test
    # perf_counter is monotonic, so the duration cannot be skewed by
    # wall-clock adjustments.
    start = time.perf_counter()
    model.fit()
    score = model.evaluate(X)
    elapsed = numpy.round(time.perf_counter() - start, 2)
    logging.debug(f'{model.name}={score} in {elapsed}s')
    if save_model:
        model.save(test_data)
    return {'name': model.name, 'score': score, 'time': elapsed}

In [None]:
%load_ext autoreload
%autoreload 2
def eval_model_for_name(name, X=None, y=None):
    """Build the model registered under ``name`` and evaluate it.

    Args:
        name: Model key passed to ``ModelFactory.get_model``.
        X: Training features given to ``ModelFactory``. ``None`` (the default)
            means the notebook-level ``X_train`` as it is at call time.
        y: Training target given to ``ModelFactory``. ``None`` (the default)
            means the notebook-level ``y_train`` as it is at call time.

    Returns:
        The dict returned by ``eval_model`` for the built model.
    """
    # Resolve the defaults at call time so re-running the data-loading cell
    # is picked up without re-defining this function.
    if X is None:
        X = X_train
    if y is None:
        y = y_train
    with open('../confs/models.yaml', 'r') as file:
        configurations = yaml.safe_load(file)
    factory = ModelFactory(configurations, X, y, train_scores)
    model = factory.get_model(name)
    return eval_model(model)

In [None]:
def get_features(name = 'ada_boost'):
    """Fit the named model on the training split and list its features.

    The model is built from ``../confs/models.yaml`` through ``ModelFactory``,
    passed through ``eval_model`` and then asked for its feature importances.

    Returns:
        list: values of the ``'feature'`` column of the frame returned by
        ``model.get_feature_importances()``, in that frame's row order.
    """
    with open('../confs/models.yaml', 'r') as config_file:
        model_configs = yaml.safe_load(config_file)
    fitted = ModelFactory(model_configs, X_train, y_train, train_scores).get_model(name)
    # The returned metrics are discarded; the call is made for its effect on
    # the model.
    eval_model(fitted)
    importances = fitted.get_feature_importances()
    return list(importances['feature'])

In [None]:
# Candidate feature list used when a sweep is called with features=None:
# the 'feature' column of the importances reported by the 'ada_boost' model,
# in the order get_features returns it.
default_features=get_features('ada_boost')

In [None]:
def test_features(name = 'xgb_gblinear', save=True, features=None):
    """Greedy forward feature selection for one model.

    Candidates are tried one at a time, in the given order. Each trial fits
    the model on the features kept so far plus the candidate; the candidate is
    kept only when the trial's score is strictly higher than the best score
    seen so far.

    Args:
        name: Model key passed to ``ModelFactory.get_model``.
        save: When true, write the per-trial table to
            ``../data/output/{type}/features/{name}.csv``.
        features: Ordered candidate feature names. ``None`` (the default)
            means the notebook-level ``default_features`` at call time.

    Returns:
        pandas.DataFrame: one row per trial holding the ``eval_model`` fields
        plus ``nbr_features``, ``model``, ``feature`` and ``features``.
    """
    with open('../confs/models.yaml', 'r') as file:
        configurations = yaml.safe_load(file)
    if features is None:
        features = default_features
    lines = []
    # A candidate has to beat 0 to be selected, which presumes positive,
    # higher-is-better scores — TODO confirm against model.evaluate.
    best_score = 0
    selected_features = []
    # Counting from 1 keeps the model names 1-based while still visiting
    # every candidate, including the last one.
    for i, feature in enumerate(features, start=1):
        # Fresh list per trial: it is stored in the result row below.
        sub_features = selected_features + [feature]
        factory = ModelFactory(configurations, X_train[sub_features], y_train, train_scores)
        model = factory.get_model(name)
        model.name = f'{name}_{i}'
        line = eval_model(model, X=X_test[sub_features])
        line['nbr_features'] = len(sub_features)
        line['model'] = name
        line['feature'] = feature
        line['features'] = sub_features
        lines.append(line)
        if best_score < line['score']:
            selected_features.append(feature)
            best_score = line['score']
    df = pandas.DataFrame(lines)
    if save:
        output_path = Path(f'../data/output/{type}/features/{name}.csv')
        # to_csv does not create missing directories.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(output_path)
    logging.info(f'Features selected for score {name} - {best_score}: {len(selected_features)}, {selected_features}')
    return df

In [None]:
def plot_features(folder=None):
    """Plot the ``score`` column of every CSV in ``folder``, one line per file.

    Each line is labelled with the file name up to its first dot. Files are
    read in sorted name order; entries that do not end in ``.csv`` are
    skipped. When no CSV is found a warning is logged and nothing is plotted.

    Args:
        folder: Directory holding the per-model CSV files. ``None`` (the
            default) means ``../data/output/{type}/features/`` with the
            notebook-level ``type`` substituted at call time.

    Returns:
        None.
    """
    import logging
    import os
    import pandas

    if folder is None:
        # Built here rather than in the signature so that `{type}` is actually
        # substituted, using the value of `type` at call time.
        folder = f'../data/output/{type}/features/'

    scores = {}
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith('.csv'):
            continue
        model = filename.split('.')[0]
        # os.path.join works with or without a trailing slash on `folder`.
        scores[model] = pandas.read_csv(os.path.join(folder, filename))['score']

    if not scores:
        logging.warning(f'No CSV files found in {folder}; nothing to plot')
        return

    # Building the frame from a dict of Series keeps every row of every file,
    # padding shorter files with NaN.
    pandas.DataFrame(scores).plot.line()

# Feature importances

In [None]:
# Feature sweep for 'lightgbm'. features=None makes test_features fall back to
# default_features; save is left at its default, so the table is also written
# to CSV. The plot shows the score of each trial.
name = 'lightgbm'
features=None
df = test_features(name, features=features)
df[['score']].plot.line()

In [None]:
# Feature sweep for 'ada_boost' (features=None -> default_features inside
# test_features), followed by a line plot of the score of each trial.
name = 'ada_boost'
features=None
df = test_features(name, features=features)
df[['score']].plot.line()

In [None]:
# Feature sweep for 'xgb_gblinear' (features=None -> default_features inside
# test_features), followed by a line plot of the score of each trial.
name = 'xgb_gblinear'
features=None
df = test_features(name, features=features)
df[['score']].plot.line()

In [None]:
# Feature sweep for 'gradient_boosting' (features=None -> default_features
# inside test_features), followed by a line plot of the score of each trial.
name = 'gradient_boosting'
features=None
df = test_features(name, features=features)
df[['score']].plot.line()

In [None]:
# Feature sweep for 'hist_gradient_boosting' (features=None ->
# default_features inside test_features), followed by a line plot of the
# score of each trial.
name = 'hist_gradient_boosting'
features=None
df = test_features(name, features=features)
df[['score']].plot.line()

In [None]:
# Feature sweep for 'xgb_classifier' (features=None -> default_features inside
# test_features), followed by a line plot of the score of each trial.
name = 'xgb_classifier'
features=None
df = test_features(name, features=features)
df[['score']].plot.line()

In [None]:
# Feature sweep for 'catboost' (features=None -> default_features inside
# test_features), followed by a line plot of the score of each trial.
name = 'catboost'
features=None
df = test_features(name, features=features)
df[['score']].plot.line()

In [None]:
# Feature sweep for 'mlp' (features=None -> default_features inside
# test_features), followed by a line plot of the score of each trial.
name = 'mlp'
features=None
df = test_features(name, features=features)
df[['score']].plot.line()

In [None]:
# Overlay the score curves of all CSV files in the features/ output folder,
# which is where test_features writes when save is true.
plot_features(f'../data/output/{type}/features/')

In [None]:
# Same overlay for the all_features/ folder. NOTE(review): no visible cell
# writes to this folder — presumably produced by another run; confirm.
plot_features(f'../data/output/{type}/all_features/')