In [1]:
import sys
sys.path.append('../')
from packages.hpo import bo_tpe, pso, smac
from packages.train import train_baseline, train_optimize
from packages.utils import create_report, read_mapped_dataset

Read datasets
- QC dataset for training 
- MLCQ dataset for testing/predicting

In [2]:
# Dataset roles for this notebook: models are fitted on QC and later scored on MLCQ.
TRAIN_DATASET, TEST_DATASET = 'QC', 'MLCQ'

# Each call returns a mapping of code-smell name -> table with an `is_smell` column
# (see the overview cells below, which iterate over the keys and read that column).
train_data = read_mapped_dataset(TRAIN_DATASET)
test_data = read_mapped_dataset(TEST_DATASET)

Loading Mapped QC dataset . . .
Done all mapped QC datasets are loaded !
Loading Mapped MLCQ dataset . . .
Done all mapped MLCQ datasets are loaded !


In [3]:
# Overview of the QC (training) data: list the available code smells, then for
# each one show the table shape and how the `is_smell` labels are distributed.
qc_smells = train_data.keys()
print(f'== QC info === \n{qc_smells}\n-------------')
for smell, qc_frame in train_data.items():
    print(smell, "shape:", qc_frame.shape)
    qc_label_counts = qc_frame["is_smell"].value_counts()
    print(f'\n{qc_label_counts}\n-------------')

== QC info === 
dict_keys(['Data_Class', 'Feature_Envy', 'God_Class', 'Long_Method'])
-------------
Data_Class shape: (420, 7)

False    280
True     140
Name: is_smell, dtype: int64
-------------
Feature_Envy shape: (420, 4)

False    280
True     140
Name: is_smell, dtype: int64
-------------
God_Class shape: (420, 7)

False    280
True     140
Name: is_smell, dtype: int64
-------------
Long_Method shape: (420, 4)

False    280
True     140
Name: is_smell, dtype: int64
-------------


In [4]:
# Overview of the MLCQ (test) data: list the available code smells, then for
# each one show the table shape and how the `is_smell` labels are distributed.
mlcq_smells = test_data.keys()
print(f'\n=== MLCQ info === \n {mlcq_smells}')
for smell, mlcq_frame in test_data.items():
    print(smell, "shape: ", mlcq_frame.shape)
    mlcq_label_counts = mlcq_frame["is_smell"].value_counts()
    print(f'\n{mlcq_label_counts}\n-------------')


=== MLCQ info === 
 dict_keys(['Data_Class', 'Feature_Envy', 'God_Class', 'Long_Method'])
Data_Class shape:  (2154, 7)

False    1870
True      284
Name: is_smell, dtype: int64
-------------
Feature_Envy shape:  (2035, 4)

False    1971
True       64
Name: is_smell, dtype: int64
-------------
God_Class shape:  (2122, 7)

False    1889
True      233
Name: is_smell, dtype: int64
-------------
Long_Method shape:  (2080, 4)

False    1834
True      246
Name: is_smell, dtype: int64
-------------


Experiment setup

In [5]:
# Experiment knobs, all consumed by the experiment loop in the next cell:
#   num_folds -> passed as `n_splits` to train_baseline / train_optimize
#   n_repeats -> passed as `n_repeats` to train_baseline / train_optimize
#   max_evals -> passed as `max_evals` to each HPO search (bo_tpe, pso, smac)
#   iters     -> how many times the whole experiment is repeated
num_folds, n_repeats, max_evals, iters = 10, 10, 10, 10

In [6]:
# Repeat the whole experiment `iters` times. Every repetition fits models on the
# QC data, scores them against the MLCQ data, uses its own model directory and
# hands one CSV path per approach (baseline, BO, PSO, SMAC) to create_report.
for i in range(iters):

    model_dir = "/transfer/{}".format(i)

    # Fresh result containers for this repetition, each keyed by code-smell name.
    baseline_models, bo_models, pso_models, smac_models = {}, {}, {}, {}
    bo_best, pso_best, smac_best = {}, {}, {}
    bo_compute_time, pso_compute_time, smac_compute_time = {}, {}, {}

    # One row per HPO method:
    # (label passed as `hpo`, search function, best-config dict, compute-time dict, model dict)
    hpo_runs = (
        ("bo", bo_tpe, bo_best, bo_compute_time, bo_models),
        ("pso", pso, pso_best, pso_compute_time, pso_models),
        ("smac", smac, smac_best, smac_compute_time, smac_models),
    )

    # Settings shared by every training call.
    cv_settings = dict(n_splits=num_folds, n_repeats=n_repeats, model_dir=model_dir)

    for smell in train_data:
        # Split features from the `is_smell` label for both datasets.
        X_train = train_data[smell].drop(columns=['is_smell'])
        y_train = train_data[smell]['is_smell']

        X_test = test_data[smell].drop(columns=['is_smell'])
        y_test = test_data[smell]['is_smell']

        print("-------------------------- Code smell : {} --------------------------".format(smell))

        # Baseline models, trained without any hyper-parameter search.
        baseline_models[smell] = train_baseline(X_train, y_train, smell=smell, **cv_settings)

        # Run all three searches first (BO, then PSO, then SMAC) ...
        for hpo_name, search, best, compute_time, models in hpo_runs:
            best[smell], compute_time[smell] = search(X_train, y_train, max_evals=max_evals)

        # ... and only afterwards train one set of models per search result.
        for hpo_name, search, best, compute_time, models in hpo_runs:
            models[smell] = train_optimize(
                X_train, y_train, conf=best[smell], smell=smell, hpo=hpo_name, **cv_settings
            )

        # Score every model on the test data. Each training call is indexed with
        # positions 0 and 1 here (the captured output shows a decision tree and a
        # random forest being exported per call — presumably those two; verify
        # against train_baseline / train_optimize).
        for model in range(2):
            for models in (baseline_models, bo_models, pso_models, smac_models):
                models[smell][model].set_prediction_score(X_test, y_test)

    # One report per approach; the HPO reports also receive the search compute times.
    create_report(baseline_models, report_name="/transfer/baseline_{}.csv".format(i))
    for models, compute_time, report_path in (
        (bo_models, bo_compute_time, "/transfer/bo_{}.csv"),
        (pso_models, pso_compute_time, "/transfer/pso_{}.csv"),
        (smac_models, smac_compute_time, "/transfer/smac_{}.csv"),
    ):
        create_report(models, compute_time, report_name=report_path.format(i))

-------------------------- Code smell : Data_Class --------------------------
creating baseline . . .
Model: /baseline_Data_Class_decision_tree.pkl exported !
Model: /baseline_Data_Class_random_forest.pkl exported !
Done ! 
100%|██████████| 10/10 [00:00<00:00, 33.12trial/s, best loss: -0.8357142857142857]
100%|██████████| 10/10 [00:06<00:00,  1.63trial/s, best loss: -0.8857142857142858]
Done !

Optmize using PSO . . .  Done !

Optmize using SMAC . . .  Done !
creating model using hyper-params from BO . . .
Model: /bo_Data_Class_decision_tree.pkl exported !
Model: /bo_Data_Class_random_forest.pkl exported !
Done ! 
creating model using hyper-params from PSO . . .
Model: /pso_Data_Class_decision_tree.pkl exported !
Model: /pso_Data_Class_random_forest.pkl exported !
Done ! 
creating model using hyper-params from SMAC . . .
Model: /smac_Data_Class_decision_tree.pkl exported !
Model: /smac_Data_Class_random_forest.pkl exported !
Done ! 
-------------------------- Code smell : Feature_Envy 