In [1]:
import sys
sys.path.append('../')
from packages.hpo import bo_tpe, pso, smac, export_hpo_compute_time
from packages.train import train_baseline, train_optimize
from packages.utils import read_QC_train_test, export_report

### Load Qualitas Corpus dataset

In [2]:
# Load both Qualitas Corpus splits with the project loader (train first, then
# test, so the loader's status messages keep their order). Each result is a
# mapping keyed by code-smell name, as shown by the printed keys.
train_data, test_data = (read_QC_train_test(split) for split in ('train', 'test'))
print(train_data.keys())

All QC training datasets are loaded !
All QC testing datasets are loaded !
dict_keys(['DATA_CLASS', 'FEATURE_ENVY', 'GOD_CLASS', 'LONG_METHOD'])


In [3]:
# Quick look at one representative training set (DATA_CLASS): schema, shape
# and the first few rows.
data_class_train = train_data['DATA_CLASS']

# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
data_class_train.info()
print('\n===========\n Dataset shape: ',data_class_train.shape,'\n===========\n')

# Last expression of the cell: rendered by the notebook as a rich table.
data_class_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 62 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   NOII_type                                         262 non-null    int64  
 1   NOAM_type                                         262 non-null    int64  
 2   NOCS_type                                         262 non-null    int64  
 3   NOM_type                                          262 non-null    int64  
 4   NMO_type                                          262 non-null    int64  
 5   ATFD_type                                         262 non-null    int64  
 6   FANOUT_type                                       262 non-null    int64  
 7   NOMNAMM_type                                      262 non-null    int64  
 8   NOA_type                                          262 non-null    int64  
 9   NIM_type             

Unnamed: 0,NOII_type,NOAM_type,NOCS_type,NOM_type,NMO_type,ATFD_type,FANOUT_type,NOMNAMM_type,NOA_type,NIM_type,...,number_not_abstract_not_final_methods,number_static_methods,number_final_static_methods,number_final_not_static_methods,number_not_final_static_methods,number_not_final_not_static_methods,number_standard_design_methods,number_constructor_DefaultConstructor_methods,number_constructor_NotDefaultConstructor_methods,is_smell
0,0,5,0,7,0,0,0,2,4,15,...,7,0,0,0,0,7,0,0,2,True
1,0,0,0,7,1,1,3,7,3,25,...,7,0,0,0,0,7,7,0,0,False
2,0,0,0,1,0,0,1,1,0,0,...,1,1,0,0,1,0,1,0,0,False
3,0,0,0,2,1,0,0,2,2,0,...,3,0,0,0,0,3,2,1,0,False
4,0,0,0,5,0,0,2,5,0,45,...,5,0,0,0,0,5,4,0,1,False


### Train and optimize

In [4]:
# Experiment configuration read by the train/optimize cell that follows.

# Random seed. NOTE(review): no cell visible here references SEED -- confirm it
# is actually consumed somewhere, otherwise the runs are not seeded by it.
SEED = 42
# Forwarded as `n_splits` / `n_repeats` to train_baseline() and train_optimize()
# (presumably repeated k-fold cross-validation -- confirm in packages.train).
num_folds = 10
n_repeats = 10

# Passed as `max_evals` to each optimizer call (bo_tpe, pso, smac).
max_evals = 50
# Number of repetitions of the whole experiment loop (`range(iters)`); each
# repetition exports its own report files suffixed with the run index.
iters = 10

In [5]:

# Repeat the full experiment `iters` times. One run = for every code smell:
# train baselines, search hyper-parameters with three optimizers, retrain with
# the configurations found, score everything on the test split; then export
# the run's reports under file names suffixed with the run index.
for run_id in range(iters):
    # Fresh per-run containers, all keyed by code-smell name.
    baseline_models, bo_models, pso_models, smac_models = {}, {}, {}, {}
    bo_best, pso_best, smac_best = {}, {}, {}
    bo_ct, pso_ct, smac_ct = {}, {}, {}

    for smell in train_data:
        train_df = train_data[smell]
        test_df = test_data[smell]

        # The last column is used as the target; all other columns are features.
        X_train = train_df.drop(train_df.columns[-1], axis=1)
        y_train = train_df[train_df.columns[-1]]

        X_test = test_df.drop(test_df.columns[-1], axis=1)
        y_test = test_df[test_df.columns[-1]]

        print("-------------------------- Code smell : {} --------------------------".format(smell))

        # Reference models trained without hyper-parameter optimization.
        baseline_models[smell] = train_baseline(
            X_train, y_train,
            smell=smell, n_splits=num_folds, n_repeats=n_repeats,
        )

        # Run all three searches before retraining anything. Each call is
        # unpacked into a configuration (later passed as `conf`) and a value
        # that is exported as HPO compute time at the end of the run.
        bo_best[smell], bo_ct[smell] = bo_tpe(X_train, y_train, max_evals=max_evals)
        pso_best[smell], pso_ct[smell] = pso(X_train, y_train, max_evals=max_evals)
        smac_best[smell], smac_ct[smell] = smac(X_train, y_train, max_evals=max_evals)

        # Create the optimized models, one family per optimizer (bo, pso, smac).
        for hpo_name, best_conf, tuned_models in (
            ("bo", bo_best, bo_models),
            ("pso", pso_best, pso_models),
            ("smac", smac_best, smac_models),
        ):
            tuned_models[smell] = train_optimize(
                X_train, y_train,
                conf=best_conf[smell], smell=smell, hpo=hpo_name,
                n_splits=num_folds, n_repeats=n_repeats,
            )

        # Score entries 0 and 1 of every family on the held-out test data.
        # NOTE(review): presumably decision tree and random forest, judging by
        # the export log -- confirm the ordering in packages.train.
        for model_idx in range(2):
            for family in (baseline_models, bo_models, pso_models, smac_models):
                family[smell][model_idx].set_prediction_score(X_test, y_test)

    # Export this run's optimizer timings, then one report per model family.
    export_hpo_compute_time(
        bo_ct, pso_ct, smac_ct,
        files_path='../reports/qc/hpo_computational_time_{}.csv'.format(run_id),
    )

    for family, report_name in (
        (baseline_models, '/qc/baseline_{}.csv'),
        (bo_models, "/qc/bo_{}.csv"),
        (pso_models, "/qc/pso_{}.csv"),
        (smac_models, "/qc/smac_{}.csv"),
    ):
        export_report(family, report_name.format(run_id))

-------------------------- Code smell : DATA_CLASS --------------------------
creating baseline . . .
Model: baseline_DATA_CLASS_decision_tree.pkl exported !
Model: baseline_DATA_CLASS_random_forest.pkl exported !
Done ! 
100%|██████████| 50/50 [00:02<00:00, 17.96trial/s, best loss: -1.0]               
100%|██████████| 50/50 [00:32<00:00,  1.52trial/s, best loss: -1.0]              
Done !

Optmize using PSO . . .  Done !

Optmize using SMAC . . .  Done !
creating model using hyper-params from BO . . .
Model: bo_DATA_CLASS_decision_tree.pkl exported !
Model: bo_DATA_CLASS_random_forest.pkl exported !
Done ! 
creating model using hyper-params from PSO . . .
Model: pso_DATA_CLASS_decision_tree.pkl exported !
Model: pso_DATA_CLASS_random_forest.pkl exported !
Done ! 
creating model using hyper-params from SMAC . . .
Model: smac_DATA_CLASS_decision_tree.pkl exported !
Model: smac_DATA_CLASS_random_forest.pkl exported !
Done ! 
-------------------------- Code smell : FEATURE_ENVY ---------