## Preparing algorithm setup


In [51]:
import pandas as pd
from Annealer import Annealer
import pickle
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Location of the processed breast-cancer training CSV.
PATH = "../data_processed/breast-cancer-diagnostic.shuf.lrn.csv"

# Load the pickled column-type description that accompanies the CSV.
# The context manager closes the handle as soon as the object is read, so the
# notebook does not keep an open file descriptor for the life of the kernel.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('../data_processed/breast-cancer_column_types.pkl', 'rb') as feature_structure_file:
    feature_structure = pickle.load(feature_structure_file)

# loading the data from the csv files
def load_data(dataset_path, column_types_path):
    """Read a dataset CSV together with its pickled column-type description.

    Parameters
    ----------
    dataset_path
        Path of the CSV file, passed straight to ``pd.read_csv``.
    column_types_path
        Path of the pickle file holding the column-type object.

    Returns
    -------
    tuple
        ``(frame, column_types)``: the DataFrame read from the CSV and the
        object unpickled from ``column_types_path``.
    """
    frame = pd.read_csv(dataset_path)
    # NOTE: unpickling can run arbitrary code; only use trusted files.
    with open(column_types_path, 'rb') as handle:
        column_types = pickle.load(handle)
    return frame, column_types

# Read the CSV at PATH into a DataFrame.
# NOTE(review): the name `data` is rebound to an (X, y) tuple in a later cell,
# so this frame is only reachable until that cell runs.
data = pd.read_csv(PATH)

# Default-configured SVC instance; not referenced by any other cell visible here.
s = SVC()

# Candidate hyperparameter values, keyed by argument name.
# NOTE(review): the keys match SVC constructor arguments, so this is presumably
# the SVC search space; no visible cell consumes it.
param_grid1 = {
    "coef0": [0.0, 0.5, 1.0, 2.0, 2.5, 3.0],
    "degree": [2, 3, 4, 5, 6],
    "gamma": ["scale", "auto"],
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "C": [0.1, 1, 10, 100, 200],
}

               

# Candidate hyperparameter values, keyed by argument name. Used below as the
# `search_space` of the annealers built around RandomForestClassifier.
param_grid2 = {
    "n_estimators": [3, 5, 10, 20, 30, 50, 75, 100, 150, 200],
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [None, 1, 2, 3, 4, 5, 7, 10, 15, 20, 50],
    "min_samples_split": [2, 3, 4, 5, 7, 10, 15, 20],
    "min_samples_leaf": [1, 2, 3, 4, 5, 7, 10],
    "max_features": [None, "sqrt", "log2"],
    "max_leaf_nodes": [None, 10, 20, 30, 40, 50, 100],
    "min_impurity_decrease": [0.0, 0.01, 0.02, 0.03, 0.05, 0.1],
    "class_weight": [None, "balanced"],
}

# Datasets used in this notebook. Each entry pairs a CSV file with the pickle
# describing its column types. Commented-out entries are currently disabled;
# the positional indices used below count the active entries only.
datasets = [
    # {
    #     "dataset_path": "./data/breast_cancer/breast-cancer-diagnostic.shuf.lrn.csv",
    #     "column_types_path": "./data/breast_cancer/breast-cancer_column_types.pkl",
    # },
    {
        "dataset_path": "./data/alzheimer/alzheimers_prediction_dataset.csv",
        "column_types_path": "./data/alzheimer/alzheimer_dataset.pkl",
    },
    # {
    #     "dataset_path": "./data/placement/placementdata.csv",
    #     "column_types_path": "./data/placement/placement_metadata.pkl",
    # },
    {
        "dataset_path": "./data/congress_voting/CongressionalVotingID.shuf.lrn.csv",
        "column_types_path": "./data/congress_voting/congressional_voting.pkl",
    },
]

# The entry keys are exactly load_data's parameter names, so each dict can be
# unpacked straight into the call.
alzheimer_data, alzheimer_feature_structure = load_data(**datasets[0])
congress_data, congress_feature_structure = load_data(**datasets[1])

In [22]:
# Scratch check: does the congress-voting column-type pickle exist on disk?
# NOTE(review): this probes "congressional-voting.pkl" (hyphen), while the
# `datasets` list loads "congressional_voting.pkl" (underscore) — confirm which
# filename is the intended one.
from pathlib import Path
p1 = Path("./data/congress_voting/congressional-voting.pkl")
# Last expression of the cell: its boolean result is shown as the cell output.
p1.exists()

True

In [2]:
# Feature columns, concatenated in the order bin, cat, cont, ord
# (presumably binary / categorical / continuous / ordinal — confirm against
# the code that writes the column-type pickle).
alzheimer_features = (
    alzheimer_feature_structure['bin']
    + alzheimer_feature_structure['cat']
    + alzheimer_feature_structure['cont']
    + alzheimer_feature_structure['ord']
)

# (features, target) pair, each with a fresh 0..n-1 index.
data = (
    alzheimer_data[alzheimer_features].reset_index(drop=True),
    alzheimer_data[alzheimer_feature_structure['target']].reset_index(drop=True),
)
data

(       Gender Family History of Alzheimers  Education Level Employment Status  \
 0        Male                           No                1           Retired   
 1        Male                           No                7        Unemployed   
 2      Female                           No               19          Employed   
 3        Male                           No               17           Retired   
 4      Female                           No                3          Employed   
 ...       ...                          ...              ...               ...   
 74278  Female                           No                3        Unemployed   
 74279    Male                           No               18        Unemployed   
 74280  Female                          Yes               13          Employed   
 74281  Female                           No                7          Employed   
 74282  Female                           No                1        Unemployed   
 
       Marital

In [3]:
# Build the annealer for the Alzheimer data: a default RandomForestClassifier,
# the dataset's column-type description, the random-forest search space
# (param_grid2), an iteration cap of 100 and the (features, target) tuple.
# NOTE(review): no random seed is set on the classifier here, so scores may
# differ between runs — confirm whether Annealer seeds anything internally.
Annealer_RF = Annealer(RandomForestClassifier(),
                        feature_structure=alzheimer_feature_structure,
                        search_space=param_grid2,
                        max_iter=100,
                        data=data)

In [10]:
# Fit the annealer's preprocessing step on the feature frame (first element of `data`).
Annealer_RF.preprocessor_step.fit(data[0])

In [9]:
# Fit the annealer's preprocessing step on the feature frame (first element of `data`).
# A stray "data0" prefix used to turn this statement into a lookup of the
# undefined name `data0Annealer_RF`, which raises NameError.
# NOTE(review): this repeats the fit performed in the neighbouring cell; one of
# the two cells can be dropped.
Annealer_RF.preprocessor_step.fit(data[0])


In [11]:
# Apply the fitted preprocessing step to the feature frame; the resulting
# array is the cell's displayed output.
Annealer_RF.preprocessor_step.transform(data[0])

array([[0., 1., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.]])

In [12]:
# Run the annealing search on the Alzheimer data. The meaning of the two
# positional arguments is defined by Annealer and is not visible in this notebook.
# The second argument is 0.01, the same value as the literal 10e-03.
# NOTE(review): confirm that 0.01 is intended rather than 1e-3.
Annealer_RF.simulation_annealing_fast(1, 0.01)

Start SA with 100 iterations
Finished 101 SA iterations with best_score=0.7206493177881154


({'n_estimators': 150,
  'criterion': 'gini',
  'max_depth': 3,
  'min_samples_split': 7,
  'min_samples_leaf': 3,
  'max_features': None,
  'max_leaf_nodes': 100,
  'min_impurity_decrease': 0.01,
  'class_weight': 'balanced'},
 np.float64(0.7206493177881154),
 317.0343408584595)

In [52]:
# Feature columns, concatenated in the order bin, cat, cont, ord
# (presumably binary / categorical / continuous / ordinal — confirm against
# the code that writes the column-type pickle).
congress_features = (
    congress_feature_structure['bin']
    + congress_feature_structure['cat']
    + congress_feature_structure['cont']
    + congress_feature_structure['ord']
)

# (features, target) pair, each with a fresh 0..n-1 index.
data1 = (
    congress_data[congress_features].reset_index(drop=True),
    congress_data[congress_feature_structure['target']].reset_index(drop=True),
)

# Annealer for the congress-voting data: default random forest, the
# random-forest search space and an iteration cap of 100.
Annealer_RF2 = Annealer(
    RandomForestClassifier(),
    feature_structure=congress_feature_structure,
    search_space=param_grid2,
    max_iter=100,
    data=data1,
)

In [53]:
# Fit the annealer's preprocessing step on the congress feature frame (first element of `data1`).
Annealer_RF2.preprocessor_step.fit(data1[0])

In [54]:
# Run the annealing search on the congress-voting data with the method's
# default arguments (the Alzheimer run passes two explicit values instead).
# NOTE(review): the recorded best_score is exactly 1.0 — confirm that no
# identifier or target-derived column is among the features.
Annealer_RF2.simulation_annealing_fast()

Start SA with 100 iterations
Finished 100 SA iterations with best_score=1.0


({'n_estimators': 30,
  'criterion': 'entropy',
  'max_depth': 50,
  'min_samples_split': 4,
  'min_samples_leaf': 7,
  'max_features': 'log2',
  'max_leaf_nodes': None,
  'min_impurity_decrease': 0.03,
  'class_weight': 'balanced'},
 np.float64(1.0),
 12.465601921081543)