# Using auto-sklearn for numerical data

In this workbook, we use auto-sklearn, an automated machine learning (AutoML) toolkit, to search for the best model (or ensemble of models) for the defect data from the previous chapters.

In [1]:
!pip install -q auto-sklearn

In [3]:
!pip install -q openpyxl

In [4]:
# Load the defect dataset from the Excel workbook (pandas reads .xlsx via openpyxl).
import pandas as pd

# The 'camel_1_2' sheet holds the defect data used in this notebook;
# the first column of the sheet becomes the index of the data frame.
dfDataCamel12 = pd.read_excel(
    './chapter_6_dataset_numerical.xlsx',
    sheet_name='camel_1_2',
    index_col=0,
)

In [5]:
# Show the loaded frame: one row per class, six metric columns
# (CBO, DCC, ExportCoupling, ImportCoupling, NOM, WMC) and the Defect label.
dfDataCamel12

Unnamed: 0_level_0,CBO,DCC,ExportCoupling,ImportCoupling,NOM,WMC,Defect
ClassName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
org.apache.camel.AlreadyStoppedException,1,0,1,0,0,0,1
org.apache.camel.AsyncCallback,8,0,12,0,1,1,1
org.apache.camel.AsyncProcessor,10,2,12,2,1,1,1
org.apache.camel.Body,0,0,0,0,0,0,0
org.apache.camel.CamelContext,27,9,47,9,17,17,1
...,...,...,...,...,...,...,...
org.apache.camel.util.jndi.JndiContext,4,1,1,3,35,73,1
org.apache.camel.view.GraphGeneratorSupport,9,4,2,8,10,23,0
org.apache.camel.view.NodeData,8,0,3,5,2,6,1
org.apache.camel.view.RouteDotGenerator,5,4,0,6,5,7,0


In [6]:
# Split the data into features/target and into training/test partitions.
import sklearn.model_selection

# Target: the Defect column; features: every remaining column.
y = dfDataCamel12['Defect']
X = dfDataCamel12.drop(columns=['Defect'])

# 90% of the rows go to training, the rest to testing; the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X,
    y,
    random_state=42,
    train_size=0.9,
)

In [12]:
# Let auto-sklearn search for pipelines and combine them into an ensemble.
# The classifier is created without arguments, so every setting (search
# budget, metric, seed, ...) is the library default.
# NOTE(review): with the default budget this cell can run for a long time --
# confirm the default for the installed auto-sklearn version.
import autosklearn.classification
cls = autosklearn.classification.AutoSklearnClassifier()
# run the model search on the training partition
cls.fit(X_train, y_train)
# predict the Defect label for the held-out test partition
predictions = cls.predict(X_test)

In [13]:
# Leaderboard of the models that ended up in the ensemble: rank, ensemble
# weight, model type, cost and fit duration.
# NOTE(review): cost appears to be 1 - validation accuracy (0.209 here vs. a
# best validation score of 0.791 in the statistics below) -- confirm.
cls.leaderboard()

Unnamed: 0_level_0,rank,ensemble_weight,type,cost,duration
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
431,1,0.02,random_forest,0.209091,1.022652
473,29,0.02,random_forest,0.209091,1.185152
498,28,0.02,random_forest,0.209091,1.082618
511,27,0.02,random_forest,0.209091,0.994516
518,26,0.02,random_forest,0.209091,1.047253
529,25,0.02,random_forest,0.209091,1.005801
530,24,0.02,random_forest,0.209091,0.994972
552,23,0.04,random_forest,0.209091,1.000448
571,22,0.02,random_forest,0.209091,0.987272
598,21,0.04,random_forest,0.209091,0.97698


In [14]:
# show_models() returns a dictionary keyed by model id that describes each
# ensemble member: rank, cost, ensemble weight and the pipeline steps
# (data preprocessor, balancing, feature preprocessor, classifier) together
# with the fitted scikit-learn estimator and its hyperparameters.
ensemble_dict = cls.show_models()
print(ensemble_dict)

{431: {'model_id': 431, 'rank': 1, 'cost': 0.2090909090909091, 'ensemble_weight': 0.02, 'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f438b5c4160>, 'balancing': Balancing(random_state=1, strategy='weighting'), 'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f43893eb6a0>, 'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f43893eb070>, 'sklearn_classifier': RandomForestClassifier(criterion='entropy', max_features=3, min_samples_leaf=2,
                       min_samples_split=14, n_estimators=512, n_jobs=1,
                       random_state=1, warm_start=True)}, 473: {'model_id': 473, 'rank': 2, 'cost': 0.2090909090909091, 'ensemble_weight': 0.02, 'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f438b5c4f10>, 'balancing': Balancing(random_state=1, strateg

In [16]:
# Accuracy of the ensemble's predictions on the held-out test partition.
# sklearn.metrics is imported explicitly here: only sklearn.model_selection is
# imported elsewhere in the notebook, and relying on another import to load
# the metrics submodule as a side effect is fragile.
import sklearn.metrics

print(f"Accuracy score {sklearn.metrics.accuracy_score(y_test, predictions):.2f}")

Accuracy score 0.59


In [19]:
# Print auto-sklearn's summary of the search: the optimised metric, the best
# validation score, and how many candidate runs succeeded, crashed, or hit
# the time/memory limits.
print(cls.sprint_statistics())

auto-sklearn results:
  Dataset name: 4b131006-f653-11ed-814a-00155de31e8a
  Metric: accuracy
  Best validation score: 0.790909
  Number of target algorithm runs: 1273
  Number of successful target algorithm runs: 1214
  Number of crashed target algorithm runs: 59
  Number of target algorithms that exceeded the time limit: 0
  Number of target algorithms that exceeded the memory limit: 0

