In [1]:
from flaml.automl import AutoML
from sklearn.metrics import accuracy_score
import pickle

In [2]:
import numpy as np
from numpy import mean
from numpy import std
from tqdm.notebook import tqdm
import os
import pandas as pd

In [3]:
def read_and_norm(dataset_path, type_,type):
    """Read a whitespace-separated numeric text file and min-max scale it.

    The file is located at ``<dataset_path>/<type_>/<type>.txt``. Scaling uses
    the global minimum and maximum of the whole matrix (not per column), so
    every value ends up in ``[0, 1]``.

    Args:
        dataset_path: Folder of one dataset.
        type_: Sub-folder name inside ``dataset_path`` (callers in this file
            pass ``'train'`` or ``'test'``).
        type: File stem to read (callers in this file pass ``'shape'``,
            ``'el'`` or ``'dist'``). The name shadows the ``type`` builtin but
            is kept so existing callers keep working.

    Returns:
        numpy.ndarray: The scaled matrix. When every value in the file is
        identical the range is zero; an all-zero matrix is returned in that
        case instead of the NaNs a 0/0 division would produce.
    """
    with open('{}/{}/{}.txt'.format(dataset_path, type_,type), 'rb') as f:
        matrix = np.array([[float(x) for x in line.split()] for line in f])

    # ndarray.min()/max() without an axis already reduce over the whole array.
    min_m = matrix.min()
    max_m = matrix.max()
    value_range = max_m - min_m

    centered = matrix - min_m
    if value_range == 0:
        # Constant input: centered is already all zeros; dividing would give NaN.
        return centered
    return centered / value_range


In [4]:
def load_full_dataset(type_,dataset_path):
	"""Load one split of a dataset folder as a feature array plus labels.

	Reads ``classification.txt`` and ``hr.txt`` from
	``<dataset_path>/<type_>/`` and the ``shape``, ``el`` and ``dist``
	matrices through ``read_and_norm``. Lines of ``hr.txt`` that split into
	exactly 9 columns are discarded, and the rows at the same line indices are
	removed from the labels and from the three normalised matrices so all
	inputs stay aligned.

	Args:
		type_: Sub-folder name of the split (callers in this file pass
			``'train'`` or ``'test'``).
		dataset_path: Folder of one dataset.

	Returns:
		tuple: ``(data_X, classification)``. ``data_X`` is built by zipping
		the rows of shape, dist, el and hr (in that order) into one array;
		``classification`` is the label column with shape ``(n, 1)``.
	"""
	labels = np.loadtxt('{}/{}/classification.txt'.format(dataset_path, type_)).reshape(-1, 1)

	# Single pass over hr.txt: keep usable rows as floats and remember the
	# line indices of the 9-column rows that must be dropped everywhere else.
	hr_rows = []
	dropped_rows = []
	with open('{}/{}/hr.txt'.format(dataset_path, type_), "r") as hr_file:
		for row_index, line in enumerate(hr_file):
			fields = line.split()
			if len(fields) != 9:
				hr_rows.append([float(field) for field in fields])
			else:
				dropped_rows.append(row_index)
	hr = np.array(hr_rows)

	# Read all three normalised matrices first, then align them with hr.
	shape, el, dist = (
		read_and_norm(dataset_path, type_, name) for name in ('shape', 'el', 'dist')
	)
	labels, shape, el, dist = (
		np.delete(block, dropped_rows, 0) for block in (labels, shape, el, dist)
	)

	# One entry per sample, holding its shape / dist / el / hr rows.
	samples = np.array(list(zip(shape, dist, el, hr)))

	return (samples, labels)


In [5]:
# summarize scores
def summarize_results(scores):
	"""Return the arithmetic mean of ``scores``.

	Args:
		scores: Sequence of numeric scores.

	Returns:
		The mean of ``scores`` as computed by ``numpy.mean``.
	"""
	# The standard deviation used to be computed here and discarded; only the
	# mean is part of the result, so that is all that is calculated now.
	return mean(scores)

In [6]:
def evaluate_model_automl(train_x, train_y, test_x, test_y, log_file_path, time_limit):
    """Fit an AutoML classifier under a time budget and score it on test data.

    Args:
        train_x: Training features passed to ``AutoML.fit`` as ``X_train``.
        train_y: Training labels passed to ``AutoML.fit`` as ``y_train``.
        test_x: Features to predict on after fitting.
        test_y: Ground-truth labels compared against the predictions.
        log_file_path: Value forwarded as the ``log_file_name`` setting.
        time_limit: Time budget in seconds forwarded as ``time_budget``.

    Returns:
        tuple: ``(accuracy, model)`` where ``accuracy`` is the result of
        ``accuracy_score(test_y, predictions)`` and ``model`` is the fitted
        AutoML object's ``model`` attribute.
    """
    # Learners the search is restricted to.
    learners = ["xgboost","rf","extra_tree","xgb_limitdepth","sgd","catboost","lrl1"]

    searcher = AutoML()
    searcher.fit(
        X_train=train_x,
        y_train=train_y,
        time_budget=time_limit,  # seconds
        task="classification",
        metric="accuracy",
        log_file_name=log_file_path,
        estimator_list=learners,
    )

    predictions = searcher.predict(test_x)
    return accuracy_score(test_y, predictions), searcher.model


In [7]:
# Column-oriented accumulator: run_experiment appends one value per dataset to
# each list, and the dict is later turned into a DataFrame.
results = {column: [] for column in ("File", "Scores", "Best_Accuracy")}

# run an experiment
def run_experiment(repeats=1, time_budgets=(30, 100, 300, 500)):
	"""Run the AutoML sweep over every dataset folder and record the scores.

	For each sub-folder of ``../generated_data/training_folders`` the train
	and test splits are loaded, each sample is flattened to one feature row,
	and ``evaluate_model_automl`` is run once per entry of ``time_budgets``.
	The per-budget accuracies (in percent) and their mean are appended to the
	module-level ``results`` dict, and the running table is written to
	``../static/phase_3_results.csv`` after every dataset.

	Args:
		repeats: Currently unused; kept so existing calls keep working.
		time_budgets: Iterable of time limits in seconds, one AutoML run per
			entry. Defaults to the budgets previously hard-coded here.

	Side effects:
		Creates the model, log and CSV output folders if they are missing,
		writes one ``<dataset>.pkl`` per dataset, appends to ``results`` and
		prints the best row so far after each dataset.
	"""
	save_model_dir = "../models/3rd_phase"
	# Specify log file directory and name
	log_dir = "flaml_logs"
	results_csv = "../static/phase_3_results.csv"

	directory = "../generated_data/training_folders"

	# Create every output location up front so a missing folder cannot abort
	# the run after a long fit has already completed.
	for out_dir in (save_model_dir, log_dir, os.path.dirname(results_csv)):
		os.makedirs(out_dir, exist_ok=True)

	# Only sub-folders are datasets; stray files in the directory would make
	# load_full_dataset fail.
	dataset_names = [
		name for name in os.listdir(directory)
		if os.path.isdir(os.path.join(directory, name))
	]

	for file_path in tqdm(dataset_names):
		dataset_path = directory + "/" + file_path
		log_file_path = log_dir + "/" + file_path + ".log"

		# load data
		trainX, trainy = load_full_dataset('train', dataset_path)
		testX, testy = load_full_dataset('test', dataset_path)

		# Flatten each sample to a single feature row. The sizes are derived
		# from the data instead of being fixed to 2990/1282 rows of 40 values,
		# so datasets with a different number of samples load as well.
		train_x = trainX.reshape(len(trainX), -1)
		train_y = trainy.reshape(-1)
		test_x = testX.reshape(len(testX), -1)
		test_y = testy.reshape(-1)

		# one AutoML run per time budget
		scores = list()
		model_path = os.path.join(save_model_dir, file_path + ".pkl")
		for time_limit in time_budgets:
			score, model = evaluate_model_automl(train_x, train_y, test_x, test_y, log_file_path, time_limit)

			scores.append(score * 100.0)

			# Every budget writes to the same path, so the file left on disk
			# is the model from the last budget in time_budgets.
			with open(model_path, "wb") as f:
				pickle.dump(model, f)

		# summarize_results returns the mean over the budgets; the column is
		# still called "Best_Accuracy" to keep the results schema unchanged.
		mean_score = summarize_results(scores)

		# save it to dataframe
		results["File"].append(file_path)
		results["Scores"].append(scores)
		results["Best_Accuracy"].append(mean_score)
		df = pd.DataFrame(results)
		max_accuracy_row = df.loc[df['Best_Accuracy'].idxmax()]
		print(max_accuracy_row)
		# Checkpoint after every dataset; the file is written without a header row.
		df.to_csv(results_csv, header=False)

# Run the sweep with default arguments; this appends to the module-level
# `results` dict defined at the top of this cell.
run_experiment()

# Build the summary table from the accumulated results, report its size and
# show the first rows as the cell output.
df = pd.DataFrame(results)
print(df.shape)
df.head()

  0%|          | 0/28 [00:00<?, ?it/s]

[flaml.automl.logger: 03-22 22:53:44] {1728} INFO - task = classification
[flaml.automl.logger: 03-22 22:53:44] {1739} INFO - Evaluation method: holdout
[flaml.automl.logger: 03-22 22:53:44] {1838} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 03-22 22:53:44] {1955} INFO - List of ML learners in AutoML Run: ['xgboost', 'rf', 'extra_tree', 'xgb_limitdepth', 'sgd', 'catboost', 'lrl1']
[flaml.automl.logger: 03-22 22:53:44] {2258} INFO - iteration 0, current learner xgboost
[flaml.automl.logger: 03-22 22:53:44] {2393} INFO - Estimated sufficient time budget=253s. Estimated necessary time budget=4s.
[flaml.automl.logger: 03-22 22:53:44] {2442} INFO -  at 0.0s,	estimator xgboost's best error=0.2558,	best estimator xgboost's best error=0.2558
[flaml.automl.logger: 03-22 22:53:44] {2258} INFO - iteration 1, current learner sgd
[flaml.automl.logger: 03-22 22:53:44] {2442} INFO -  at 0.2s,	estimator sgd's best error=0.2857,	best estimator xgboost's best error=0.2558
[flaml.aut

Unnamed: 0,File,Scores,Best_Accuracy
0,EISD840101,"[73.71294851794072, 75.19500780031201, 75.9750...",75.214509
1,KIDA850101,"[74.49297971918877, 74.25897035881435, 75.5070...",74.863495
2,GOLD730101,"[74.88299531981279, 75.27301092043682, 75.1950...",75.136505
3,CASG920101,"[72.46489859594384, 75.19500780031201, 74.4929...",74.278471
4,FASG890101,"[73.32293291731669, 74.88299531981279, 75.6630...",74.882995
