<a href="https://colab.research.google.com/github/ShaswataJash/Satlike/blob/main/RL_in_SATLike_tuning_with_formula_invariant_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download Weighted MaxSAT formulas

In [None]:
# Install gdown (pinned to 3.6.4); the next cell uses it to fetch the benchmark archives from Google Drive.
!pip install gdown==3.6.4

In [None]:
# Fetch the two benchmark archives from the author's personal Google Drive.
import gdown

# (Drive file id, local archive name) in download order.
BENCHMARK_ARCHIVES = (
    ('1MO34-v5jO2FlgDjyTjIpuMznOTkWYLY0', 'mse17-incomplete-weighted-benchmarks.zip'),  #418MB
    ('1kBVV3VFQXFPyVnu4jmtQRJz6SH5PBuFC', 'ms18_incomplete_wt.zip'),  #780MB
)
for drive_file_id, archive_name in BENCHMARK_ARCHIVES:
    gdown.download('https://drive.google.com/uc?id=' + drive_file_id, archive_name, quiet=False)


In [None]:
import shutil
import time

# Unpack both benchmark archives into the current directory, timing each one
# (a single archive may take more than 30 seconds).
for archive_name in ('mse17-incomplete-weighted-benchmarks.zip', 'ms18_incomplete_wt.zip'):
    start_time_of_unpacking = time.time()
    shutil.unpack_archive(archive_name, '.')
    print("total time taken for unpacking = %s in seconds" % (time.time() - start_time_of_unpacking))

In [None]:
import glob
import pprint

# Root directories of the two unpacked benchmark sets.
MSE17_ROOT = '/content/mse17-incomplete-weighted-benchmarks'
MSE18_ROOT = '/content/maxsat_instances/ms_evals/MS18/mse18-incomplete-weighted-benchmarks'

# Recursively collect every gzipped WCNF formula under each root.
formula_files17 = glob.glob('%s/**/*.wcnf.gz' % MSE17_ROOT, recursive=True)
#pprint.pprint(formula_files17)
print("total formula17 file count=%s" % (len(formula_files17)))

formula_files18 = glob.glob('%s/**/*.wcnf.gz' % MSE18_ROOT, recursive=True)
#pprint.pprint(formula_files18)
print("total formula18 file count=%s" % (len(formula_files18)))

# G++ version upgrade to G++8

In [None]:
# Refresh the package index, then install the GCC/G++ 8 toolchain.
# -y pre-answers apt's confirmation prompt so the cell cannot stall or abort under "Run all".
!sudo apt update
!sudo apt-get install -y gcc-8 g++-8

In [None]:
# Register gcc-8 (priority 80) and gcc-7 (priority 70) as alternatives for /usr/bin/gcc,
# each with its matching g++ as a slave link so gcc and g++ always switch together.
!sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 80 --slave /usr/bin/g++ g++ /usr/bin/g++-8 
!sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 70 --slave /usr/bin/g++ g++ /usr/bin/g++-7 

In [None]:
# Same registration for the x86_64-linux-gnu-gcc / x86_64-linux-gnu-g++ names.
# NOTE(review): presumably needed because the Python extension build invokes these names - confirm.
!sudo update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-8 80 --slave /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-8
!sudo update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-7 70 --slave /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-7

In [None]:
# Print the compiler version behind each name to confirm which alternative is active.
!gcc --version
!x86_64-linux-gnu-gcc --version

# Satlike Git code download

In [None]:
# Remove any previous checkout so the clone starts clean, then work from inside the repository.
!rm -Rf Satlike
!git clone https://github.com/ShaswataJash/Satlike.git
%cd /content/Satlike/

# Compiling the Satlike executable with sanitizer protection to find memory-corruption issues

In [None]:
# Ensure libtcmalloc.so.4 is not preloaded; if it is, ASan cannot instrument malloc and free.
# LD_PRELOAD is printed before and after clearing it so the change is visible in the output.
!echo $LD_PRELOAD
%env LD_PRELOAD=
!echo $LD_PRELOAD

In [None]:
# Build pms.cpp into pms.out with debug info, stack protection, _FORTIFY_SOURCE=2 and the
# address, pointer-compare, pointer-subtract and undefined-behaviour sanitizers enabled.
!g++ -O2 -Wall -g -fstack-protector-strong -D_FORTIFY_SOURCE=2 pms.cpp -o pms.out -fsanitize=address -fsanitize=pointer-compare -fsanitize=pointer-subtract -fsanitize=undefined

In [None]:
# List the shared libraries pms.out links against.
!ldd pms.out #verify that libasan is in order before any tcmalloc lib 

In [None]:
#refer: https://github.com/google/sanitizers/wiki/AddressSanitizerFlags#run-time-flags
#refer: https://gcc.gnu.org/onlinedocs/gcc/Instrumentation-Options.html
# Run-time ASan settings: verbose sanitizer logging plus detect_invalid_pointer_pairs, the run-time
# switch for the pointer-compare / pointer-subtract checks compiled in above (see the GCC page linked).
%env ASAN_OPTIONS=verbosity=2:detect_invalid_pointer_pairs=2

In [None]:
import os
import re
import gzip
import shutil

def decompress_formula(selected_formula_file, work_dir='/tmp'):
    """Decompress a gzipped formula file into ``work_dir`` and return the new path.

    The output keeps the input's base name with a trailing ``.gz`` (any case)
    removed, and its path is printed before the data is written.

    Args:
        selected_formula_file: path of the gzip-compressed formula.
        work_dir: directory that receives the decompressed copy; created if it
            does not exist. Defaults to ``/tmp``, the previously hard-coded location.

    Returns:
        Path of the decompressed file inside ``work_dir``.
    """
    os.makedirs(work_dir, exist_ok=True)
    filename = os.path.basename(selected_formula_file)
    filename = re.sub(r"\.gz$", "", filename, flags=re.IGNORECASE)
    decompressed_filename = os.path.join(work_dir, filename)
    print("selected_formula_file = %s" % (decompressed_filename))
    # Stream the data so large formulas are never held in memory in one piece.
    with gzip.open(selected_formula_file, 'rb') as f_in, open(decompressed_filename, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    return decompressed_filename


In [None]:
#testing single instance of formula execution
import random
# `formula_files` is never defined in this notebook; draw from both benchmark lists built earlier.
random_selected_formula_file = random.choice(formula_files17 + formula_files18)
# Keep the returned path so the binary runs on the file that was just decompressed
# (previously a fixed path was used, which might not exist).
decompressed_formula_file = decompress_formula(random_selected_formula_file)
#using sudo because asan lib will able to instrument printf only in priviledged mode
!sudo ./pms.out "{decompressed_formula_file}"

In [None]:
#testing all instances of formula execuation
import os
import subprocess
# `formula_files` is never defined in this notebook; iterate over both benchmark lists built earlier.
for f in formula_files17 + formula_files18:
    decomp_f = decompress_formula(f)
    # Argument list instead of a shell string: unusual characters in a file name cannot be
    # interpreted by the shell. check=True still aborts on the first sanitizer failure.
    print(subprocess.run(["sudo", "./pms.out", decomp_f], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE))
    # Delete the decompressed copy after a successful run so /tmp does not fill up;
    # a failing formula is left in place for debugging.
    os.remove(decomp_f)

# SWIG installation for python interface of C++ Satlike

In [None]:
# Download the SWIG 4.0.2 source tarball.
# NOTE(review): fetched over plain HTTP and later installed - consider verifying a checksum.
!wget http://prdownloads.sourceforge.net/swig/swig-4.0.2.tar.gz

In [None]:
# Decompress, then extract the SWIG sources into swig-4.0.2/.
!gunzip swig-4.0.2.tar.gz
!tar -xvf swig-4.0.2.tar

In [None]:
# Configure, build and install SWIG from source.
%cd swig-4.0.2/
!./configure
!make
!make install

In [None]:
# Back in the Satlike checkout: run SWIG on the interface file in C++/Python mode
# (setup.py below compiles the resulting satlikew_wrap.cxx), then show the interface file.
%cd /content/Satlike/
!swig -c++ -python satlikew.i
!cat satlikew.i

In [None]:
%%writefile satlikew.cpp

// Source file built together with the SWIG wrapper (see setup.py); it only pulls in the Satlike headers.
#include "basis_pms.h"
#include "pms.h"

In [None]:
%%writefile setup.py

#!/usr/bin/env python

"""
setup.py file for satlikew
"""

# distutils is deprecated (PEP 632) and no longer ships with Python 3.12+. setuptools exposes
# the same setup/Extension API, so prefer it and fall back only where it is unavailable.
try:
    from setuptools import setup, Extension
except ImportError:
    from distutils.core import setup, Extension

#https://stackoverflow.com/questions/1676384/how-to-pass-flag-to-gcc-in-python-setup-py-script
# The extension is built from the SWIG-generated wrapper plus the thin satlikew.cpp source.
satlikew_module = Extension('_satlikew',
                           sources=['satlikew_wrap.cxx', 'satlikew.cpp'],
                           # To build with sanitizers, pass each flag as its own list item:
                           #extra_compile_args=['-fsanitize=address', '-fsanitize=pointer-compare', '-fsanitize=pointer-subtract', '-fsanitize=leak', '-fsanitize=undefined'],
                           #extra_link_args=['-fsanitize=address', '-fsanitize=pointer-compare', '-fsanitize=pointer-subtract', '-fsanitize=leak', '-fsanitize=undefined'],
                           )

setup (name = 'satlikew',
       version = '0.1',
       author      = "Shaswata Jash",
       description = """To interact with SatLike3.0 algorithm from python""",
       ext_modules = [satlikew_module],
       py_modules = ["satlikew"],
       )

In [None]:
#refer http://www.swig.org/Doc4.0/Python.html#Python_nn20
#refer http://www.swig.org/Doc4.0/Python.html#Python_nn20
# Build the _satlikew extension next to the sources so `import satlikew` works from this directory.
!python setup.py build_ext --inplace

In [None]:
# Confirm the extension was built. The wildcard matches whatever ABI tag the running Python
# produces, instead of only the Python 3.6 name (cpython-36m).
!ls -l /content/Satlike/_satlikew*.so

In [None]:
# Show the shared libraries the extension links against. The wildcard matches whatever ABI tag
# the running Python produces, instead of only the Python 3.6 name (cpython-36m).
!ldd /content/Satlike/_satlikew*.so

In [None]:
import os
import satlikew
import time
import shutil
import re
import gzip

# Smoke test of the Python binding: run SATLike with fixed hyper-parameters on one MSE17 formula
# until the soft cost reaches zero or 60 seconds have passed.
satlike = satlikew.Satlike()
try:
    gzipped_file_name = '/content/mse17-incomplete-weighted-benchmarks/correlation-clustering-Rounded_CorrelationClustering_Vowel_BINARY_N760_D0.200.wcnf.gz'
    work_dir = '/tmp'
    # Output name = input base name without its trailing ".gz".
    filename = re.sub(r"\.gz$", "", os.path.basename(gzipped_file_name), flags=re.IGNORECASE)
    decompressed_filename = os.path.join(work_dir, filename)

    print("decompressed_filename = %s" % (decompressed_filename))
    #refer: https://stackoverflow.com/questions/55040442/how-to-register-gz-format-in-shutil-register-archive-format-to-use-same-format
    with gzip.open(gzipped_file_name, 'rb') as f_in, open(decompressed_filename, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    satlike.build_instance(decompressed_filename)
    satlike.algo_init(1, todebug=False) #no-randomization (i.e. fixed seed value of one)
    # Start one above the total soft weight so the first feasible assignment counts as an improvement.
    last_soft_unsat_weight = satlike.get_total_soft_weight() + 1
    start_time = time.time()
    search_finished = False
    while not search_finished:
        satlike.init_with_decimation_stepwise()
        current_step = 1
        while current_step < satlike.get_max_flips():
            current_step += 1
            satlike.local_search_stepwise(15, 1e-07, 300, 500, current_step, False)
            all_hard_clauses_satisfied = satlike.get_hard_unsat_nb() == 0
            if all_hard_clauses_satisfied and satlike.get_opt_unsat_weight() < last_soft_unsat_weight:
                print("opt_unsat_weight = %s time-taken in sec=%s" % (satlike.get_opt_unsat_weight(), time.time() - start_time))
                last_soft_unsat_weight = satlike.get_opt_unsat_weight()

            # Stop on an optimal (zero-cost) assignment or once the 60 second budget is spent.
            if last_soft_unsat_weight == 0 or time.time() - start_time > 60:
                search_finished = True
                break

finally:
    # Always release the native solver's memory, even if the search raised.
    satlike.free_memory()


# RAY(RLLib) CLUSTER SETUP

In [None]:
# Record the runtime user, working directory and Python version, then install Ray with RLlib.
# NOTE(review): ray is not version-pinned, so a re-run may install a different RLlib API - consider pinning.
!whoami
!pwd
!python -V
!pip install ray[rllib]

In [None]:
# Optional monitoring dependencies.
!pip install psutil #will be used by ray to print log_sys_usage
!pip install gputil

In [None]:
# Stop any running Ray instance and clear its temp files and earlier results.
!ray stop
!rm -Rf /tmp/ray
!rm -Rf ~/ray_results
# Recreate the scratch directory into which the RL environment decompresses formulas.
!rm -Rf /tmp/decompressed_formula
!mkdir /tmp/decompressed_formula
#!ray start --help
#Without explicit information about binding dashboard-host to 127.0.0.1, dashboard can't be connected in google-colab
!ray start --head --port=6379 --object-manager-port=8076 --include-dashboard True --dashboard-host 127.0.0.1 --dashboard-port 8265 &

In [None]:
from google.colab.output import eval_js
# Print the Colab proxy address for port 8265, the dashboard port chosen when starting Ray.
print(eval_js("google.colab.kernel.proxyPort(8265)"))

In [None]:
# Open TensorBoard inside the notebook on the ~/ray_results directory.
%load_ext tensorboard
%tensorboard --logdir ~/ray_results

## Reinforcement Learning - MaxSAT problem Environment

In [None]:
import numpy as np
import gym
from gym import spaces
from ray.tune.registry import register_env

import satlikew
import time
import shutil
import os
import re
import gzip
import sys

import random

NUMBER_OF_ENV_PER_WORKER = 8

# {x - (-1)} / {1 - (-1)} = {X - X_min} / {X_max - X_min}
# X = 1/2 * (x + 1) * (X_max - X_min) + X_min
def upscale(x, X_min, X_max):
    """Map ``x`` linearly from [-1, 1] onto [X_min, X_max].

    Raises AssertionError when ``x`` lies outside [-1, 1] or when
    ``X_min`` is not strictly below ``X_max``.
    """
    assert -1 <= x
    assert 1 >= x
    assert X_max > X_min
    target_width = X_max - X_min
    return 0.5 * (x + 1) * target_width + X_min


# {x - (-1)} / {1 - (-1)} = {X - X_min} / {X_max - X_min}
# x =  ({X - X_min} / {X_max - X_min}) * 2 - 1 
def downscale(X, X_min, X_max):
    """Map ``X`` linearly from [X_min, X_max] onto [-1, 1] (the inverse of ``upscale``).

    Raises AssertionError when ``X`` lies outside [X_min, X_max] or when
    ``X_min`` is not strictly below ``X_max``.
    """
    assert X >= X_min, "X=%s >= X_min=%s" % (X, X_min)
    assert X <= X_max, "X=%s <= X_max=%s" % (X, X_max)
    assert X_min < X_max, "X_min=%s < X_max=%s" % (X_min, X_max)
    fraction_of_range = (X - X_min) / (X_max - X_min)
    return fraction_of_range * 2 - 1

class SATLikeHyperParamTuneEnvFormulaInvariant(gym.Env):
    """Gym environment in which every RL step performs one SATLike local-search step.

    Action: Box(4,) in [-1, 1]; ``step`` rescales it to the four values passed to
    ``local_search_stepwise`` (t, sp, h_inc, eta).

    Observation: Box(2,) in [-1, 1], both entries formula-invariant fractions:
        [0] unsatisfied hard clauses / total hard clauses
        [1] satisfied soft weight / total soft weight

    Reward: in [-3, -2] while any hard clause is unsatisfied; otherwise the satisfied
    soft-weight fraction plus a bonus that shrinks as the episode progresses.

    ``algorithm_state``: 0 before the first ``reset``, 1 when the next call must
    (re)initialise the assignment via ``init_with_decimation_stepwise``, 2 while a
    local search is in progress.
    """

    def __init__(self, env_config):
        """Read the settings from ``env_config``.

        ``env_config`` must provide the keys read below and expose ``worker_index`` and
        ``vector_index`` as attributes. No formula is loaded until ``reset`` is called.
        """
        self.mode = env_config["mode"]
        self.verbose = env_config["verbose"]
        self.worker_index = env_config.worker_index
        self.vector_index = env_config.vector_index
        self.decompressed_filename = None
        self.list_of_formula_files = env_config["list_of_formula_files"]
        assert len(self.list_of_formula_files) > 0
        self.episode_time_in_sec = env_config["episode_time_in_sec"]
        assert self.episode_time_in_sec is not None and self.episode_time_in_sec > 0
        self.h_inc_eta_max = env_config["h_inc_eta_max"]
        assert self.h_inc_eta_max is not None and self.h_inc_eta_max >= 1
        self.formula_choice_random = env_config["formula_choice_random"]
        assert self.formula_choice_random is not None and isinstance(self.formula_choice_random, bool)
        self.horizon_steps = env_config["episode_horizon"]

        self.sat_like = None
        self.current_tries = -1
        self.current_step = -1
        self.algorithm_state = 0
        self.last_soft_unsat_weight = -1
        self.start_time = -1

        self.number_of_rl_timestep = 0
        self.total_reward = 0

        #1st dimension for t, 2nd dimension for sp, 3rd dimension for h_inc, 4th dimension for eta
        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32) #for normalization, values should be in between [-1,1]

        # Observation features are fractions so that they do not depend on the formula's size:
        # (a) the fraction of hard clauses unsatisfied by the current assignment, relative to the
        #     total number of hard clauses in the formula;
        # (b) the fraction of the total soft-clause weight that is currently satisfied.

        #for normalization, values should be in between [-1,1]
        self.observation_space = spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32)

    def __getObservationAndReward(self, r, sp, h_inc, eta):
        """Advance the solver by one step with the given hyper-parameters.

        Returns ``(obs_arr, reward)``: the normalised observation array and the scalar reward.
        """
        assert self.algorithm_state == 1 or self.algorithm_state == 2

        if self.verbose > 2: print("[W:%s, E:%s] r=%s sp=%s h_inc=%s eta=%s" % (self.worker_index, self.vector_index, r, sp, h_inc, eta))

        # State 1: begin a new try with a fresh decimation-based initial assignment.
        if self.algorithm_state == 1:
            self.current_tries += 1
            self.current_step = 0
            self.sat_like.init_with_decimation_stepwise()
            self.algorithm_state = 2

        if self.current_step < self.sat_like.get_max_flips():
            self.current_step += 1
            self.sat_like.local_search_stepwise(r, sp, h_inc, eta, self.current_step, False)

            if (self.sat_like.get_hard_unsat_nb() == 0) and (self.sat_like.get_opt_unsat_weight() < self.last_soft_unsat_weight):
                if self.verbose > 1: print("[W:%s, E:%s] rl-step=%s try=%s step=%s opt_unsat_weight = %s time-taken in sec=%s" 
                                           % (self.worker_index, self.vector_index, self.number_of_rl_timestep, self.current_tries, self.current_step, 
                                              self.sat_like.get_opt_unsat_weight(), time.time() - self.start_time))
                self.last_soft_unsat_weight = self.sat_like.get_opt_unsat_weight()
            else:
                if self.verbose > 2: print("[W:%s, E:%s] rl-step=%s try=%s step=%s hard_unsat_nb=%s soft_unsat_weight = %s" 
                                           % (self.worker_index, self.vector_index, self.number_of_rl_timestep, self.current_tries, self.current_step, 
                                              self.sat_like.get_hard_unsat_nb(), self.sat_like.get_soft_unsat_weight()))

            # A zero-cost assignment was found: the next call starts a new try.
            if 0 == self.last_soft_unsat_weight:
                self.algorithm_state = 1

        else:
            # Flip budget of this try is exhausted: the next call starts a new try.
            self.algorithm_state = 1

        assert self.sat_like.get_hard_unsat_nb() >= 0
        # The explicit zero also covers formulas without hard clauses (no division by zero).
        obs1 = 0 if self.sat_like.get_hard_unsat_nb() == 0 else (self.sat_like.get_hard_unsat_nb() / self.sat_like.get_num_hclauses())

        assert self.sat_like.get_soft_unsat_weight() >= 0 and (self.sat_like.get_total_soft_weight() >= self.sat_like.get_soft_unsat_weight()),  \
               "%s total_soft_weight=%s soft_unsat_weight=%s" % (self.decompressed_filename, self.sat_like.get_total_soft_weight() , self.sat_like.get_soft_unsat_weight())
        obs2 = (self.sat_like.get_total_soft_weight() - self.sat_like.get_soft_unsat_weight()) / self.sat_like.get_total_soft_weight()

        downscaled_obs1 = downscale(obs1, 0, 1.0) #rescale between [-1,1]
        downscaled_obs2 = downscale(obs2, 0, 1.0) #rescale between [-1,1]
        # Use the dtype declared by observation_space; a float64 array is rejected by Box.contains
        # implementations that also check the dtype.
        obs_arr = np.array([downscaled_obs1, downscaled_obs2], dtype=np.float32)
        assert obs_arr.shape == self.observation_space.shape
        assert np.max(obs_arr) <= 1.0
        assert np.min(obs_arr) >= -1.0

        if self.sat_like.get_hard_unsat_nb() > 0:
            # Infeasible assignment: penalty grows with the fraction of unsatisfied hard clauses.
            reward = -(obs1 + 2)
            assert -3 <= reward and reward <= -2
        else:
            #best positive reward can be +2 (positive reward has two parts - (a)unsat weight reduction (b)how fast reduction can be acheived)
            reward =  obs2 +  (self.horizon_steps - self.number_of_rl_timestep)/self.horizon_steps #reward according to best solution in self.EPISODE_TIME_IN_SEC sec
            assert 0 < reward and reward <= 2
        return obs_arr, reward

    def close(self):
        """Free the native solver and delete the decompressed formula; does nothing if no formula is loaded."""
        if self.sat_like is not None:
            if self.verbose > 0 :
                best_cost_desc = self.last_soft_unsat_weight if (self.last_soft_unsat_weight < self.sat_like.get_total_soft_weight()+1) else 'UNDETERMINED'
                # No step may have been taken yet (e.g. reset followed directly by close/reset);
                # report NaN instead of dividing by zero.
                avg_reward = (self.total_reward / self.number_of_rl_timestep) if self.number_of_rl_timestep > 0 else float('nan')
                print("[W:%s, E:%s] rl_step_count = %s avg_reward=%s best_cost=%s " % (self.worker_index, self.vector_index, self.number_of_rl_timestep, 
                      avg_reward, best_cost_desc))

            self.sat_like.free_memory()
            self.sat_like = None
            os.remove(self.decompressed_filename)
            self.decompressed_filename = None

    def reset(self):
        """Load a formula, initialise the solver and return the first observation.

        The previous formula, if any, is released first. The first solver step uses the
        solver's own default hyper-parameters.
        """

        self.close()

        self.number_of_rl_timestep = 0
        self.total_reward = 0
        # Either a random formula, or a fixed one determined by (worker_index, vector_index).
        random_selected_formula_file = random.choice(self.list_of_formula_files) if self.formula_choice_random \
                                           else self.list_of_formula_files[self.worker_index*NUMBER_OF_ENV_PER_WORKER + self.vector_index] 

        work_dir = '/tmp/decompressed_formula'
        filename = os.path.split(random_selected_formula_file)[-1]
        filename = re.sub(r"\.gz$", "", filename, flags=re.IGNORECASE)
        # Timestamp plus worker/env indices keep concurrent environments from sharing a file name.
        woker_env_separated_filename = 'f_%s_%s_%s_%s' % (time.time(), self.worker_index,self.vector_index,filename)
        self.decompressed_filename = os.path.join(work_dir, woker_env_separated_filename)

        if self.verbose > 0: print("[W:%s, E:%s] selected_formula_file = %s" % (self.worker_index, self.vector_index, self.decompressed_filename))
        with gzip.open(random_selected_formula_file, 'rb') as f_in: 
            with open(self.decompressed_filename, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

        self.sat_like = satlikew.Satlike()
        self.sat_like.build_instance(self.decompressed_filename)
        self.sat_like.algo_init(1, todebug=False) #no-randomization (i.e. fixed seed value of one)
        if self.verbose > 0: print("[W:%s, E:%s] formula summary: var_count=%s hard_clause_count=%s soft_clause_count=%s max_soft_weight=%s total_soft_clause_weight=%s" 
              % (self.worker_index, self.vector_index, self.sat_like.get_num_vars(), self.sat_like.get_num_hclauses(), 
                 self.sat_like.get_num_sclauses(), self.sat_like.get_top_clause_weight(), self.sat_like.get_total_soft_weight()))

        assert self.sat_like.get_num_vars() > 0
        assert self.sat_like.get_num_hclauses() >= 0 #in some formula, there can be only soft clauses
        assert self.sat_like.get_num_sclauses() > 0
        assert self.sat_like.get_total_soft_weight() > 0
        assert self.sat_like.get_top_clause_weight() > 0

        self.current_tries = 0
        self.current_step = -1
        self.algorithm_state = 1
        # One above the total soft weight means "no feasible assignment found yet".
        self.last_soft_unsat_weight = self.sat_like.get_total_soft_weight()+1
        self.start_time = time.time()

        obs, reward =  self.__getObservationAndReward(self.sat_like.get_hd_count_threshold(), self.sat_like.get_smooth_probability(),
                                                      self.sat_like.get_h_inc(), self.sat_like.get_softclause_weight_threshold())
        return obs

    def step(self, action):
        """Rescale ``action`` to solver hyper-parameters and run one solver step.

        Returns ``(obs, reward, done, info)``. ``done`` is True once a zero-cost assignment
        has been found or the episode's wall-clock budget is exceeded; ``info`` is empty.
        """
        assert action.shape == self.action_space.shape, str(action)
        assert np.max(action) <= 1.0, str(action)
        assert np.min(action) >= -1.0,  str(action)
        scaled_t = int(upscale(action[0], 1, self.sat_like.get_num_vars()))
        scaled_sp = upscale(action[1], 0, 1)
        #to_swt_to_num_sc_ratio = max(2, self.sat_like.get_top_clause_weight()/self.sat_like.get_num_sclauses()) #note: min of h_inc = 1, thus max can be 2 or higher
        scaled_h_inc = int(upscale(action[2], 1, self.h_inc_eta_max)) #TODO: what should be the upper-bound
        scaled_eta = int(upscale(action[3], 1, self.h_inc_eta_max)) #TODO: what should be the upper-bound

        self.number_of_rl_timestep += 1

        obs , reward = self.__getObservationAndReward(scaled_t,scaled_sp,scaled_h_inc,scaled_eta)
        self.total_reward += reward
        done = (self.last_soft_unsat_weight == 0) or ((time.time() - self.start_time) > self.episode_time_in_sec)
        info = {}
        return obs, reward, done, info

def _make_satlike_env(env_config):
    """Environment factory registered with RLlib under the name "satlike_env"."""
    return SATLikeHyperParamTuneEnvFormulaInvariant(env_config)

register_env("satlike_env", _make_satlike_env)

## PPO general configuration setup

In [None]:
import ray
from ray import tune
import ray.rllib.agents.ppo as ppo
from ray.tune.logger import pretty_print
import pprint
import copy
import random

# Start from an independent copy of RLlib's PPO defaults.
config = copy.deepcopy(ppo.DEFAULT_CONFIG)

# Use one GPU if TensorFlow reports one, otherwise none.
import tensorflow as tf
device_name = tf.test.gpu_device_name()
print('Found GPU at: {}'.format(device_name))
gpu_count = 1 if device_name == '/device:GPU:0' else 0

config.update({
    "num_gpus": gpu_count,
    "num_cpus_for_driver": 0.25,
    "num_cpus_per_worker": 0.5,
    #harcoding to 4, so that for each RL experiment constant 2 CPU worth worker-resource exhausted (config["num_cpus_per_worker"] = 0.5).
    #This ensures multiple tune-trails can run in parallel according to total cpu availiable in the cluster
    "num_workers": 4,
    "num_envs_per_worker": NUMBER_OF_ENV_PER_WORKER,
    "rollout_fragment_length": 64,
})

# One training batch = one rollout fragment from every environment of every worker.
config["train_batch_size"] = config["num_workers"] * config["num_envs_per_worker"] * config["rollout_fragment_length"]
config["sgd_minibatch_size"] = min(config["sgd_minibatch_size"], config["train_batch_size"])
config["horizon"] = config["rollout_fragment_length"] * 2000 #so that it is always proper multiple of config["rollout_fragment_length"]

config.update({
    "preprocessor_pref": None,
    #"framework": 'torch',
    "clip_rewards": False, #as we don't want reward to be cliped in between Tuple[value1, value2]: Clip at value1 and value2.
    # Magnitude of the worst cumulative reward: -3 per step over a full horizon.
    "vf_clip_param": 3 * config["horizon"],
    "clip_actions": True, #without that we are getting action beyond -1 and 1
    "lr": 0.000001,
    "ignore_worker_failures": True,
})

training_env_config = dict(
    mode='training',
    episode_horizon=config["horizon"],
    episode_time_in_sec=300,
    h_inc_eta_max=10000, #POTENTIAL HYPER_PARAM TUNE CANDIDATE
    list_of_formula_files=formula_files17,
    formula_choice_random=True,
    verbose=0,
)

eval_env_config = dict(
    mode='eval',
    episode_horizon=config["horizon"],
    episode_time_in_sec=3600, #intentionally keeping it high so that validation episodes are predominently terminated using horizon condition (it will improve predictibility)
    h_inc_eta_max=training_env_config["h_inc_eta_max"],
    list_of_formula_files=random.sample(formula_files18, 2 * NUMBER_OF_ENV_PER_WORKER),
    formula_choice_random=False,
    verbose=1,
)
config["env_config"] = training_env_config

# Disabled: RLlib's built-in periodic evaluation.
#config["evaluation_interval"] = 15 #after every 15 training iteration
##config['evaluation_num_workers'] =2
#config['evaluation_num_episodes'] = max(1,config['evaluation_num_workers']) * config["num_envs_per_worker"]
#config['evaluation_config']['env_config'] = eval_env_config
#assert config['evaluation_num_episodes'] <= len(eval_env_config['list_of_formula_files'])
pprint.pprint(config)


In [None]:
# Expose nvidia-smi under /usr/bin via a symlink, then print the GPU status.
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!nvidia-smi

## Tune experiment

In [None]:
from ray.tune.schedulers import ASHAScheduler
from ray.tune import CLIReporter, JupyterNotebookReporter
from ray.tune.trial import Trial
import time
import shutil
import os
import math
import copy 

# Drop any existing driver connection and attach to the already running Ray cluster.
ray.shutdown()
ray.init(address='auto', ignore_reinit_error=True)
print('''This cluster consists of {} nodes in total {} CPU resources in total '''.format(len(ray.nodes()), ray.cluster_resources()['CPU']))

# Number of sampled hyper-parameter configurations handed to tune.run.
MAX_NUMBER_OF_EXPERIMENT = 50
# Number of training iterations (those that report a metric) each trial runs.
MAX_TRAINING_ITERATION_PER_TRIAL = 50
# Metric reported by each trial and maximised by the search.
CHOSEN_METRIC = 'episode_reward_mean'

def tune_ppo_in_satlike(search_config):
    """Tune trainable: train PPO on the SATLike environment with one sampled configuration.

    The sampled values in ``search_config`` override a private copy of the notebook-level
    ``config``. Training runs until MAX_TRAINING_ITERATION_PER_TRIAL iterations have produced
    a usable metric; each such iteration is reported to Tune.

    Args:
        search_config: dict of sampled hyper-parameters (keys as listed in the tune.run call).
    """
    start_time_of_training = time.time()
    print('Started at %s for %s' % (start_time_of_training , search_config))
    local_config = copy.deepcopy(config)
    local_config["num_sgd_iter"] = int(search_config["num_sgd_iter"])
    local_config["clip_param"] = search_config["clip_param"]
    local_config["kl_coeff"] = search_config["kl_coeff"]
    local_config["kl_target"] = search_config["kl_target"]
    local_config["rollout_fragment_length"] = search_config["rollout_fragment_length"]
    # Keep the batch size equal to one rollout fragment per environment per worker.
    local_config["train_batch_size"] = local_config["num_workers"] * local_config["num_envs_per_worker"] * local_config["rollout_fragment_length"]
    # A minibatch can never be larger than the training batch.
    local_config["sgd_minibatch_size"] = min(search_config["sgd_minibatch_size"],local_config["train_batch_size"])
    local_config["gamma"] = search_config["gamma"]
    local_config["lambda"] = search_config["lambda"]
    local_config["vf_loss_coeff"] = search_config["vf_loss_coeff"]
    local_config["entropy_coeff"] = search_config["entropy_coeff"]
    #local_config["lr"] = search_config["lr"]
    local_config["model"]["use_lstm"] = search_config["use_lstm"]
    local_config["model"]["lstm_use_prev_action_reward"] = search_config["lstm_use_prev_action_reward"]
    local_config["model"]["max_seq_len"] = int(search_config["max_seq_len"])
    local_config["exploration_config"]["type"] = search_config["exploration_strategy"]
    local_config["vf_share_layers"] = search_config["vf_share_layers"]
    trainer = ppo.PPOTrainer(config=local_config, env=SATLikeHyperParamTuneEnvFormulaInvariant)
    result = None
    try:
        effective_iteration = 0
        while effective_iteration < MAX_TRAINING_ITERATION_PER_TRIAL:
            # Perform one iteration of training the policy with PPO
            result = trainer.train()
            #print(pretty_print(result))
            # Only iterations that finished at least one episode have a meaningful mean reward.
            if result['episodes_total'] > 0 and not math.isnan(result[CHOSEN_METRIC]):
                tune.report(episode_reward_mean=result[CHOSEN_METRIC])
                effective_iteration += 1
        if result is not None:
            print('Ending at %s episode=%s mean-reward=%s' % (time.time(), result['episodes_total'], result[CHOSEN_METRIC]))
    finally:
        # Release the trainer's resources even if training raised.
        trainer.cleanup()

class TimeStopper(tune.Stopper):
    """Tune stopper that ends the whole experiment once a wall-clock deadline has passed.

    The base class is referenced as ``tune.Stopper`` because the bare name ``Stopper``
    is never imported in this notebook.
    """

    def __init__(self, d):
        """Start the clock; ``d`` is the allowed duration in seconds."""
        self._start = time.time()
        self._deadline = d

    def __call__(self, trial_id, score):
        """Never stop an individual trial."""
        return False

    def stop_all(self):
        """Return True once more than ``d`` seconds have elapsed since construction."""
        return time.time() - self._start > self._deadline

class TrialTerminationReporter(CLIReporter):
    """CLI reporter that prints a progress table only when another trial has terminated."""

    def __init__(self):
        super().__init__()
        # Number of terminated trials seen at the previous check.
        self.num_terminated = 0

    def should_report(self, trials, done=False):
        """Reports only on trial termination events."""
        previously_terminated = self.num_terminated
        self.num_terminated = sum(1 for trial in trials if trial.status == Trial.TERMINATED)
        return previously_terminated < self.num_terminated

# Random search over PPO hyper-parameters with ASHA early stopping; each sample runs
# tune_ppo_in_satlike, and the search maximises CHOSEN_METRIC.
analysis = tune.run(
    tune_ppo_in_satlike,
    name="PPO-IN-SatLike",
    resources_per_trial={
        "cpu": 0.5,
        "extra_cpu": 3.5, # NOTE(review): the config cell sets num_cpus_per_worker=0.5, num_workers=4 and num_cpus_for_driver=0.25 (2.25 CPUs in total), which does not match the 0.5 + 3.5 reserved here - confirm the intended budget
    },
    metric = CHOSEN_METRIC,
    mode = 'max',
    num_samples=MAX_NUMBER_OF_EXPERIMENT, #conduct MAX_NUMBER_OF_EXPERIMENT random experiment under ASHA (early stopping)
    scheduler=ASHAScheduler(),
    config={
       #refer: https://medium.com/aureliantactics/ppo-hyperparameters-and-ranges-6fc2d29bccbe
       "rollout_fragment_length": tune.choice([32, 64, 128, 256]),
       "sgd_minibatch_size": tune.choice([64, 128, 256, 512, 1024]),
       "num_sgd_iter": tune.uniform(3,30),
       "clip_param" : tune.choice([0.1, 0.2, 0.3]),
       "kl_coeff" : tune.uniform(0.3,1),
       "kl_target" : tune.uniform(0.003,0.03),
       "gamma": tune.uniform(0.8,0.9997),
       "lambda": tune.uniform(0.9,1),
       "vf_loss_coeff": tune.uniform(0.5, 1),
       "entropy_coeff": tune.uniform(0, 0.01),
       #"lr" : tune.uniform(0.003,5e-6),
       "use_lstm": tune.choice([True,False]),
       "lstm_use_prev_action_reward": tune.choice([True,False]),
       "max_seq_len": tune.uniform(2,30),
       #Only (Multi)Discrete action spaces supported for 'Curiosity' so far! 
       #EpsilonGreedy also shown issue related shape missmatch - need to investigate further 
       #For ParameterNoise, postprocess_trajectory() throws NotImplementedError
       "exploration_strategy": tune.choice(['GaussianNoise', 'OrnsteinUhlenbeckNoise', 'StochasticSampling']),
       "vf_share_layers": tune.choice([True,False])
    },
    checkpoint_at_end=True,
    fail_fast=True,
    # Print progress only when a trial terminates.
    progress_reporter = TrialTerminationReporter()
)
print("Best config: ", analysis.get_best_config(metric=CHOSEN_METRIC))
ray.shutdown()

## Single Reinforcement-learning (RL) experiment

In [None]:
from google.colab import drive
# Mount Google Drive at /gdrive; results are copied there at the end of the notebook.
drive.mount('/gdrive')

In [None]:
import time
import shutil
import os
import math

# Drop any existing driver connection and attach to the already running Ray cluster.
ray.shutdown()
ray.init(address='auto', ignore_reinit_error=True)
print('''This cluster consists of {} nodes in total {} CPU resources in total '''.format(len(ray.nodes()), ray.cluster_resources()['CPU']))

# Base PPO config overridden with one fixed set of hyper-parameters.
tuned_config = copy.deepcopy(config)
tuned_config["num_sgd_iter"] = 26
tuned_config["clip_param"] = 0.1
tuned_config["kl_coeff"] = 0.523515
tuned_config["kl_target"] = 0.0280057
tuned_config["rollout_fragment_length"] = 32
tuned_config["train_batch_size"] = tuned_config["num_workers"] * tuned_config["num_envs_per_worker"] * tuned_config["rollout_fragment_length"]
tuned_config["sgd_minibatch_size"] = min(128,tuned_config["train_batch_size"])
tuned_config["gamma"] = 0.9749
tuned_config["lambda"] = 0.922487
tuned_config["vf_loss_coeff"] = 0.550416
tuned_config["entropy_coeff"] = 0.00990254
tuned_config["model"]["use_lstm"] = True
tuned_config["model"]["lstm_use_prev_action_reward"] = False
tuned_config["model"]["max_seq_len"] = 27
tuned_config["exploration_config"]["type"] = 'OrnsteinUhlenbeckNoise'
tuned_config["vf_share_layers"] = True

# Train with the tuned settings assembled above; previously the untuned base `config`
# was passed, so none of the overrides took effect.
trainer = ppo.PPOTrainer(config=tuned_config, env="satlike_env")
#trainer.restore('/gdrive/My Drive/Colab Notebooks/capstone_proj1/best_checkpointed/best_checkpointed')#mean reward=-228.34249603086218

class EvalEnvDictWrap(dict):
    """Dict copy of an env config that also carries ``worker_index`` and ``vector_index`` attributes.

    The environment reads its settings by key and the two indices as attributes, so this
    wrapper lets an environment be built directly, outside RLlib.
    """

    def __init__(self, env_conf, w_i, v_i):
        super().__init__()
        # Copy every entry of the source config into this dict.
        self.update(env_conf)
        self.worker_index = w_i
        self.vector_index = v_i

def compare_with_original_algo(formula_file, max_iteration):
    """Run SATLike with its own default hyper-parameters and return the best soft cost found.

    Args:
        formula_file: path of an uncompressed formula file.
        max_iteration: total number of local-search steps allowed across all tries.

    Returns:
        The lowest unsatisfied soft weight seen with all hard clauses satisfied, or
        ``total_soft_weight + 1`` if no such assignment was found.
    """
    satlike = satlikew.Satlike()
    # try/finally so the native solver memory is released even if the search raises.
    try:
        satlike.build_instance(formula_file)
        satlike.algo_init(1, todebug=False) #no-randomization (i.e. fixed seed value of one)

        # One above the total soft weight means "no feasible assignment found yet".
        last_soft_unsat_weight = satlike.get_total_soft_weight()+1
        start_time = time.time()
        break_from_outer_loop = False
        iteration_count = 0
        while not break_from_outer_loop:
            satlike.init_with_decimation_stepwise()
            current_step = 1
            while current_step < satlike.get_max_flips():
                current_step += 1
                # Feed the solver's own default hyper-parameters back into every step.
                satlike.local_search_stepwise(satlike.get_hd_count_threshold(), 
                                              satlike.get_smooth_probability(), 
                                              satlike.get_h_inc(), 
                                              satlike.get_softclause_weight_threshold(), 
                                              current_step, False)
                if (satlike.get_hard_unsat_nb() == 0) and (satlike.get_opt_unsat_weight() < last_soft_unsat_weight):
                    print("opt_unsat_weight = %s time-taken in sec=%s" % (satlike.get_opt_unsat_weight(), time.time() - start_time))
                    last_soft_unsat_weight = satlike.get_opt_unsat_weight()

                if last_soft_unsat_weight == 0:
                    break_from_outer_loop = True
                    break

                iteration_count += 1
                if iteration_count >= max_iteration:
                    break_from_outer_loop = True
                    break 
    finally:
        satlike.free_memory()
    return last_soft_unsat_weight

def validate_on_unseen_data(my_trainer, max_iteration=30000, verbose=0):
    """Compare the trained policy with default SATLike on every evaluation formula.

    For each file in ``eval_env_config["list_of_formula_files"]`` the policy drives the
    environment for ``max_iteration`` steps without exploration, then the default solver
    gets the same iteration budget on the same decompressed file.

    Args:
        my_trainer: trainer exposing ``get_policy()`` (or a ``policy`` attribute).
        max_iteration: number of steps per formula for both runs.
        verbose: print the two totals when greater than zero.

    Returns:
        ``(eval_episode_sat_soft_clause, comparative_result_from_original_algo)``: satisfied
        soft weight summed over all formulas for the policy-driven and the default runs.
        A formula with no feasible assignment found contributes nothing to its total.
    """
    try:
        pol = my_trainer.get_policy()
    except AttributeError:
        pol = my_trainer.policy

    eval_episode_sat_soft_clause = 0
    comparative_result_from_original_algo = 0
    for i in range(len(eval_env_config["list_of_formula_files"])):
        # Index i maps to a (worker, env) pair so each formula in the list is used exactly once.
        eval_env = SATLikeHyperParamTuneEnvFormulaInvariant(
                EvalEnvDictWrap(eval_env_config,i//NUMBER_OF_ENV_PER_WORKER,i%NUMBER_OF_ENV_PER_WORKER))

        # try/finally so the solver memory and the temporary formula file are released
        # even if evaluation raises.
        try:
            obs = eval_env.reset()
            action_in = None
            reward_in = None
            state_in = pol.model.get_initial_state()
            # NOTE(review): `done` is not consulted, so every formula gets exactly
            # max_iteration policy steps - confirm this is intended.
            for _ in range(max_iteration):

                #refer: https://github.com/ray-project/ray/blob/master/rllib/utils/test_utils.py (refer: check_compute_single_action())
                action, state, _ = pol.compute_single_action(
                        obs,
                        state_in,
                        prev_action=action_in,
                        prev_reward=reward_in,
                        clip_actions=tuned_config["clip_actions"],
                        explore=False)
                action_in = action
                state_in = state
                obs, reward, done, info = eval_env.step(action)
                reward_in = reward

            # A best cost still above the total soft weight means no feasible assignment was found.
            unsat_soft_clause = eval_env.last_soft_unsat_weight if (eval_env.last_soft_unsat_weight < eval_env.sat_like.get_total_soft_weight()+1) else -1
            if unsat_soft_clause >= 0:
                eval_episode_sat_soft_clause += (eval_env.sat_like.get_total_soft_weight() - unsat_soft_clause)

            last_soft_unsat_weight = compare_with_original_algo(eval_env.decompressed_filename,max_iteration)
            unsat_soft_clause = last_soft_unsat_weight if (last_soft_unsat_weight < eval_env.sat_like.get_total_soft_weight()+1) else -1
            if unsat_soft_clause >= 0:
                comparative_result_from_original_algo += (eval_env.sat_like.get_total_soft_weight() - unsat_soft_clause)
        finally:
            eval_env.close()

    if verbose > 0: print("******** eval_episode_sat_soft_clause = %s comparative_result_from_original_algo=%s ******" 
                          % (eval_episode_sat_soft_clause, comparative_result_from_original_algo))
    return eval_episode_sat_soft_clause, comparative_result_from_original_algo

# Train for a fixed wall-clock budget, validating on the held-out formulas after every iteration.
TRAINING_BUDGET_IN_SEC = 120 * 60 #120 minutes
best_episode_reward_mean = None
start_time_of_training = time.time()
while True:
    if time.time() - start_time_of_training >= TRAINING_BUDGET_IN_SEC:
        break
    # Perform one iteration of training the policy with PPO
    result = trainer.train()
    validate_on_unseen_data(trainer, verbose=1)
    #print(pretty_print(result))

    # Disabled alternative: checkpoint and validate only when the mean reward improves.
    #if best_episode_reward_mean == None:
    #    if result['episodes_total'] > 0 and math.isnan(result['episode_reward_mean']) == False:
    #        best_episode_reward_mean = result['episode_reward_mean']
    #        print("Initially: ", best_episode_reward_mean)
    #        validate_on_unseen_data(trainer, verbose=1)
    #else:
    #    if result['episode_reward_mean'] > best_episode_reward_mean:
    #        best_episode_reward_mean = result['episode_reward_mean']
    #        checkpoint = trainer.save()
    #        print(best_episode_reward_mean, checkpoint)
    #        validate_on_unseen_data(trainer, verbose=1)

trainer.cleanup()
ray.shutdown()

In [None]:
# Copy the Ray results directory to the mounted Google Drive.
!cp -Rf ~/ray_results '/gdrive/My Drive/Colab Notebooks/capstone_proj1/' 

In [None]:
# Stop the local Ray processes (verbose output).
!ray stop -v