### Asynchronous SGD updates using Distributed Cache Redis

1. Create spot instances
2. Mount EFS (Not required for our project)
3. Start redis server in all the instances and create a cluster
4. Pull the code from github repo (https://github.com/SrujithPoondla/vanilla-hogwild.git)
5. If need to divide the dataset between the nodes run the specific cell
6. Run the scripts to start training
7. After training ends close the instances

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from aws_setup import *
from argparse import ArgumentParser
import boto3
import os 
sys.argv = ['foo']

#### Define parameters

In [3]:
parser = ArgumentParser(description='Asynchronous SGD updates using Redis')
parser.add_argument('--n-nodes', type=int, default=1, metavar='N',
                    help='how many aws instances to start')
parser.add_argument('--is-redis', action='store_true', default = True,
                    help="Choose whether the model to be trained using redis or not."
                    "If not using Redis, model will be trained on single process")


_StoreTrueAction(option_strings=['--is-redis'], dest='is_redis', nargs=0, const=True, default=True, type=None, choices=None, help='Choose whether the model to be trained using redis or not.If not using Redis, model will be trained on single process', metavar=None)

In [4]:
args = parser.parse_args()
vpc_name='vpc-1b056b60'
args.n_nodes = 1
n_instances = args.n_nodes
instance_type = 'm5.2xlarge'
ami_sr = 'ami-c2f670bd'
a_zone = 'us-east-1a'

if args.n_nodes is 2 and args.is_redis:
    print('Cant create a cluster with 2 redis nodes. Chose either a 3 node cluster or single instance')

#### Get Existing VPC by tag name

In [5]:
vpc = get_vpc(vpc_name); vpc

ec2.Vpc(id='vpc-1b056b60')

#### Create EFS (if you haven't already)

In [6]:
# efs_tag = f'{vpc_name}-efs'

In [7]:
# efs = create_efs(efs_tag, vpc, performance_mode='maxIO')

#### Request Spot instance

In [8]:
instance_name = f'{vpc_name}-instance'
# Recommend a high compute instance as we need to do multi-threaded resizing later on

In [9]:
spot_price = get_spot_prices()[instance_type]
bid_price = "%.4f" % (float(spot_price)*3)
print(f'Spot price: {spot_price}, Bid price: {bid_price}')

Spot price: 0.137900, Bid price: 0.4137


In [10]:
launch_specs = LaunchSpecs(vpc, instance_type=instance_type, ami= ami_sr,availability_zone=a_zone).build()

In [11]:
# launch_specs['BlockDeviceMappings'][0]['Ebs']['VolumeSize'] = 1000

In [12]:
launch_specs

{'BlockDeviceMappings': [{'DeviceName': '/dev/sda1',
   'Ebs': {'DeleteOnTermination': True,
    'VolumeSize': 20,
    'VolumeType': 'gp2'}}],
 'ImageId': 'ami-c2f670bd',
 'InstanceType': 'm5.2xlarge',
 'KeyName': 'aws-key-spot-instance',
 'NetworkInterfaces': [{'AssociatePublicIpAddress': True,
   'DeviceIndex': 0,
   'Groups': ['sg-2624da6f'],
   'SubnetId': 'subnet-10d9d04d'}]}

In [13]:
ec2 = boto3.resource('ec2')
filters = [
    {
        'Name': 'instance-state-name',
        'Values': ['running']
    }
]
# filter the instances based on filters() above
ec2_instances = list(ec2.instances.filter(Filters=filters))
instances = []
for instance in ec2_instances:
    instances.append(instance)
instances_to_request = n_instances-len(instances)
instances = create_multiple_spot_instance(instance_name, launch_specs,instance_count=args.n_nodes, spot_price=bid_price)
print(instances)

Keypair exists
Waiting on spot fullfillment...
Fulfillment completed. InstanceId: i-046f3edc26f8125b0
Rebooting...
Completed. SSH:  ssh -i ~/.ssh/aws-key-spot-instance.pem ubuntu@54.209.22.20
[ec2.Instance(id='i-046f3edc26f8125b0')]


In [14]:
# instance = get_instance(instance_name); instance
ssh_commands = []
public_ip_list = []
private_ip_list = []
for instance in instances:
    # for each instance, append to lists
    private_ip_list.append(instance.private_ip_address)
    public_ip_list.append(instance.public_ip_address)
    ssh_commands.append(get_ssh_command(instance))
print(ssh_commands, public_ip_list, private_ip_list)

['ssh -i ~/.ssh/aws-key-spot-instance.pem ubuntu@54.209.22.20'] ['54.209.22.20'] ['10.0.0.12']


#### Terminating instances

In [31]:
# for instance in instances:
#     print(instance.terminate())

{'TerminatingInstances': [{'CurrentState': {'Code': 32, 'Name': 'shutting-down'}, 'InstanceId': 'i-046f3edc26f8125b0', 'PreviousState': {'Code': 16, 'Name': 'running'}}], 'ResponseMetadata': {'RequestId': '691dcf19-f17e-460f-8b4d-6485c4040e7c', 'HTTPStatusCode': 200, 'HTTPHeaders': {'content-type': 'text/xml;charset=UTF-8', 'transfer-encoding': 'chunked', 'vary': 'Accept-Encoding', 'date': 'Sat, 05 May 2018 17:13:34 GMT', 'server': 'AmazonEC2'}, 'RetryAttempts': 0}}


### SSH

In [16]:

def get_ssh_clients(instances):
    clients = []
    for instance in instances:
        clients.append(connect_to_instance(instance))
    print(clients)
    return clients
clients = get_ssh_clients(instances)

Connecting to SSH...
Got client
/Users/srujithpoondla/.ssh/aws-key-spot-instance.pem
Exception: timed out Retrying...
Connected!
[<paramiko.client.SSHClient object at 0x10b7dd780>]


#### Mount EFS

In [17]:
# efs_addr = get_efs_address('fast-ai-efs'); efs_addr

In [18]:
# _ = run_command(client, 'mkdir ~/efs_mount')

In [19]:
# efs_mount_cmd = f'sudo mount -t nfs -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 {efs_addr}:/ ~/efs_mount'
# _ = run_command(client, efs_mount_cmd)

In [20]:
# _ = run_command(client, 'ls efs_mount') # no reformatting

## Tmux

In [21]:
def get_tmux_sess(clients):
    tsess = []
    for client in clients:
        if 'sess' not in run_command(client,'tmux ls'):
            tsess.append(TmuxSession(client, 'sess'))         
    print(tsess)
    return tsess
tsess = get_tmux_sess(clients)

[<aws_setup.TmuxSession object at 0x107ef8208>]


#### Activate Conda Environment in all the instances and check whether we need to create a cluster or not. Then start redis using conf files.

In [22]:
for client,sess in zip(clients,tsess):
    if 'redis' not in run_command(client,'tmux ls'):
        redis = TmuxSession(client, 'redis-sess')
    run_command(client, 'cd ~/vanilla-hogwild && git stash && git pull && git checkout stable')
    if (args.n_nodes >= 3):
        print(redis.run_command('~/miniconda3/envs/largescale/bin/redis-server ~/redis-conf/redis_cluster.conf'))
    else:
        print(redis.run_command('~/miniconda3/envs/largescale/bin/redis-server ~/redis-conf/redis.conf'))
  

('', '')


#### Create redis cluster

In [23]:
if args.n_nodes >= 3:
    ip_str = ''
    for ip in private_ip_list:
        ip_str = ip_str+ ip +":6379 "
    print(ip_str)
    redis = TmuxSession(clients[0],'redis-serv-sess')
    redis.run_command('cd /home/ubuntu/redis-4.0.9/src && ./redis-trib.rb create '+ ip_str + '&& yes')
#     redis.run_command('yes')


#### Creating Arguments String

In [24]:
#Model parameters
batch_size = 128
epochs = 1
lr = 0.01
momentum = 0.5
log_interval = 50
num_processes = 2
nnet_arch = 'LeNet'
dataset = 'cifar10'
args.is_redis = True


In [25]:
hosts = ''
for ip in private_ip_list:
    hosts = hosts+ip+','
hosts = hosts.strip(',')

In [26]:
#dataset can be 'MNIST' or 'cifar10'
#architecture can be 'LeNet' or 'ResNet'(still working on this)
#num_processes should be either 1 or 2
#batch size 128,256,512,1024,2048

arg_str = '--is-redis='+str(args.is_redis)+' --dataset='+dataset+' --nnet-arch='+nnet_arch+' --num-processes='+\
str(num_processes) + ' --batch-size='+str(batch_size) +' --lr='+str(lr) + ' --hosts='+hosts +' --epochs='+str(epochs)

print(arg_str)

--is-redis=True --dataset=cifar10 --nnet-arch=LeNet --num-processes=2 --batch-size=128 --lr=0.01 --hosts=10.0.0.12 --epochs=1


#### Chose the log file name

In [27]:
if args.is_redis:
    log_file = dataset+'-'+nnet_arch+'-'+str(batch_size)+'-'+str(num_processes)+'-'+ 'redis'
else:
    log_file = dataset+'-'+nnet_arch+'-'+str(batch_size)+'-'+str(num_processes)
print('Log file name: '+log_file)

Log file name: cifar10-LeNet-128-2-redis


In [28]:
out = run_command(clients[0],'w')[0].split(',')[1].strip().split(' ')[0]

In [29]:
count = 0
running = True
for sess in tsess:
    sess.run_command('source activate largescale')
    sess.run_command('python3 -u ~/vanilla-hogwild/main.py '+arg_str+ '2>&1 | tee '+log_file)
#     sess.run_command('exit')

# sleep(200)
# while(running):
#     sleep(100)
#     for client in clients:
#         try:
#             if int(run_command(clients[0],'w')[0].split(',')[1].strip().split(' ')[0]) > 3:
#                 continue
#             else:
#                 count = count+1
#         except Exception:
#             print(Exception)
#         finally:
#             clients = get_ssh_clients(instances)
#         if count==len(clients):
#             running = False
#             break

# if not os.path.exists('~/results_lsml'):
#     os.makedirs('~/results_lsml')
# for client in clients:
#     sftp = client.open_sftp()
#     localpath = '.'
#     remotepath = '/'+log_file
#     sftp.put(localpath, remotepath)
#     sftp.close()
#     ssh.close()
# for instance in instances:
#     print(instance.terminate())
# for ip in public_ip_list:
#     ec2c.release_address(PublicIp=ip)

In [30]:
for ip in private_ip_list:
    redis.run_command('. ~/miniconda3/bin/activate largescale &&  ~/miniconda3/envs/largescale/bin/redis-cli -h '+ str(ip)+ 'flushall'))


SyntaxError: invalid syntax (<ipython-input-30-678e2187106f>, line 2)

In [None]:
clients = get_ssh_clients(instances)

In [None]:
run_command(clients[0],'sudo tmux new-session -s sr -d')

In [None]:
print(srujith)