
Commit 319252b: July 2021 release
Co-authored-by: Kaidi Xu <xu.kaid@husky.neu.edu>
Co-authored-by: Zhouxing Shi <zhouxingshichn@gmail.com>
Co-authored-by: Yihan Wang <wangyihan617@gmail.com>
Co-authored-by: Shiqi Wang <tcwangshiqi@cs.columbia.edu>
5 people committed Aug 2, 2021
1 parent c8935c6 commit 319252b
Showing 58 changed files with 5,119 additions and 1,242 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -5,4 +5,4 @@ __pycache__
dist
*.swp
*.log
-.trace_graph
+.trace_graph
8 changes: 6 additions & 2 deletions .travis.yml
@@ -1,12 +1,16 @@
language: python
python:
- "3.7"
- "3.8"
install:
- pip install --editable .
- cd examples
- pip install -r requirements.txt
- pip install torchvision==0.6.0 torch==1.7.0
- cd ..
- sudo fallocate -l 16G /swapfile
- sudo chmod 600 /swapfile
- sudo mkswap /swapfile
- sudo swapon /swapfile
- free -h
script:
- cd tests
- python utils/download_models.py
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,4 +1,4 @@
-Copyright 2020 Kaidi Xu, Zhouxing Shi, Huan Zhang
+Copyright 2021 Kaidi Xu, Zhouxing Shi, Huan Zhang, Yihan Wang, Shiqi Wang

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

388 changes: 117 additions & 271 deletions README.md

Large diffs are not rendered by default.

179 changes: 179 additions & 0 deletions auto_LiRPA/adam_element_lr.py
@@ -0,0 +1,179 @@
import torch
import math
from torch.optim.optimizer import Optimizer
from torch import Tensor
from typing import List, Optional


def adam(params: List[Tensor],
         grads: List[Tensor],
         exp_avgs: List[Tensor],
         exp_avg_sqs: List[Tensor],
         max_exp_avg_sqs: List[Tensor],
         state_steps: List[int],
         *,
         amsgrad: bool,
         beta1: float,
         beta2: float,
         lr: float,
         weight_decay: float,
         eps: float,
         lr_scale: Optional[Tensor],
         batch_dim: Optional[int]):
    r"""Functional API that performs Adam algorithm computation.
    See :class:`~torch.optim.Adam` for details.
    """

    for i, param in enumerate(params):

        grad = grads[i]
        exp_avg = exp_avgs[i]
        exp_avg_sq = exp_avg_sqs[i]
        step = state_steps[i]

        bias_correction1 = 1 - beta1 ** step
        bias_correction2 = 1 - beta2 ** step

        if weight_decay != 0:
            grad = grad.add(param, alpha=weight_decay)

        # Decay the first and second moment running average coefficient
        exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
        if amsgrad:
            # Maintains the maximum of all 2nd moment running avg. till now
            torch.maximum(max_exp_avg_sqs[i], exp_avg_sq, out=max_exp_avg_sqs[i])
            # Use the max. for normalizing running avg. of gradient
            denom = (max_exp_avg_sqs[i].sqrt() / math.sqrt(bias_correction2)).add_(eps)
        else:
            denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps)

        step_size = lr / bias_correction1
        if lr_scale is not None:
            # Per-batch-element learning rate scaler.
            # We must know which dimension corresponds to the batch and broadcast accordingly.
            # E.g., with batch_dim=0, a parameter of shape (B, C, H, W) and lr_scale of
            # shape (B,), new_shape becomes (B, 1, 1, 1) so the scaler broadcasts over
            # all non-batch dimensions.
            total_dim = exp_avg.ndim
            new_shape = (1, ) * batch_dim + (lr_scale.size(0), ) + (1, ) * (total_dim - 1 - batch_dim)
            scaler = lr_scale.view(*new_shape)
            param.addcdiv_(scaler * exp_avg, denom, value=-step_size)
        else:
            param.addcdiv_(exp_avg, denom, value=-step_size)
        if lr_scale is not None:
            # Debugging hook (no-op).
            pass
            # print('lr scaler', lr_scale)


class AdamElementLR(Optimizer):
    r"""Implements the Adam algorithm, with the capability of setting a different lr
    per batch element.

    It has been proposed in `Adam: A Method for Stochastic Optimization`_.
    The implementation of the L2 penalty follows changes proposed in
    `Decoupled Weight Decay Regularization`_.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _Decoupled Weight Decay Regularization:
        https://arxiv.org/abs/1711.05101
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamElementLR, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(AdamElementLR, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    @torch.no_grad()
    def step(self, lr_scale=None, closure=None):
        """Performs a single optimization step.

        Args:
            lr_scale (list of Tensor, optional): per-batch-element learning rate
                scalers, one tensor per parameter group.
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for i, group in enumerate(self.param_groups):
            params_with_grad = []
            grads = []
            exp_avgs = []
            exp_avg_sqs = []
            max_exp_avg_sqs = []
            state_steps = []
            beta1, beta2 = group['betas']

            for p in group['params']:
                if p.grad is not None:
                    params_with_grad.append(p)
                    if p.grad.is_sparse:
                        raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                    grads.append(p.grad)

                    state = self.state[p]
                    # Lazy state initialization
                    if len(state) == 0:
                        state['step'] = 0
                        # Exponential moving average of gradient values
                        state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                        # Exponential moving average of squared gradient values
                        state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                        if group['amsgrad']:
                            # Maintains max of all exp. moving avg. of sq. grad. values
                            state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)

                    exp_avgs.append(state['exp_avg'])
                    exp_avg_sqs.append(state['exp_avg_sq'])

                    if group['amsgrad']:
                        max_exp_avg_sqs.append(state['max_exp_avg_sq'])

                    # Update the step count for each parameter in this group
                    state['step'] += 1
                    # Record the step after the update
                    state_steps.append(state['step'])

            adam(params_with_grad,
                 grads,
                 exp_avgs,
                 exp_avg_sqs,
                 max_exp_avg_sqs,
                 state_steps,
                 amsgrad=group['amsgrad'],
                 beta1=beta1,
                 beta2=beta2,
                 lr=group['lr'],
                 weight_decay=group['weight_decay'],
                 eps=group['eps'],
                 lr_scale=lr_scale[i] if lr_scale is not None else None,
                 # 'batch_dim' is expected to be provided in each parameter group.
                 batch_dim=group['batch_dim'],
                 )
        return loss
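
Below is a minimal usage sketch of the new AdamElementLR optimizer. It is not part of the commit; the parameter tensor, its shape, and the loss are made up for illustration. It assumes the optimized tensor's dimension 0 is the batch dimension, passes batch_dim through the parameter group (step() reads group['batch_dim']), and supplies one lr_scale tensor per parameter group so that each batch element gets its own learning-rate multiplier.

import torch
from auto_LiRPA.adam_element_lr import AdamElementLR

batch_size = 4
# Hypothetical optimized tensor; dimension 0 is the batch dimension.
alpha = torch.full((batch_size, 10), 0.5, requires_grad=True)

# 'batch_dim' is passed via the parameter group, since step() reads group['batch_dim'].
opt = AdamElementLR([{'params': [alpha], 'batch_dim': 0}], lr=0.1)

for _ in range(5):
    opt.zero_grad()
    loss = (alpha ** 2).sum()
    loss.backward()
    # One scaler tensor per parameter group, with one entry per batch element;
    # inside adam() it is reshaped to (batch_size, 1) and broadcast over dim 1.
    lr_scale = [torch.linspace(0.1, 1.0, steps=batch_size)]
    opt.step(lr_scale=lr_scale)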