### Libraries

In [None]:
import torch  
import random
import numpy as np  
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import genpareto

### Actor Critic 

In [7]:
class ActorCritic(nn.Module):
    """Actor-critic model made of two independent fully connected networks.

    The critic maps a state to a single scalar value estimate
    (``num_inputs -> 12 -> 120 -> 12 -> 1``); the actor maps the same state
    to a probability distribution over the discrete actions
    (``num_inputs -> 12 -> 12 -> num_actions``).  The two networks share no
    parameters.

    Args:
        learning_rate: Accepted for backward compatibility only; the module
            itself never reads it (the optimizer is created by the caller).
        num_inputs: Number of features in one state vector.
        num_actions: Number of discrete actions the actor chooses between.
    """

    def __init__(self, learning_rate=3e-4, num_inputs=2, num_actions=5):
        super().__init__()

        self.num_inputs = num_inputs
        self.num_actions = num_actions

        # Layers are created in a fixed order (critic first, then actor) so
        # that a given random seed always yields the same initial weights.
        self.critic_linear1 = nn.Linear(self.num_inputs, 12)
        self.critic_linear2 = nn.Linear(12, 120)
        self.critic_linear3 = nn.Linear(120, 12)
        self.critic_linear4 = nn.Linear(12, 1)

        self.actor_linear1 = nn.Linear(self.num_inputs, 12)
        self.actor_linear2 = nn.Linear(12, 12)
        self.actor_linear4 = nn.Linear(12, self.num_actions)

    def forward(self, state):
        """Evaluate both heads for ``state``.

        Args:
            state: NumPy array, tensor or nested sequence whose last
                dimension has ``num_inputs`` entries.  It is converted to a
                float32 tensor, so a ``(1, num_inputs)`` NumPy array works
                exactly as before, and tensors or unbatched 1-D states are
                accepted as well.

        Returns:
            A ``(value, policy_dist)`` tuple.  ``value`` has the input's
            leading dimensions followed by a trailing dimension of size 1;
            ``policy_dist`` has a trailing dimension of size ``num_actions``
            that sums to 1.
        """
        # torch.as_tensor replaces the deprecated Variable(torch.from_numpy(...))
        # wrapper and additionally accepts inputs that are already tensors.
        state = torch.as_tensor(state, dtype=torch.float32)

        value = F.relu(self.critic_linear1(state))
        value = F.relu(self.critic_linear2(value))
        value = F.relu(self.critic_linear3(value))
        value = self.critic_linear4(value)

        hidden = F.relu(self.actor_linear1(state))
        hidden = F.relu(self.actor_linear2(hidden))
        # Normalise over the action dimension (the last one).  For the usual
        # (batch, num_inputs) input this is identical to dim=1.
        policy_dist = F.softmax(self.actor_linear4(hidden), dim=-1)

        return value, policy_dist

### Parameters

In [4]:
# NOAA sea level rise predictions.
# The yearly sea level rise is drawn from a gamma distribution with
# scale B and a year-dependent shape A[scenario, year].

# Scale parameter of the gamma distribution.
B = 0.5

# Shape parameter of the gamma distribution: one row per scenario, one column
# per year.  Each row grows linearly as start + step * year.  The rows are
# built from an integer year index rather than np.arange(start, stop, step),
# because with a fractional step the length of np.arange depends on
# floating-point rounding and could silently stop matching the 100 columns.
_year_index = np.arange(100)
A = np.zeros((3, 100))
A[0, :] = 11.2 + 0.012 * _year_index  # intermediate low rise case
A[1, :] = 13 + 0.22 * _year_index  # intermediate high rise case
A[2, :] = 14.6 + 0.74 * _year_index  # high rise case

# Generalized Pareto distribution of the yearly cost from nature.
power_l = 0.9  # exponent applied to the sea level in the scale term
power_s = 0.8  # exponent applied to the infrastructure state in the scale term
eta = 250  # multiplier of the scale term
k = -0.1  # shape parameter
theta = 14  # location parameter

# Parameters for the cost definition.
# Coefficient of the residents' investment decision y_n: residents'
# contribution of 14M $ yearly, with 1M $ taken as the unit.
beta = 14
# Coefficient of the investment cost.
# NOTE(review): the previous comment said "25m $" while the value is 300 --
# confirm which one is intended.
alpha = 300

### Main Code

In [None]:
%%time
# Advantage actor-critic training loop.
#
# Each episode simulates `years` yearly steps.  At every step the policy picks
# an investment level, a random cost from nature and a random sea level rise
# are drawn, and the yearly cost is recorded.  After the episode the
# discounted returns of the negated costs are used to update both networks.

GAMMA = 0.99  # discount factor
a_g = 0.9  # government's cooperation index (not referenced by the code below)
a_r = 0.9  # residents' cooperation index
# SLR scenario: a = 0, 1 and 2 respectively for intermediate low,
# intermediate, and high sea level rise projections.
a = 2

# A trailing comma used to turn s_norm into the tuple (400,); it is a scalar.
s_norm = 400  # normalizing constant for s
l_norm = [580, 1190, 2590]  # normalizing constant for l, one per scenario
years = 100
episodes = 1000000

entropy_coef = 0.001  # weight of the entropy bonus in the loss
log_floor = 1e-8  # lower clamp for probabilities before taking the log

actor_critic = ActorCritic()
ac_optimizer = optim.Adam(actor_critic.parameters(), lr=3e-4)

all_rewards = []

for episode in range(episodes):
    log_probs = []  # log probability of each chosen action
    values = []  # critic outputs, kept as tensors so gradients reach the critic
    rewards = np.zeros(years)  # yearly cost (negated when computing returns)
    entropy_term = 0  # reset every episode; accumulates the policy entropy

    l = np.zeros(years + 1)
    s = np.zeros(years + 1)
    l[0] = 100  # initial relative sea level
    s[0] = 50  # initial infrastructure state
    r = 0  # sea level rise each year
    q = np.zeros(years + 1)  # yearly residents' decision score
    sig = np.zeros(years + 1)  # yearly sigmoid output for residents' decision
    res = np.zeros(years + 1)  # yearly residents' binary decision
    x = np.zeros(years)  # yearly government's decision
    z = np.zeros(years)  # yearly cost from nature
    state = np.zeros((1, 2))
    next_state = np.zeros((1, 2))

    for y in range(years):
        state[0, 0] = (l[y] - l[0]) / l_norm[a]
        state[0, 1] = (s[y] - s[0]) / s_norm

        value, policy_dist = actor_critic(state)
        values.append(value.squeeze())
        dist = policy_dist.detach().numpy()

        action = np.random.choice(actor_critic.num_actions, p=np.squeeze(dist))
        x[y] = action
        # Generalized Pareto draw: shape k, location theta, and a scale that
        # grows with the sea level and shrinks with the infrastructure state.
        z[y] = genpareto.rvs(
            k,
            loc=theta,
            scale=eta * np.power(l[y], power_l) / np.power(s[y], power_s),
        )
        # The action is divided by 4 because it ranges over 0..4.
        q[y + 1] = a_r * (q[y] + action / 4 * z[y])
        sig[y + 1] = 1 / (1 + np.exp(-(q[y + 1] - 5)))
        res[y + 1] = np.random.binomial(1, sig[y + 1])

        r = np.random.gamma(A[a, y], B)
        l[y + 1] = l[y] + r
        s[y + 1] = s[y] + action
        # NOTE(review): the cost uses res[y], i.e. the residents' decision
        # drawn in the previous step -- confirm res[y + 1] is not intended.
        rewards[y] = alpha * x[y] - beta * res[y] + z[y]

        log_probs.append(torch.log(policy_dist.squeeze(0)[action]))
        # Shannon entropy -sum(p * log p) of the policy, kept as a tensor so
        # it contributes a gradient.  The previous NumPy expression used
        # mean(p) in place of p and could not influence training.
        step_entropy = -(
            policy_dist * torch.log(policy_dist.clamp_min(log_floor))
        ).sum()
        entropy_term = entropy_term + step_entropy

    # State reached after the final year, used to bootstrap the return.
    next_state[0, 0] = (l[years] - l[0]) / l_norm[a]
    next_state[0, 1] = (s[years] - s[0]) / s_norm
    with torch.no_grad():
        bootstrap_value, _ = actor_critic(next_state)
    # .item() yields a plain float; assigning a (1, 1) array into a scalar
    # slot is deprecated in NumPy.
    Qval = bootstrap_value.item()
    all_rewards.append(np.sum(rewards))

    # Compute discounted returns; costs are negated so that lower cost means
    # higher return.
    Qvals = np.zeros(years)
    for t in reversed(range(years)):
        Qval = -rewards[t] + GAMMA * Qval
        Qvals[t] = Qval

    # Update actor and critic.
    values = torch.stack(values)
    Qvals = torch.as_tensor(Qvals, dtype=torch.float32)
    log_probs = torch.stack(log_probs)

    advantage = Qvals - values
    # The advantage is detached in the actor term so that the policy gradient
    # does not flow into the critic; the critic learns from critic_loss only.
    actor_loss = (-log_probs * advantage.detach()).mean()
    critic_loss = 0.5 * advantage.pow(2).mean()
    # Subtracting the entropy rewards more exploratory policies.
    ac_loss = actor_loss + critic_loss - entropy_coef * entropy_term

    # Gradients must be cleared before ac_loss.backward() accumulates new ones.
    ac_optimizer.zero_grad()
    ac_loss.backward()
    ac_optimizer.step()

# Plot results
smoothed_rewards = list(pd.Series(all_rewards).rolling(10).mean())
plt.plot(all_rewards)
plt.plot(smoothed_rewards)
plt.xlabel('Episode')
plt.ylabel('Total Cost in M$')
plt.show()

# Average cost over all episodes and over the last (up to) 10000 episodes.
# Negative slicing keeps the window valid when fewer than 10000 episodes ran.
final_window = all_rewards[-10000:]
print("Total cost: {}, final cost: {}" .format(np.mean(all_rewards), np.mean(final_window)))