# FUNCTION GENERATOR using Policy Gradient

Useful links:
- Policy gradient explanation: http://karpathy.github.io/2016/05/31/rl/
- Example policy gradient implementation: https://github.com/keon/policy-gradient

In [1]:
import numpy as np
from keras.models import Sequential, Model
from keras.layers import TimeDistributed, Dense, Reshape, Flatten, GRU, Input, Embedding, LSTM
from keras.optimizers import Adam
from keras.layers.convolutional import Convolution2D
from PolicyGradientModel import PolicyGradientModel
from RewardCalculator import RewardCalculator

Using TensorFlow backend.


In [13]:
# --- Generator configuration -------------------------------------------------
# Target expression handed to the reward calculator.
CORRECT_EXPRESSION = "3*X+2*Y"
MAX_LENGTH = 10 # Max length of the output expression

# Variable names that may appear in an expression.
ALLOWED_PARAMETERS = list('XY')

# Full output vocabulary: the variables first, then digits, operators and '#'.
# NOTE(review): '#' is presumably an end-of-expression/padding marker -- confirm
# in PolicyGradientModel.
ALLOWED_SYMBOLS = list(ALLOWED_PARAMETERS)
ALLOWED_SYMBOLS.extend('0123456789+-*/#')
NUM_SYMBOLS = len(ALLOWED_SYMBOLS)

### DEFINE MODEL

In [14]:
def getModel(numLstmLayers=1):
    """Build and compile the network that emits a per-step symbol distribution.

    The graph is: Input(batch_shape=(1, MAX_LENGTH, 1)) -> `numLstmLayers`
    sequence-returning LSTM(512) layers -> two Dense(100, relu) layers ->
    Dense(NUM_SYMBOLS, softmax). It is compiled with Adam(lr=0.00001) and
    categorical cross-entropy.

    The previous version created three LSTM layers but wired every one of them
    to `input1`, so only the last one was part of the model and the other two
    were built and thrown away. The layers are now chained properly.

    Args:
        numLstmLayers: Number of stacked LSTM layers (>= 1). The default of 1
            reproduces the graph the previous code effectively built, so
            weight files saved from it keep the same layer topology. Pass 3
            for the stack the original three lines appear to have intended;
            such a model has a different topology from the single-LSTM one.

    Returns:
        The compiled Keras `Model`.

    Raises:
        ValueError: If `numLstmLayers` is smaller than 1.
    """
    if numLstmLayers < 1:
        raise ValueError("numLstmLayers must be >= 1, got %r" % (numLstmLayers,))

    # Trying to neglect input
    input1 = Input(batch_shape=(1,MAX_LENGTH,1))
    # TODO: Add noise layer to make output vary
    x = input1
    for _ in range(numLstmLayers):
        # Each layer consumes the previous layer's full output sequence.
        x = LSTM(512, return_sequences=True)(x)
    x = Dense(100, activation='relu')(x)
    x = Dense(100, activation='relu')(x)
    out = Dense(NUM_SYMBOLS, activation='softmax')(x)

    model = Model(inputs=input1, outputs=out)
    model.compile(optimizer=Adam(lr=0.00001),
                loss='categorical_crossentropy')
    return model

In [4]:
# Build a new model and display what it predicts for an all-zero input of
# shape (1, MAX_LENGTH, 1). The bare last expression is the cell's output.
model = getModel()
model.predict(np.zeros((1,MAX_LENGTH,1)))

array([[[ 0.05882353,  0.05882353,  0.05882353,  0.05882353,  0.05882353,
          0.05882353,  0.05882353,  0.05882353,  0.05882353,  0.05882353,
          0.05882353,  0.05882353,  0.05882353,  0.05882353,  0.05882353,
          0.05882353,  0.05882353]]], dtype=float32)

In [5]:
# Same all-zero probe as the previous cell, run a second time on the same model
# so the two displayed outputs can be compared.
model.predict(np.zeros((1,MAX_LENGTH,1)))

array([[[ 0.05882353,  0.05882353,  0.05882353,  0.05882353,  0.05882353,
          0.05882353,  0.05882353,  0.05882353,  0.05882353,  0.05882353,
          0.05882353,  0.05882353,  0.05882353,  0.05882353,  0.05882353,
          0.05882353,  0.05882353]]], dtype=float32)

In [6]:
# Run a single gradient step directly on the Keras model and display the loss.
# X is an all-zero input of shape (1, MAX_LENGTH, 1); Y is an all-ones target
# of shape (1, MAX_LENGTH, NUM_SYMBOLS).
# NOTE(review): the target marks every symbol as 1 at every step, so it is not
# a one-hot/probability target for categorical cross-entropy -- this looks like
# a smoke test of train_on_batch rather than meaningful training; confirm.
X = np.zeros((1,MAX_LENGTH,1))
Y = np.ones((1,MAX_LENGTH,NUM_SYMBOLS))
model.train_on_batch(X, Y)

48.164627

In [4]:
# Reward configuration for the generated expressions. The target expression and
# the variable names come from the constants cell; the remaining arguments are
# the tunable weights/offset. How each weight enters the final reward is defined
# in RewardCalculator (not visible in this notebook).
rewardCalculator = RewardCalculator(
    correctExpression=CORRECT_EXPRESSION,
    parameters=ALLOWED_PARAMETERS,
    functionDifferenceRewardWeight=0.0,
    compilableRewardWeight=0.60,
    lengthRewardWeight=-0.01,
    foundMathSymbolWeight=0.1,
    foundVariableWeight=0.5,
    rewardOffset=-0.7,
    usingFile=False,
)

In [15]:
# Start from a newly built network and wrap it in the policy-gradient trainer,
# passing the vocabulary, the sequence length limit and the reward calculator
# defined in the earlier cells.
model = getModel()
pgModel = PolicyGradientModel(
    model=model,
    allowedSymbol=ALLOWED_SYMBOLS,
    numSymbol=NUM_SYMBOLS,
    maxLength=MAX_LENGTH,
    rewardCalculator=rewardCalculator,
    learningRate=1e-13,
    fileName="State_test_3_forced.hdf5",
)

In [16]:
# Load previously saved weights into the wrapped model before training.
# The file loaded here ("State_test_2_forced.hdf5") is not the same as the
# fileName given to PolicyGradientModel ("State_test_3_forced.hdf5").
# Alternative weight file, currently disabled:
#pgModel.loadWeight("State_test_2.hdf5")
pgModel.loadWeight("State_test_2_forced.hdf5")

In [17]:
# Start policy-gradient training with an all-ones input of shape (1, 1, 1).
# The recorded run below was stopped by hand (KeyboardInterrupt).
# NOTE(review): getModel() declares batch_shape=(1, MAX_LENGTH, 1) while this
# input is (1, 1, 1) -- confirm how PolicyGradientModel.train uses `input`.
pgModel.train(input=np.ones((1,1,1)))

Epoch: 0	Loss: 2.47103006631	Example Output: 2*XX3-8Y+2	Example Reward:  1
Prob
[  2.55333573e-01   2.53931075e-01   4.91455384e-02   4.90979664e-02
   4.91366573e-02   4.88683693e-02   4.93287630e-02   4.91089933e-02
   4.87434193e-02   4.88245562e-02   4.90750298e-02   4.94060889e-02
   4.89241009e-11   4.85153237e-11   4.45415684e-11   4.76276137e-11
   4.27490994e-11]
[  3.52182750e-09   3.48363316e-09   1.25044277e-02   1.25958892e-02
   1.25582498e-02   1.24609377e-02   1.26955332e-02   1.25244837e-02
   1.24948546e-02   1.25658521e-02   1.28065264e-02   1.25212139e-02
   1.81918472e-01   1.83945790e-01   1.81688726e-01   1.82060972e-01
   1.44658104e-01]
[ 0.19724341  0.19276561  0.04793767  0.04779501  0.04697831  0.04722007
  0.04747212  0.04734098  0.04670906  0.04767372  0.04811729  0.04782397
  0.02694512  0.02709053  0.0268984   0.02703024  0.02695851]
[ 0.16418417  0.16730429  0.04818559  0.06011325  0.05997161  0.05650619
  0.0529634   0.04672733  0.05469271  0.05399899 

Epoch: 51	Loss: 1.93701172054	Example Output: X+23*Y-9-*	Example Reward:  1
Epoch: 52	Loss: 1.85429149002	Example Output: 6*Y/+**3X-	Example Reward:  1
Epoch: 53	Loss: 1.87836688608	Example Output: Y1X+/40+*1	Example Reward:  -0.3
Epoch: 54	Loss: 1.96584148526	Example Output: Y/X1++	Example Reward:  1
Epoch: 55	Loss: 1.69349134147	Example Output: X	Example Reward:  1
Epoch: 56	Loss: 1.88596393049	Example Output: Y*XY-/*/X5	Example Reward:  1
Epoch: 57	Loss: 1.72434827805	Example Output: Y	Example Reward:  1
Epoch: 58	Loss: 1.87543080777	Example Output: X**6Y55*/+	Example Reward:  1
Epoch: 59	Loss: 1.81462081045	Example Output: 5	Example Reward:  1
Epoch: 60	Loss: 1.82124833703	Example Output: Y//X1+88X	Example Reward:  1
Saving Weight


KeyboardInterrupt: 

In [None]:
# Draw sequences from the current policy and print each one as a string.
NUM_SAMPLES = 100  # how many sequences to print

# Built once outside the loop: a NumPy array can be indexed by the whole
# predicted sequence at once, which a plain list cannot.
symbolLookup = np.array(ALLOWED_SYMBOLS)

for i in range(NUM_SAMPLES):
    # Only the first returned value (the sequence used as indices into the
    # symbol array) is needed; the second is discarded.
    seq, _ = pgModel.predictOutputSequence(input=np.ones((1,1,1)))
    print(''.join(symbolLookup[seq]))