# Transformer model for converting English sentences into Flutter UI widgets

In these lines of code, we'll train **Seq2seq Transformer** model to convert natural English sentences into Flutter UI widget code.

Our generated dataset has around 175000 English sentences and around 175000 Flutter widget code

The goal of this experiment is totally for research purposes, We're not going to support all Flutter widgets for the current phase and also we are using some custom Widget instead of the regular widget to make things easier



In [1]:
!git clone https://github.com/TahaDouaji/English-to-Flutter-widget.git

Cloning into 'English-to-Flutter-widget'...
remote: Enumerating objects: 35, done.[K
remote: Counting objects: 100% (35/35), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 35 (delta 15), reused 25 (delta 9), pack-reused 0[K
Unpacking objects: 100% (35/35), done.


In [2]:
%cd English-to-Flutter-widget/

/content/English-to-Flutter-widget


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
from torch.utils.tensorboard import SummaryWriter
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator


# Prepare Dataset

In [4]:
import json
dataset = []
with open("data_set.json", 'r') as file:
    data_json = file.read()

    array_of_obj = json.loads(data_json)
    for it in array_of_obj:
        dataset.append(it)

print(len(dataset))

175632


In [5]:
from random import shuffle
shuffle(dataset)

In [6]:
dataset[:10]

[{'sentence': 'Row parent of add a string with font Color Colors.value and fontSize value and value and textAlign TextAlign.value',
  'widget': 'Row( children:[ Text ( fontColor: Colors.value, fontSize: value, value, textAlign: TextAlign.value )  ],)'},
 {'sentence': 'build a label with value and text Align TextAlign.value and font Size value and fontColor Colors.value',
  'widget': 'Text ( value, textAlign: TextAlign.value, fontSize: value, fontColor: Colors.value ) '},
 {'sentence': 'write a title has text Align TextAlign.value and fontSize value and fontColor Colors.value and value inside a Row',
  'widget': 'Row( children:[ Text ( textAlign: TextAlign.value, fontSize: value, fontColor: Colors.value, value )  ],)'},
 {'sentence': 'write a text with text Align TextAlign.value and value and fontSize value and font Color Colors.value',
  'widget': 'Text ( textAlign: TextAlign.value, value, fontSize: value, fontColor: Colors.value ) '},
 {'sentence': 'Column parent of create a container

# Tokenizers

For English sentences, we're going to use Spacy as our input Tokenizer
For the output Tokenizer, we'll build our own custom tokenizer. We didn't find a suitable tokenizer for Dart/Flutter so we'll be using Python's default[tokenize](https://docs.python.org/3/library/tokenize.html) for now and it will be changed later on.

In [7]:
!python -m spacy download en

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 5.0 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [8]:
from tokenize import tokenize, untokenize
import io


def tokenize_flutter_code(str_code):
    fluttere_tokens = list(tokenize(io.BytesIO(str_code.encode('utf-8')).readline))
    return [it.string for it in fluttere_tokens if len(it.string) > 0]

In [9]:
tokenize_flutter_code("Container(color:Color.red)")

['utf-8', 'Container', '(', 'color', ':', 'Color', '.', 'red', ')']

In [10]:
from torchtext.legacy import data
import spacy
import numpy as np
import pandas as pd
import numpy as np
import random
import math
import time

In [11]:
Input = Field(tokenize='spacy',
            init_token='<sos>', 
            eos_token='<eos>', 
            lower=True)

Output = Field(tokenize = tokenize_flutter_code,
                      init_token='<sos>', 
                    eos_token='<eos>', 
                    lower=False)

In [12]:
fields = [('Input', Input),('Output', Output)]

# Create Dataset

In [None]:
dataset_df = pd.DataFrame(dataset)


np.random.seed(0)
msk = np.random.rand(len(dataset_df)) < 0.85


train_df = dataset_df[msk]
val_df = dataset_df[~msk]

val_msk = np.random.rand(len(pd.DataFrame(train_df))) < 0.85
test_df = train_df[~val_msk]
train_df = train_df[val_msk]

SEED = 1234

random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
train_example = []
val_example = []
test_example = []

train_expansion_factor = 1
for j in range(train_expansion_factor):
  for i in range(train_df.shape[0]):
      try:
          ex = data.Example.fromlist([train_df.sentence[i], train_df.widget[i]], fields)
          train_example.append(ex)
      except:
          pass

for i in range(val_df.shape[0]):
    try:
        ex = data.Example.fromlist([val_df.sentence[i], val_df.widget[i]], fields)
        val_example.append(ex)
    except:
        pass  

for i in range(test_df.shape[0]):
    try:
        ex = data.Example.fromlist([test_df.sentence[i], test_df.widget[i]], fields)
        test_example.append(ex)
    except:
        pass       

In [None]:
len(train_example)

# Creating vocabulary using torchtext

In [14]:
train_data = data.Dataset(train_example, fields)
valid_data =  data.Dataset(val_example, fields)
test_data =  data.Dataset(test_example, fields)

In [15]:
Input.build_vocab(train_data, max_size=10000, min_freq=2)
Output.build_vocab(train_data, max_size=10000, min_freq=2)

# Transformer class model

In [16]:
class Transformer(nn.Module):
    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
    ):
        super(Transformer, self).__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)

        self.device = device
        self.transformer = nn.Transformer(
            embedding_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
            forward_expansion,
            dropout,
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        src_mask = src.transpose(0, 1) == self.src_pad_idx

        # (N, src_len)
        return src_mask.to(self.device)

    def forward(self, src, trg):
        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape

        src_positions = (
            torch.arange(0, src_seq_length)
            .unsqueeze(1)
            .expand(src_seq_length, N)
            .to(self.device)
        )

        trg_positions = (
            torch.arange(0, trg_seq_length)
            .unsqueeze(1)
            .expand(trg_seq_length, N)
            .to(self.device)
        )

        embed_src = self.dropout(
            (self.src_word_embedding(src) + self.src_position_embedding(src_positions))
        )
        embed_trg = self.dropout(
            (self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions))
        )

        src_padding_mask = self.make_src_mask(src)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
            self.device
        )

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )
        out = self.fc_out(out)
        return out


In [17]:
from utils import translate_sentence
from utils import bleu
from utils import translate_sentence_with_values
from utils import save_checkpoint

In [18]:
# We're ready to define everything we need for training our Seq2Seq model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

load_model = True
save_model = True

# Training hyperparameters
num_epochs = 5
learning_rate = 3e-4
batch_size = 32

# Model hyperparameters
src_vocab_size = len(Input.vocab)
trg_vocab_size = len(Output.vocab)
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.10
max_len = 100
forward_expansion = 4
src_pad_idx = Input.vocab.stoi["<pad>"]

# Tensorboard to get nice loss plot
writer = SummaryWriter("runs/loss_plot")
step = 0

# Create our iterators using BucketIterator

In [19]:
train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train_data, valid_data, []), batch_size = batch_size, 
                                                                sort_key = lambda x: len(x.Input),
                                                                sort_within_batch=True, device = device)

model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training

In [20]:
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=10, verbose=True
)

pad_idx = Input.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

sentence = "Build a container with width 33 and height 44"

In [21]:
num_epochs = 15
for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    if save_model:
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
        save_checkpoint(checkpoint)

    model.eval()
    translated_sentence = translate_sentence_with_values(
        model, sentence, Input, Output, device, max_length=50
    )

    print(f"Translated sentence: \n {translated_sentence}")
    model.train()
    losses = []

    for batch_idx, batch in enumerate(train_iterator):
        # Get input and targets and get to cuda
        inp_data = batch.Input.to(device)
        target = batch.Output.to(device)

        # Forward prop
        output = model(inp_data, target[:-1, :])
        
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()

        loss = criterion(output, target)
        losses.append(loss.item())

        # Back prop
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)


        optimizer.step()
        
        writer.add_scalar("Training loss", loss, global_step=step)
        step += 1

    mean_loss = sum(losses) / len(losses)
    scheduler.step(mean_loss)

[Epoch 0 / 15]
=> Saving checkpoint
Translated sentence: 
 None


ZeroDivisionError: ignored

In [None]:
score = bleu(test_data[1:500], model, Input, Output, device)
print(f"Bleu score {score * 100:.2f}")

Bleu score 99.06


In [None]:
import re
model.eval()
asks = [
        "create a box with color Color.red",
        'create a text with "this is my text inside row"'
        "draw a box with width 24",
        "build a box with width 22 and height 80",
        "build a box with height 44 and width 12",
        'write a text with "This is me" with color Color.red',
        'build a text with "Hello" and textSize 22',
        "build a box with width value inside center",
]

for it in asks:
  dot_values = []
  for dot_val in re.findall("\.[a-z]+", it):
    dot_values.append(dot_val)
    it = it.replace(dot_val, ".value")
  str_values = []

  pattern = r'"([A-Za-z0-9 ]*)"'
  for str_val in re.findall(pattern, it):
    str_values.append(str_val)
    it = it.replace(str_val, "value")

  code = ''.join(translate_sentence(model, it, Input, Output, device, max_length=50)).replace('<eos>','').replace('utf-8','')
  # print(code)
  for idx, dot_val in enumerate(re.findall(".value", code)):
    if len(dot_values) > idx:
      code = code.replace(".value", dot_values[idx])
  
  for idx, str_val in enumerate(re.findall("value", code)):
    if len(str_values) > idx:
      code = code.replace("value", f'"{str_values[idx]}"')

  print(code)

Container(color:Colors.red)
Container(margin:EdgeInsets.all("this is my text inside row"),margin:EdgeInsets.all("this is my text inside row"))
Container(width:value,height:value)
Container(height:value,width:value)
Container(child:[CustomText("This is me",fontColor:Colors.red)],)
CustomText(fontSize:"Hello","Hello")
Center(child:Container(width:value))


In [None]:
# !cp -i /content/English-to-Flutter-widget/my_checkpoint.pth.tar /content/drive/MyDrive/FlutterGenerator