## Test pipeline to convert SVG to DST(csv) file 

Preprocess the dataset:
1. Convert all svg tags to path tags
2. Convert all curves to cubic Bézier curves
3. Train a model to predict the DST file from the SVG file

Preprocess the dataset: done separately for now

Data Prep

In [1]:
import os

# Dataset locations. The defaults are the original local paths; set the
# SVG2DST_INPUT_DIR / SVG2DST_OUTPUT_DIR environment variables to run the
# notebook on another machine without editing this cell.
INPUT_DIR = os.environ.get(
    "SVG2DST_INPUT_DIR",
    "D:/PES/Internship-CDSAML/Data/Numbers_Data/Modified_one-100_SVG",
)
OUTPUT_DIR = os.environ.get(
    "SVG2DST_OUTPUT_DIR",
    "D:/PES/Internship-CDSAML/Data/Numbers_Data/one-100 DST",
)

In [2]:
import os
from bs4 import BeautifulSoup
import tqdm

In [3]:
def extract_paths(svg_file):
  """Extracts the path data of every <path> element in an SVG file.

  Args:
      svg_file (str): Path to the SVG file.

  Returns:
      list: The 'd' attribute string of each <path> element that has one,
      in the order BeautifulSoup's find_all returns them. Empty if the file
      contains no such element.
  """
  # Read raw bytes so the XML parser can pick the encoding from the file's
  # own declaration instead of the platform's default text encoding.
  with open(svg_file, 'rb') as f:
    svg_data = f.read()

  soup = BeautifulSoup(svg_data, 'xml')

  # d=True keeps only <path> elements that actually carry a 'd' attribute,
  # so a path without geometry is skipped instead of raising KeyError.
  return [path['d'] for path in soup.find_all('path', d=True)]

In [4]:
def get_file_paths(folder_path):
    """Recursively lists every file under a folder in a deterministic order.

    Args:
        folder_path (str): Directory to walk.

    Returns:
        list: One path per file, built by joining the walked directory with
        the file name (so paths are only absolute if folder_path is).
        Within each directory, files come first in sorted order, followed
        by the contents of its sub-directories in sorted order. Empty if
        the folder does not exist or contains no files.
    """
    file_paths = []
    for root, directories, files in os.walk(folder_path):
        # os.walk yields names in whatever order the OS reports them.
        # Sorting the directory list in place fixes the traversal order,
        # and sorting the files fixes the order within each directory, so
        # index-based pairing of two listings is reproducible.
        directories.sort()
        for filename in sorted(files):
            file_paths.append(os.path.join(root, filename))
    return file_paths

In [5]:
# Collect every file found under the SVG input directory and show the first
# one as a quick check that the directory was read.
input_files = get_file_paths(INPUT_DIR)
print(input_files[0])

D:/PES/Internship-CDSAML/Data/Numbers_Data/Modified_one-100_SVG\Numbers Rich 1_100-00.svg


In [6]:
# Accumulator for the path data extracted from each SVG file.
svg_paths = []

In [7]:
# Build the list from scratch (one list of path strings per input file) so
# that re-running this cell cannot append a second copy of every entry.
svg_paths = [extract_paths(file) for file in input_files]

In [8]:
# Sanity check: show the path data extracted from the first SVG file.
print(svg_paths[0])

['M 0.06 12.57 C 0.06 12.57 30.62 12.57 30.62 12.57 C 30.62 12.57 0.07 63.25 0.07 63.25 C 0.07 63.25 0 63.37 0 63.37 C 0 63.37 0.07 74.2 0.07 74.2 C 0.07 74.2 43.9 74.2 43.9 74.2 C 43.9 74.2 43.9 63.04 43.9 63.04 C 43.9 63.04 13.02 63.04 13.02 63.04 C 13.02 63.04 43.9 12.21 43.9 12.21 C 43.9 12.21 43.9 1.41 43.9 1.41 C 43.9 1.41 0.06 1.41 0.06 1.41 C 0.06 1.41 0.06 12.57 0.06 12.57 C 0.06 12.57 0.06 12.57 0.06 12.57', 'M 54.2 74.2 C 54.2 74.2 94.62 74.2 94.62 74.2 C 94.62 74.2 94.62 63.04 94.62 63.04 C 94.62 63.04 65.36 63.04 65.36 63.04 C 65.36 63.04 65.36 43.34 65.36 43.34 C 65.36 43.34 91.1 43.34 91.1 43.34 C 91.1 43.34 91.1 32.17 91.1 32.17 C 91.1 32.17 65.36 32.17 65.36 32.17 C 65.36 32.17 65.36 12.57 65.36 12.57 C 65.36 12.57 94.62 12.57 94.62 12.57 C 94.62 12.57 94.62 1.41 94.62 1.41 C 94.62 1.41 54.2 1.41 54.2 1.41 C 54.2 1.41 54.2 74.2 54.2 74.2 C 54.2 74.2 54.2 74.2 54.2 74.2', 'M 129.28 1.41 C 129.28 1.41 104.14 1.41 104.14 1.41 C 104.14 1.41 104.14 74.2 104.14 74.2 C 104.14

We need to merge all the paths of each file into a single string.

In [9]:
# Collapse each file's list of path strings into one string.
# Entries that are already strings are left untouched, so re-running this
# cell does not iterate over them character by character and corrupt them.
for i in range(len(svg_paths)):
    if isinstance(svg_paths[i], str):
        continue
    # Every path is followed by a single space, which reproduces the merged
    # format used so far (including the trailing space).
    svg_paths[i] = "".join(path + " " for path in svg_paths[i])
print(svg_paths[0])

M 0.06 12.57 C 0.06 12.57 30.62 12.57 30.62 12.57 C 30.62 12.57 0.07 63.25 0.07 63.25 C 0.07 63.25 0 63.37 0 63.37 C 0 63.37 0.07 74.2 0.07 74.2 C 0.07 74.2 43.9 74.2 43.9 74.2 C 43.9 74.2 43.9 63.04 43.9 63.04 C 43.9 63.04 13.02 63.04 13.02 63.04 C 13.02 63.04 43.9 12.21 43.9 12.21 C 43.9 12.21 43.9 1.41 43.9 1.41 C 43.9 1.41 0.06 1.41 0.06 1.41 C 0.06 1.41 0.06 12.57 0.06 12.57 C 0.06 12.57 0.06 12.57 0.06 12.57 M 54.2 74.2 C 54.2 74.2 94.62 74.2 94.62 74.2 C 94.62 74.2 94.62 63.04 94.62 63.04 C 94.62 63.04 65.36 63.04 65.36 63.04 C 65.36 63.04 65.36 43.34 65.36 43.34 C 65.36 43.34 91.1 43.34 91.1 43.34 C 91.1 43.34 91.1 32.17 91.1 32.17 C 91.1 32.17 65.36 32.17 65.36 32.17 C 65.36 32.17 65.36 12.57 65.36 12.57 C 65.36 12.57 94.62 12.57 94.62 12.57 C 94.62 12.57 94.62 1.41 94.62 1.41 C 94.62 1.41 54.2 1.41 54.2 1.41 C 54.2 1.41 54.2 74.2 54.2 74.2 C 54.2 74.2 54.2 74.2 54.2 74.2 M 129.28 1.41 C 129.28 1.41 104.14 1.41 104.14 1.41 C 104.14 1.41 104.14 74.2 104.14 74.2 C 104.14 74.2 11

In [10]:
import pyembroidery as pyemb

In [11]:
def read_dst_commands(dst_file):
    """Reads an embroidery file and returns its stitch list.

    Args:
        dst_file (str): Path to the embroidery (DST) file.

    Returns:
        list: The `stitches` attribute of the pattern returned by
        pyembroidery's `read`.

    Raises:
        ValueError: If pyembroidery returns no pattern for the file
            (e.g. it has no reader for the file's format).
    """
    pattern = pyemb.read(dst_file)
    # Fail with the offending file name instead of an opaque
    # "'NoneType' object has no attribute 'stitches'".
    if pattern is None:
        raise ValueError(f"pyembroidery could not read {dst_file!r}")
    return pattern.stitches

In [12]:
# Collect every file found under the DST output directory.
output_files = get_file_paths(OUTPUT_DIR)

In [13]:
# Sanity check: show the first DST file that was found.
print(output_files[0])

D:/PES/Internship-CDSAML/Data/Numbers_Data/one-100 DST\00-ZERO.DST


In [14]:
# Read the stitch list of every DST file, in the same order as output_files.
stitches = [read_dst_commands(dst_path) for dst_path in output_files]

In [15]:
# Sanity check: show the stitch list read from the first DST file.
print(stitches[0]) 

[[-66, 19, 1], [-132, 38, 1], [-198, 57, 1], [-264, 76, 1], [-330, 95, 1], [-396, 115, 0], [-369, 114, 0], [-344, 113, 0], [-318, 112, 0], [-292, 111, 0], [-266, 110, 0], [-293, 110, 0], [-319, 111, 0], [-345, 111, 0], [-371, 111, 0], [-397, 111, 0], [-394, 91, 0], [-380, 69, 0], [-365, 46, 0], [-351, 23, 0], [-336, 1, 0], [-320, -24, 0], [-304, -49, 0], [-288, -73, 0], [-272, -98, 0], [-274, -109, 0], [-299, -109, 0], [-325, -109, 0], [-350, -109, 0], [-375, -109, 0], [-400, -108, 0], [-400, -93, 0], [-403, -86, 0], [-403, -116, 0], [-403, -129, 0], [-403, -86, 0], [-399, -129, 0], [-399, -86, 0], [-395, -129, 0], [-395, -86, 0], [-391, -129, 0], [-391, -86, 0], [-387, -129, 0], [-387, -86, 0], [-383, -129, 0], [-383, -86, 0], [-379, -129, 0], [-379, -86, 0], [-375, -129, 0], [-375, -86, 0], [-371, -129, 0], [-371, -86, 0], [-367, -129, 0], [-367, -86, 0], [-363, -129, 0], [-363, -86, 0], [-359, -129, 0], [-359, -86, 0], [-355, -129, 0], [-355, -86, 0], [-351, -129, 0], [-351, -86, 0]

We need to merge all the stitches of each file into a single string.

In [16]:
# Prototype of the stitch formatting on the first design only: each inner
# list of numbers becomes one comma-joined string.
# NOTE(review): this expects stitches[0] to still be a list of lists, i.e.
# it must run before the cell that flattens `stitches` into strings.
string_list = [','.join(map(str, sublist)) for sublist in stitches[0]]
print(string_list)

['-66,19,1', '-132,38,1', '-198,57,1', '-264,76,1', '-330,95,1', '-396,115,0', '-369,114,0', '-344,113,0', '-318,112,0', '-292,111,0', '-266,110,0', '-293,110,0', '-319,111,0', '-345,111,0', '-371,111,0', '-397,111,0', '-394,91,0', '-380,69,0', '-365,46,0', '-351,23,0', '-336,1,0', '-320,-24,0', '-304,-49,0', '-288,-73,0', '-272,-98,0', '-274,-109,0', '-299,-109,0', '-325,-109,0', '-350,-109,0', '-375,-109,0', '-400,-108,0', '-400,-93,0', '-403,-86,0', '-403,-116,0', '-403,-129,0', '-403,-86,0', '-399,-129,0', '-399,-86,0', '-395,-129,0', '-395,-86,0', '-391,-129,0', '-391,-86,0', '-387,-129,0', '-387,-86,0', '-383,-129,0', '-383,-86,0', '-379,-129,0', '-379,-86,0', '-375,-129,0', '-375,-86,0', '-371,-129,0', '-371,-86,0', '-367,-129,0', '-367,-86,0', '-363,-129,0', '-363,-86,0', '-359,-129,0', '-359,-86,0', '-355,-129,0', '-355,-86,0', '-351,-129,0', '-351,-86,0', '-347,-129,0', '-347,-86,0', '-343,-129,0', '-343,-86,0', '-339,-129,0', '-339,-86,0', '-335,-129,0', '-335,-86,0', '-331,

In [17]:
# Flatten every design's stitch list into one string: the numbers of a
# stitch are comma-joined, and stitches are separated by single spaces.
for i in tqdm.tqdm(range(len(stitches))):
    # Already-flattened entries are skipped so that re-running this cell
    # does not split the string into characters and corrupt the data.
    if isinstance(stitches[i], str):
        continue
    stitches[i] = ' '.join(','.join(map(str, sublist)) for sublist in stitches[i])

print(stitches[0])

100%|██████████| 101/101 [00:00<00:00, 720.38it/s]

-66,19,1 -132,38,1 -198,57,1 -264,76,1 -330,95,1 -396,115,0 -369,114,0 -344,113,0 -318,112,0 -292,111,0 -266,110,0 -293,110,0 -319,111,0 -345,111,0 -371,111,0 -397,111,0 -394,91,0 -380,69,0 -365,46,0 -351,23,0 -336,1,0 -320,-24,0 -304,-49,0 -288,-73,0 -272,-98,0 -274,-109,0 -299,-109,0 -325,-109,0 -350,-109,0 -375,-109,0 -400,-108,0 -400,-93,0 -403,-86,0 -403,-116,0 -403,-129,0 -403,-86,0 -399,-129,0 -399,-86,0 -395,-129,0 -395,-86,0 -391,-129,0 -391,-86,0 -387,-129,0 -387,-86,0 -383,-129,0 -383,-86,0 -379,-129,0 -379,-86,0 -375,-129,0 -375,-86,0 -371,-129,0 -371,-86,0 -367,-129,0 -367,-86,0 -363,-129,0 -363,-86,0 -359,-129,0 -359,-86,0 -355,-129,0 -355,-86,0 -351,-129,0 -351,-86,0 -347,-129,0 -347,-86,0 -343,-129,0 -343,-86,0 -339,-129,0 -339,-86,0 -335,-129,0 -335,-86,0 -331,-129,0 -331,-86,0 -327,-129,0 -327,-86,0 -323,-129,0 -323,-86,0 -319,-129,0 -319,-86,0 -315,-129,0 -315,-101,0 -311,-129,0 -314,-86,0 -307,-129,0 -313,-90,0 -303,-129,0 -309,-102,0 -299,-129,0 -312,-86,0 -295,-12




Dump the SVG paths and the stitches to pickle files so they can be reused for training the model later if needed

In [18]:
import pickle

In [19]:
# Persist the prepared SVG path strings and stitch strings for later runs.
# NOTE: pickle files should only ever be loaded back from trusted locations,
# since unpickling can execute arbitrary code.
pipeline_dir = "D:/PES/Internship-CDSAML/Data/Numbers_Data/Test Pipeline"
# Create the target folder if needed so the dump does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(pipeline_dir, exist_ok=True)

with open(f"{pipeline_dir}/svg_paths.pkl", "wb") as f:  # "wb" for binary write mode
  pickle.dump(svg_paths, f)

with open(f"{pipeline_dir}/stitches.pkl", "wb") as f:  # "wb" for binary write mode
  pickle.dump(stitches, f)

Tokenize the SVG paths and DST commands

In [20]:
from transformers import BartTokenizer

In [21]:
# Load the pretrained BART tokenizer; the same tokenizer is used for both
# the SVG path strings (inputs) and the stitch strings (targets).
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

In [22]:
# Tokenize each merged SVG path string into fixed-length model inputs.
# The length is capped at 1024 tokens, the number of positions bart-large
# can embed; the earlier cap of 2048 produced sequences the model cannot
# index. Longer paths are truncated, shorter ones are padded.
svg_max_length = min(tokenizer.model_max_length, 1024)
tokenized_svg = list()
for string in tqdm.tqdm(svg_paths):
    tokens = tokenizer(string, return_tensors="pt", padding='max_length', truncation=True, max_length=svg_max_length)
    tokenized_svg.append(tokens)

100%|██████████| 101/101 [00:05<00:00, 17.69it/s]


In [23]:
# Tokenize each flattened stitch string into fixed-length target sequences.
# The length is capped at 1024 tokens, the number of positions bart-large
# can embed; the earlier cap of 2048 produced sequences the model cannot
# index. Longer stitch strings are truncated, shorter ones are padded.
stitch_max_length = min(tokenizer.model_max_length, 1024)
tokenized_stitches = list()
for string in tqdm.tqdm(stitches):
    tokens = tokenizer(string, return_tensors="pt", padding='max_length', truncation=True, max_length=stitch_max_length)
    tokenized_stitches.append(tokens)

100%|██████████| 101/101 [00:04<00:00, 20.88it/s]


In [24]:
# Sanity check: show the token ids and attention mask of the first SVG input.
print(tokenized_svg[0])

{'input_ids': tensor([[  0, 448, 321,  ...,   1,   1,   1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]])}


In [25]:
# Sanity check: show the token ids and attention mask of the first stitch target.
print(tokenized_stitches[0])

{'input_ids': tensor([[   0,   12, 4280,  ...,    6,  288,    2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}


In [26]:
from transformers import BartForConditionalGeneration, AdamW
import torch




In [27]:
# Load the pretrained BART sequence-to-sequence model that will be fine-tuned
# to map SVG path tokens to stitch tokens.
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")

In [31]:
# Use PyTorch's AdamW: the AdamW class shipped with transformers is
# deprecated in favour of torch.optim.AdamW. eps and weight_decay are set
# explicitly to the values the transformers implementation used by default,
# so the update rule stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 1  # number of full passes over the dataset
batch_size = 4  # number of SVG/DST pairs per optimizer step



In [37]:
# Training loop: fine-tune BART to map tokenized SVG paths to tokenized
# DST stitch sequences.

# The model's learned position embeddings only cover this many positions, so
# batches are clipped to it in case the tokenized sequences are longer.
max_positions = model.config.max_position_embeddings
pad_token_id = tokenizer.pad_token_id

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    num_batches = 0

    for i in range(0, len(tokenized_svg), batch_size):
        batch_inputs = tokenized_svg[i:i+batch_size]
        batch_targets = tokenized_stitches[i:i+batch_size]

        # Each tokenizer call produced tensors of shape (1, seq_len).
        # Concatenating along dim 0 yields the (batch, seq_len) layout the
        # model expects; torch.stack would add a spurious middle dimension.
        input_ids = torch.cat([inp['input_ids'] for inp in batch_inputs], dim=0)[:, :max_positions]
        attention_mask = torch.cat([inp['attention_mask'] for inp in batch_inputs], dim=0)[:, :max_positions]
        labels = torch.cat([tgt['input_ids'] for tgt in batch_targets], dim=0)[:, :max_positions]

        # Padding positions must not contribute to the loss; -100 is the
        # label value the loss ignores.
        labels = labels.masked_fill(labels == pad_token_id, -100)

        optimizer.zero_grad()
        model_outputs = model(input_ids,
                              attention_mask=attention_mask,
                              labels=labels)
        loss = model_outputs.loss
        total_loss += loss.item()
        num_batches += 1

        loss.backward()
        optimizer.step()

    # Each loss is already a per-batch mean, so average over the number of
    # batches, not over the number of samples.
    average_loss = total_loss / max(num_batches, 1)
    print(f"Epoch {epoch+1}, Average Loss: {average_loss}")

# Optional: save the trained model
# model.save_pretrained("D:/PES/Internship-CDSAML/Data/Numbers_Data/Test Pipeline/modelv1.pt")

: 