In [2]:
import pandas as pd
import numpy as np
import random

## Adding Comments

In [5]:
half_adder_1_bit = """
module half_adder (
    input a, b,
    output sum, carry
);
    assign sum = a ^ b;
    assign carry = a & b;
endmodule
"""

full_adder_1_bit = """
module full_adder (
    input a, b, cin,
    output sum, carry
);
    assign sum = a ^ b ^ cin;
    assign carry = (a & b) | (b & cin) | (a & cin);
endmodule
"""

full_adder_32_bit = """
module full_adder_32 (
    input [31:0] a, b,
    input cin,
    output [31:0] sum,
    output carry
);
    wire [31:0] carry_out;
    assign carry_out[0] = cin;

    genvar i;
    generate
        for (i = 0; i < 32; i = i + 1) begin: full_adders
            full_adder fa (
                .a(a[i]),
                .b(b[i]),
                .cin(carry_out[i]),
                .sum(sum[i]),
                .carry(carry_out[i + 1])
            );
        end
    endgenerate
    assign carry = carry_out[32];
endmodule
"""

subtractor_32_bit = """
module subtractor_32 (
    input [31:0] a, b,
    output [31:0] diff,
    output borrow
);
    wire [31:0] b_complement = ~b + 1; // 2's complement of b
    wire [31:0] temp_sum;
    full_adder_32 fa32 (
        .a(a),
        .b(b_complement),
        .cin(1'b0),
        .sum(temp_sum),
        .carry(borrow)
    );
    assign diff = temp_sum;
endmodule
"""

multiplier_1_bit = """
module multiplier_1 (
    input a, b,
    output product
);
    assign product = a & b;
endmodule
"""

multiplier_32_bit = """
module multiplier_32 (
    input [31:0] a, b,
    output [63:0] product
);
    assign product = a * b;
endmodule
"""

and_gate = """
module and_1 (
    input a, b,
    output result
);
    assign result = a & b;
endmodule
"""

or_gate = """
module or_1 (
    input a, b,
    output result
);
    assign result = a | b;
endmodule
"""

not_gate = """
module not_1 (
    input a,
    output result
);
    assign result = ~a;
endmodule
"""

xor_gate = """
module xor_1 (
    input a, b,
    output result
);
    assign result = a ^ b;
endmodule
"""

nand_gate = """
module nand_1 (
    input a, b,
    output result
);
    assign result = ~(a & b);
endmodule
"""

nor_gate = """
module nor_1 (
    input a, b,
    output result
);
    assign result = ~(a | b);
endmodule
"""

mux_1to2 = """
module mux_1to2 (
    input d0, d1, sel,
    output y
);
    assign y = sel ? d1 : d0;
endmodule
"""

mux_1to4 = """
module mux_4to1 (
    input [3:0] d,
    input [1:0] sel,
    output y
);
    assign y = d[sel];
endmodule
"""

mux_1to8 = """
module mux_8to1 (
    input [7:0] d,
    input [2:0] sel,
    output y
);
    assign y = d[sel];
endmodule
"""

mux_1to16 = """
module mux_16to1 (
    input [15:0] d,
    input [3:0] sel,
    output y
);
    assign y = d[sel];
endmodule
"""

mux_1to32 = """
module mux_32to1 (
    input [31:0] d,
    input [4:0] sel,
    output y
);
    assign y = d[sel];
endmodule
"""

mux_1to64 = """
module mux_64to1 (
    input [63:0] d,
    input [5:0] sel,
    output y
);
    assign y = d[sel];
endmodule
"""

demux_2to1 = """
module demux_1to2 (
    input d, sel,
    output y0, y1
);
    assign y0 = ~sel & d;
    assign y1 = sel & d;
endmodule
"""

demux_4to1 = """
module demux_1to4 (
    input d,
    input [1:0] sel,
    output [3:0] y
);
    assign y = (1 << sel) & {4{d}};
endmodule
"""

demux_8to1 = """
module demux_1to8 (
    input d,
    input [2:0] sel,
    output [7:0] y
);
    assign y = (1 << sel) & {8{d}};
endmodule
"""

demux_16to1 = """
module demux_1to16 (
    input d,
    input [3:0] sel,
    output [15:0] y
);
    assign y = (1 << sel) & {16{d}};
endmodule
"""

demux_32to1 = """
module demux_1to32 (
    input d,
    input [4:0] sel,
    output [31:0] y
);
    assign y = (1 << sel) & {32{d}};
endmodule
"""

demux_64to1 = """
module demux_1to64 (
    input d,
    input [5:0] sel,
    output [63:0] y
);
    assign y = (1 << sel) & {64{d}};
endmodule
"""

In [16]:
def comment_lines(input_string):
    """
    Adds '//' at the beginning of random lines in the input string
    """
    lines = input_string.split('\n')
    total_lines = len(lines)
    
    num_comments = random.randint(0, total_lines//2)
    
    for i in range(num_comments):
        line_no = random.randint(0, total_lines-1)
        lines[line_no] = '//' + lines[line_no]
    
    commented_string = '\n'.join(lines)    
    return commented_string

print(f'{comment_lines(half_adder_1_bit)}')


module half_adder (
    input a, b,
//    output sum, carry
);
//    assign sum = a ^ b;
    assign carry = a & b;
endmodule



In [42]:
data = pd.read_csv('/kaggle/input/llm-design-data/df_small.csv')
print(len(data))

L = [half_adder_1_bit, full_adder_1_bit, full_adder_32_bit, subtractor_32_bit, multiplier_1_bit, multiplier_32_bit, and_gate, or_gate, not_gate, 
     xor_gate, nand_gate, nor_gate, mux_1to2, mux_1to4, mux_1to8, mux_1to16, mux_1to32, mux_1to64, demux_2to1, demux_4to1, demux_8to1, demux_16to1, demux_32to1, 
     demux_64to1]

for i in range(2):
    for string in L:
        for j in range(10):
            new_string = comment_lines(string)
            new_df = pd.DataFrame(columns=['Correct','Error'])
            new_df['Correct'] = [string]
            new_df['Error'] = [new_string]
            data = pd.concat([data, new_df], ignore_index=True)
        
print(len(data))
data = data.sample(frac=1).reset_index(drop=True)
data.to_csv('commented.csv', index=False)

980


## Changing variable names

In [55]:
import re
import random

def randomize_variable_names(verilog_code, change_probability=0.5):
    variables = re.findall(r'\b\w+\b', verilog_code)

    keywords = {'module', 'input', 'output', 'wire', 'reg', 'assign', 'always', 'begin', 'end'}
    variables = [var for var in variables if var not in keywords and not var.isdigit()]
    unique_variables = set(variables)

    changed_variables = {}
    for var in unique_variables:
        new_var = var + str(random.randint(0, 9))
        changed_variables[var] = new_var

    def random_replace(match):
        var = match.group(0)
        if var in changed_variables and random.random() < change_probability:
            return changed_variables[var]
        return var

    randomized_verilog_code = re.sub(r'\b\w+\b', random_replace, verilog_code)

    return randomized_verilog_code


L = [half_adder_1_bit, full_adder_1_bit, full_adder_32_bit, subtractor_32_bit, multiplier_1_bit, multiplier_32_bit, and_gate, or_gate, not_gate, 
     xor_gate, nand_gate, nor_gate, mux_1to2, mux_1to4, mux_1to8, mux_1to16, mux_1to32, mux_1to64, demux_2to1, demux_4to1, demux_8to1, demux_16to1, demux_32to1, 
     demux_64to1]

for i in range(2):
    for string in L:
        for j in range(10):
            new_string = randomize_variable_names(string)
            new_df = pd.DataFrame(columns=['Correct','Error'])
            new_df['Correct'] = [string]
            new_df['Error'] = [new_string]
            data = pd.concat([data, new_df], ignore_index=True)
        
print(len(data))
data = data.sample(frac=1).reset_index(drop=True)
data.to_csv('commented.csv', index=False)

1460


## Operator errors

In [73]:
def randomize_operators(verilog_code, change_probability=0.5):
    operators = ['\+', '-', '\*', '/', '%', '&', '\|', '\^', '!', '~', '&&', '\|\|', '==', '!=', '<', '<=', '>', '>=']
    operator_replacements = {
        '+': ['-', '*', '/', '%', '&&', '^'],
        '-': ['+', '*', '/', '%', '&&', '^'],
        '*': ['+', '-', '/', '%', '&&', '^'],
        '/': ['+', '-', '*', '%', '&&', '^'],
        '%': ['+', '-', '*', '/', '&&', '^'],
        '&': ['&', '|', '+', '*', '/', '-'],
        '|': ['&', '|', '+', '*', '/', '-'],
        '^': ['&', '|', '+', '*', '/', '-'],
        '!': ['&', '|', '+', '*', '/', '-'],
        '~': ['&', '|', '+', '*', '/', '-'],
        '&&': ['||', '%%'],
        '||': ['&&', '%%'],
        '==': ['!=', '<', '<=', '>', '>='],
        '!=': ['==', '<', '<=', '>', '>='],
        '<': ['<=', '>', '>='],
        '<=': ['<', '>', '>='],
        '>': ['<', '<=', '>='],
        '>=': ['<', '<=', '>']
    }

    def random_replace_operator(match):
        op = match.group(0)
        if random.random() < change_probability:
            possible_replacements = operator_replacements.get(op, [])
            if possible_replacements:
                return random.choice(possible_replacements)
        return op

    pattern = re.compile('|'.join(operators))
    randomized_verilog_code = pattern.sub(random_replace_operator, verilog_code)
    return randomized_verilog_code


L = [half_adder_1_bit, full_adder_1_bit, full_adder_32_bit, subtractor_32_bit, multiplier_1_bit, multiplier_32_bit, and_gate, or_gate, not_gate, 
     xor_gate, nand_gate, nor_gate, mux_1to2, mux_1to4, mux_1to8, mux_1to16, mux_1to32, mux_1to64, demux_2to1, demux_4to1, demux_8to1, demux_16to1, demux_32to1, 
     demux_64to1]

for i in range(2):
    for string in L:
        for j in range(10):
            new_string = randomize_operators(string)
            new_df = pd.DataFrame(columns=['Correct','Error'])
            new_df['Correct'] = [string]
            new_df['Error'] = [new_string]
            data = pd.concat([data, new_df], ignore_index=True)
        
print(len(data))
data = data.sample(frac=1).reset_index(drop=True)
data.to_csv('small_df.csv', index=False)

1940


In [74]:
data = data.sample(frac=1).reset_index(drop=True)
data.to_csv('small_df.csv', index=False)

## 

## Formatting the dataset

In [5]:
data = pd.read_csv('../data/small_df.csv')
print(len(data))
data.head(2)

1940


Unnamed: 0,Correct,Error
0,"\nmodule not_1 (\n input a,\n output res...","\nmodule not_1 (\n input a,\n output res..."
1,"\nmodule nand_1 (\n input a, b,\n output...","//\nmodule nand_1 (\n input a, b,\n outp..."


In [7]:
base_prompt = """\nBASE PROMPT: You are an expert in Verilog code generation and code correction. Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not write any explanation after the code."""

instruction = """\nINSTRUCT: Correct the logic and syntax of the following Verilog code. Check and correct any instances of wrong or missing commented lines, variable names, module names, and operators."""

end = """CODE: \n"""

In [8]:
data['Base Prompt'] = [base_prompt for i in range(len(data))]
data['Instruction'] = [instruction for i in range(len(data))]
data['End'] = [end for i in range(len(data))]

data.head(2)

Unnamed: 0,Correct,Error,Base Prompt,Instruction,End
0,"\nmodule not_1 (\n input a,\n output res...","\nmodule not_1 (\n input a,\n output res...",\nBASE PROMPT: You are an expert in Verilog co...,\nINSTRUCT: Correct the logic and syntax of th...,CODE: \n
1,"\nmodule nand_1 (\n input a, b,\n output...","//\nmodule nand_1 (\n input a, b,\n outp...",\nBASE PROMPT: You are an expert in Verilog co...,\nINSTRUCT: Correct the logic and syntax of th...,CODE: \n


In [9]:
data = data[['Base Prompt', 'Instruction', 'Error', 'End', 'Correct']]
data.head(2)

Unnamed: 0,Base Prompt,Instruction,Error,End,Correct
0,\nBASE PROMPT: You are an expert in Verilog co...,\nINSTRUCT: Correct the logic and syntax of th...,"\nmodule not_1 (\n input a,\n output res...",CODE: \n,"\nmodule not_1 (\n input a,\n output res..."
1,\nBASE PROMPT: You are an expert in Verilog co...,\nINSTRUCT: Correct the logic and syntax of th...,"//\nmodule nand_1 (\n input a, b,\n outp...",CODE: \n,"\nmodule nand_1 (\n input a, b,\n output..."
