In [1]:
import json
from product_names import one_word_furniture, two_word_furniture, three_word_furniture


In [2]:
# Read the input entries from data/output.json.
# The 'utf-8-sig' codec drops a leading byte-order mark if the file has one.
output_file_path = 'data/output.json'
with open(output_file_path, mode='r', encoding='utf-8-sig') as json_file:
    output_data = json.loads(json_file.read())

# Extract product names from text_content
def extract_products(text):
    """Collect the known furniture names that occur in ``text``.

    The text is split on whitespace and scanned left to right. At every
    position the longest phrase is tried first (three words, then two, then
    one) against ``three_word_furniture``, ``two_word_furniture`` and
    ``one_word_furniture`` respectively. A hit consumes all of its words, so
    matches never overlap.

    Comparison is done on the lower-cased phrase, and the lower-cased phrase
    is what gets returned. Words are not stripped of punctuation, so a token
    such as ``"sofa,"`` is only found if the vocabulary contains it verbatim.

    Args:
        text: The string to scan.

    Returns:
        A list of matched product names, lower-cased, in order of appearance
        (duplicates are kept).
    """
    tokens = text.split()
    total = len(tokens)

    # Window sizes paired with their vocabulary, longest first.
    lookups = (
        (3, three_word_furniture),
        (2, two_word_furniture),
        (1, one_word_furniture),
    )

    found = []
    pos = 0
    while pos < total:
        for size, vocabulary in lookups:
            if pos + size > total:
                # Not enough words left for a phrase of this length.
                continue
            candidate = ' '.join(tokens[pos:pos + size]).lower()
            if candidate in vocabulary:
                found.append(candidate)
                pos += size
                break
        else:
            # Nothing matched at this position; move on by one word.
            pos += 1
    return found

# One record per input entry: the original url and text_content plus the
# product names detected in that text.
modified_output_data = [
    dict(
        url=record['url'],
        text_content=record['text_content'],
        products=extract_products(record['text_content']),
    )
    for record in output_data
]

# Save the url / text_content / products records to a new JSON file
new_output_file_path = 'data/produse_in_urls.json'
with open(new_output_file_path, mode='w', encoding='utf-8') as json_file:
    json.dump(modified_output_data, json_file, indent=4)




In [3]:
# Read back the url / text_content / products records from produse_in_urls.json
produse_file_path = 'data/produse_in_urls.json'
with open(produse_file_path, mode='r', encoding='utf-8-sig') as json_file:
    produse_data = json.loads(json_file.read())

# Add the corresponding tag to each product
def annotate_products(products):
    """Pair every product name with a BIO-style annotation string.

    A one-word name is annotated ``"B-Product"``. Any other name gets
    ``"B-Product, "`` followed by one ``"I-Product"`` per additional word,
    joined with ``", "`` (e.g. ``"B-Product, I-Product"`` for two words).

    Args:
        products: Iterable of product-name strings.

    Returns:
        A list of ``{"name": ..., "annotation": ...}`` dicts, one per input
        name, in the same order.
    """
    def _label(name):
        # Build the annotation string for a single product name.
        word_count = len(name.split())
        if word_count == 1:
            return "B-Product"
        continuation = ", ".join("I-Product" for _ in range(word_count - 1))
        return "B-Product, " + continuation

    return [{"name": name, "annotation": _label(name)} for name in products]

# Swap each entry's plain product list for its annotated form; entries that
# have no 'products' key are left as they are.
for record in produse_data:
    if 'products' not in record:
        continue
    record['products'] = annotate_products(record['products'])

# Write the entries, now holding each product together with its annotation,
# to a separate JSON file
annotated_produse_file_path = 'data/produse_in_urls_annotated.json'
with open(annotated_produse_file_path, mode='w', encoding='utf-8') as json_file:
    json.dump(produse_data, json_file, indent=4)


In [4]:

# Read the annotated entries from produse_in_urls_annotated.json
input_file_path = 'data/produse_in_urls_annotated.json'
with open(input_file_path, mode='r', encoding='utf-8-sig') as json_file:
    data = json.loads(json_file.read())

# Generate one tag per word of the text, in a form suitable for training the NER model
def generate_tags(text_content, products):
    """Produce one BIO tag per whitespace-separated word of ``text_content``.

    Words covered by a product name receive ``"B-<type>"`` (first word) and
    ``"I-<type>"`` (remaining words); every other word receives ``"O"``.
    ``<type>`` is the part after the last ``-`` of the product's annotation
    label (``"Product"`` for ``"B-Product"``).

    Matching rules:
      * Case-insensitive. ``extract_products`` stores names lower-cased while
        the text keeps its original casing, so an exact comparison would miss
        every capitalised occurrence.
      * Longest name first, so ``"sofa bed"`` is not split by a shorter
        ``"sofa"`` entry that happens to come earlier in ``products``.
        Among names of equal length the order in ``products`` is kept.
      * Products whose name contains no words are ignored; an empty name
        would otherwise match without consuming any word and loop forever.

    Args:
        text_content: The text to tag; tokenised with ``str.split()``.
        products: List of dicts with a ``"name"`` (words separated by
            whitespace) and an ``"annotation"`` (labels separated by
            ``", "``, e.g. ``"B-Product, I-Product"``).

    Returns:
        A list of tag strings with exactly one tag per word of
        ``text_content``.
    """
    words = text_content.split()
    lowered_words = [word.lower() for word in words]

    # Pre-process each product once: lower-cased name tokens plus the entity
    # type carried by each of its annotation labels.
    candidates = []
    for product in products:
        name_tokens = product["name"].lower().split()
        if not name_tokens:
            continue
        labels = product["annotation"].split(', ')
        entity_types = [label.split('-')[-1] for label in labels]
        candidates.append((name_tokens, entity_types))
    # Stable sort: longer names first, original order preserved for ties.
    candidates.sort(key=lambda candidate: len(candidate[0]), reverse=True)

    tags = []
    i = 0
    while i < len(words):
        for name_tokens, entity_types in candidates:
            span = len(name_tokens)
            if lowered_words[i:i + span] == name_tokens:
                # Emit exactly one tag per matched word so tags stay aligned
                # with words even if the annotation has a different number of
                # labels than the name has words.
                for offset in range(span):
                    if offset < len(entity_types):
                        entity = entity_types[offset]
                    else:
                        entity = entity_types[0]
                    prefix = "B-" if offset == 0 else "I-"
                    tags.append(prefix + entity)
                i += span
                break
        else:
            tags.append("O")
            i += 1
    return tags

# Build the final records: each keeps the url and text_content of its entry
# and gains the per-word tag list computed from the entry's products.
modified_data = [
    dict(
        url=record['url'],
        text_content=record['text_content'],
        tags=generate_tags(record['text_content'], record['products']),
    )
    for record in data
]


# Store the tagged records in a new JSON file
output_file_path = 'data/annotated_output.json'
with open(output_file_path, mode='w', encoding='utf-8') as json_file:
    json.dump(modified_data, json_file, indent=4)




