In [1]:
import pandas as pd
import numpy as np
import json
import xml.etree.ElementTree as ET

In [2]:
manufactured = "Ut�ngy�rtott"
manufactured_correct = "Utángyártott"

In [3]:
# Initialize an empty list to hold the JSON objects
data = []

# Read the JSONL file line by line
with open("output.jsonl", 'r', encoding='utf-8') as f:
    for line in f:
        try:
            # Parse the JSON object and append to the list
            data.append(json.loads(line))
        except ValueError as e:
            print(f"Error decoding line: {line}\nError: {e}")

# Convert the list of JSON objects to a DataFrame
df = pd.DataFrame(data)

df.drop("url", axis=1, inplace=True)
df.drop("availability", axis=1, inplace=True)

# Add a new attribute named Originality
df['originality'] = df['product_name'].apply(lambda x: 'manufactured' if manufactured in x else 'original')

# Remove "manufactured" from product_name if it exists and handle extra spaces
df['product_name'] = df['product_name'].apply(
    lambda x: x.replace(manufactured, '').replace('  ', ' ').strip() if manufactured in x else x
)


# Display the first few rows of the dataframe
print(df)

     price            competitor product_name originality
0     5685        onlinetoner.hu   HP N9K06AE    original
1     5690             pcland.hu   HP N9K06AE    original
2     6490               alza.hu   HP N9K06AE    original
3     5590         primatinta.hu   HP N9K06AE    original
4     5685        onlinetoner.hu   HP N9K06AE    original
..     ...                   ...          ...         ...
198  19002  kellekanyagonline.hu   HP T6M15AE    original
199  19032            wincity.hu   HP T6M15AE    original
200  19089                pcx.hu   HP T6M15AE    original
201  19100         totalprint.hu   HP T6M15AE    original
202  19209     homeofficeshop.hu   HP T6M15AE    original

[203 rows x 4 columns]


In [5]:
# Drop duplicates based on product_name
unique_products = df.drop_duplicates(subset='product_name')

# Determine the number of unique products
num_unique_products = unique_products.shape[0]

# Sample size should be the lesser of the number of unique products or the desired sample size
sample_size = min(5, num_unique_products)

# Select the sample size of different product names with prices
sample_products = unique_products.sample(n=sample_size, random_state=1)

sample_products = sample_products.drop("competitor")

print(sample_products)

     price      competitor                    product_name originality
0     5685  onlinetoner.hu                      HP N9K06AE    original
120  17461  onlinetoner.hu                      HP T6M15AE    original
60    4990     tonerek.com  Utángyártott Samsung MLT-D116L    original


In [6]:
sample_products.to_xml("top_5_products.xml")