In [None]:
import ast

import pandas as pd

# File locations for the tag-categorisation step.
input_csv = 'TAGS_noCategorizeds.csv'        # table read with pandas (TAG / ArtifactID columns are used)
categories_txt = 'Categorias para TAGs.txt'  # category headers followed by their '#tags'
output_csv = 'categorized_artifacts.csv'     # destination of the per-artifact category


def load_categories(txt_file):
    """Parse a category definition file into a ``{tag: category}`` dict.

    The file is read as UTF-8, one entry per line, with surrounding whitespace
    ignored. A line ending in ``:`` opens a new category (its name is the line
    without the final colon). A line starting with ``#`` is a tag that belongs
    to the most recently opened category.

    Tags that appear before any category header, or after a header whose name
    is empty (a bare ``:``), are ignored. When a tag is listed more than once,
    the last occurrence wins.

    Args:
        txt_file: Path of the category definition file.

    Returns:
        Dict mapping each tag (including its leading ``#``) to a category name.
    """
    tag_to_category = {}
    active_category = None

    with open(txt_file, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            entry = raw_line.strip()

            # Header check comes first: a '#...' line ending in ':' is a header.
            if entry.endswith(':'):
                active_category = entry[:-1]
                continue

            if active_category and entry.startswith('#'):
                tag_to_category[entry] = active_category

    return tag_to_category

# Tag -> category lookup parsed from the category definition file.
category_mapping = load_categories(categories_txt)

# Table to categorise; later statements read its 'TAG' and 'ArtifactID' columns.
data = pd.read_csv(input_csv)

def map_tags_to_categories(tags, mapping=None):
    """Translate tags into the categories they belong to.

    Tags with no entry in the lookup are skipped. Repeated tags (and different
    tags of the same category) each contribute one entry, so the result can be
    used to count how often every category occurs.

    Args:
        tags: Iterable of tag strings.
        mapping: Optional ``{tag: category}`` dict to look tags up in. When
            omitted, the module-level ``category_mapping`` is used, which keeps
            existing one-argument calls working.

    Returns:
        List of category names, in the order of the recognised tags.
    """
    lookup = category_mapping if mapping is None else mapping
    return [lookup[tag] for tag in tags if tag in lookup]

# Parse every TAG cell (presumably the text form of a Python list of tags —
# confirm against the CSV) and translate it into a list of categories.
# SECURITY NOTE: this previously used eval(), which executes any expression
# stored in the CSV. ast.literal_eval only accepts Python literals; a cell
# holding anything else now raises instead of being executed.
data['CategoryList'] = data['TAG'].apply(
    lambda raw_tags: map_tags_to_categories(ast.literal_eval(raw_tags))
)

# One row per (original row, category); an empty category list becomes NaN.
data = data.explode('CategoryList')

data = data.rename(columns={'CategoryList': 'Category'})

grouped = data.groupby('ArtifactID')

results = []

for artifact, group in grouped:
    # mode() drops NaN by default, so an artifact without any recognised tag
    # yields an empty Series and is left out of the output.
    mode_value = group['Category'].mode()
    if not mode_value.empty:
        # With several equally frequent categories the first returned mode is kept.
        results.append({'ArtifactID': artifact, 'Category': mode_value.iloc[0]})

# Explicit columns keep the CSV header even when no artifact was categorised.
result_df = pd.DataFrame(results, columns=['ArtifactID', 'Category'])

result_df.to_csv(output_csv, index=False)
print(f"File generated successfully: {output_csv}")


File generated successfully: categorized_artifacts.csv


In [None]:
import csv
from collections import Counter

def contar_artifacts_por_categoria(input_file, output_file):
    """Count the rows of a CSV per category and write the totals to a new CSV.

    The first line of ``input_file`` is treated as a header and skipped. For
    every non-empty row after it, the value of the second column is taken as
    the category. The output has the header ``category,amount`` followed by one
    row per category, most frequent first.

    An empty input file produces an output containing only the header.

    Args:
        input_file: Path of the CSV to read (UTF-8).
        output_file: Path of the CSV to write (UTF-8).

    Raises:
        IndexError: If a non-empty data row has fewer than two columns.
    """
    with open(input_file, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        # A bare next() raises StopIteration on an empty file; the default
        # lets that case fall through to an empty count.
        next(reader, None)
        categoria_contagem = Counter(row[1] for row in reader if row)

    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['category', 'amount'])
        writer.writerows(categoria_contagem.most_common())


# Source CSV (category read from its second column) and destination of the counts.
input_file = 'categorized_artifacts_oficial_V1.csv'
output_file = 'category_count.csv'

contar_artifacts_por_categoria(input_file=input_file, output_file=output_file)

print(f"File generated successfully: {output_file}")


In [None]:
import csv
import random

# Function to calculate percentage and perform stratified sampling
def analyze_and_sample(input_file, output_file, sample_size):
    """Compute each category's share of the total and flag the sampled ones.

    Reads a CSV of per-category counts, computes the percentage of the total
    each category represents, distributes ``sample_size`` slots over the
    categories in proportion to their counts, and writes one summary row per
    category to ``output_file``.

    Input layout: the first line is a header. If the header has fewer than
    three columns the rows are read as ``(category, count)``; otherwise the
    category and the count are taken from the second and third columns.
    Blank rows are skipped.

    Args:
        input_file: Path of the CSV with the per-category counts (UTF-8).
        output_file: Path of the CSV to write. Columns: ``category``,
            ``numero`` (the count), ``percentage`` and ``sampled``.
        sample_size: Total number of sample slots to distribute.

    Notes:
        Every category receives ``round(share * sample_size)`` slots, so the
        allocation can add up to slightly more or less than ``sample_size``.
        The slots are shuffled and cut to ``sample_size``; ``sampled`` is True
        when at least one remaining slot belongs to the category. With a total
        count of zero all percentages are 0.0 and nothing is sampled.

    Raises:
        ValueError: If a count cell cannot be converted to ``int``.
        IndexError: If a non-blank row is shorter than the layout requires.
    """
    categories = []
    category_count = {}
    total_count = 0

    # Read the input CSV and calculate total and category counts
    with open(input_file, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        header = next(reader, None)  # None for an empty file instead of StopIteration
        # The counting cell writes two columns (category, amount); files with a
        # leading extra column keep the original positions 1 and 2.
        first_col = 1 if header is not None and len(header) >= 3 else 0
        for row in reader:
            if not row:
                continue
            category = row[first_col]
            count = int(row[first_col + 1])  # The number of artifacts for that category
            total_count += count
            category_count[category] = count
            categories.append((category, count))

    def share_of_total(count):
        # Guarded so that an all-zero input does not divide by zero.
        return count / total_count if total_count else 0.0

    # Calculate the percentage for each category
    category_percentages = {
        category: share_of_total(count) * 100
        for category, count in category_count.items()
    }

    # Stratified sampling: proportional number of slots per category
    stratified_sample = []
    for category, count in categories:
        sample_size_category = round(share_of_total(count) * sample_size)
        stratified_sample.extend([category] * sample_size_category)

    # Shuffle so that any surplus slots are trimmed at random
    random.shuffle(stratified_sample)
    sampled_categories = set(stratified_sample[:sample_size])

    # Write the results (percentage and sampled flag) to a new CSV
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['category', 'numero', 'percentage', 'sampled'])  # Header
        for category, count in category_count.items():
            writer.writerow(
                [category, count, category_percentages[category], category in sampled_categories]
            )

    print(f"Analysis complete. Results saved to {output_file}")

# Per-category counts to analyse, and where the summary is written.
input_file = 'category_count.csv'  # Replace with your CSV file path
output_file = 'analyzed_and_sampled.csv'  # Output CSV file path
sample_size = 378  # Sample size to select

analyze_and_sample(
    input_file=input_file,
    output_file=output_file,
    sample_size=sample_size,
)


In [None]:
import csv
import random

# Function to randomly select rows based on weighted probabilities
def weighted_random_sample(input_file, output_file, sample_size):
    """Draw categories at random, weighted by their counts, and save the draws.

    Reads a CSV of per-category counts and draws ``sample_size`` categories
    with replacement, each with probability proportional to its count. One
    output row ``(category, count)`` is written per draw, so a category can
    appear several times.

    Input layout: the first line is a header. If the header has fewer than
    three columns the rows are read as ``(category, count)``; otherwise the
    category and the count are taken from the second and third columns.
    Blank rows are skipped. If the input holds no data rows, only the output
    header is written.

    Args:
        input_file: Path of the CSV with the per-category counts (UTF-8).
        output_file: Path of the CSV to write. Columns: ``category``, ``numero``.
        sample_size: Number of draws.

    Raises:
        ValueError: If a count cell cannot be converted to ``int``, or if the
            weights are rejected by ``random.choices`` (e.g. they sum to zero).
        IndexError: If a non-blank row is shorter than the layout requires.
    """
    categories = []
    category_weights = []

    # Read the input CSV and prepare the categories and weights
    with open(input_file, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        header = next(reader, None)  # None for an empty file instead of StopIteration
        # The counting cell writes two columns (category, amount); files with a
        # leading extra column keep the original positions 1 and 2.
        first_col = 1 if header is not None and len(header) >= 3 else 0
        for row in reader:
            if not row:
                continue
            category = row[first_col]
            count = int(row[first_col + 1])  # The number of artifacts for that category
            categories.append((category, count))
            category_weights.append(count)

    # Randomly select the specified number of rows based on weights.
    # random.choices cannot draw from an empty population, hence the guard.
    if categories:
        sampled_categories = random.choices(categories, weights=category_weights, k=sample_size)
    else:
        sampled_categories = []

    # Write the selected samples to the output CSV
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['category', 'numero'])  # Header
        writer.writerows(sampled_categories)

    print(f"Random sampling complete. Results saved to {output_file}")

# Input file with category and number of artifacts.
# The counting cell writes 'category_count.csv'; the previous value here,
# 'categoria_count.csv', is not written by any cell visible in this notebook.
input_file = 'category_count.csv'  # Replace with your CSV file path
output_file = 'random_sampled.csv'  # Output CSV file path
sample_size = 378  # Sample size to select

weighted_random_sample(input_file, output_file, sample_size)
