In [None]:
import ast

import pandas as pd

# CSV of artifacts to categorize; the code below reads its 'TAG' and 'ArtifactID' columns.
input_csv = 'TAGS_noCategorizeds.csv'
# Category definition file parsed by load_categories: a line ending in ':' names a
# category, and the following lines starting with '#' are the tags that belong to it.
categories_txt = 'Cat_TAGs.txt'
# Destination CSV for the single category assigned to each artifact.
output_csv = 'categorized_artifacts.csv'


def load_categories(txt_file):
    """Parse a category definition file into a ``{tag: category}`` dict.

    The file is read as UTF-8, one entry per line (surrounding whitespace is
    ignored):

    * a line ending in ``:`` starts a new category, named by the text before
      the colon;
    * a line starting with ``#`` is a tag assigned to the most recently seen
      category (the leading ``#`` is kept as part of the tag).

    Tags that appear before any category, or under a category whose name is
    empty, are ignored. If a tag is listed more than once, the last category
    wins. All other lines are skipped.
    """
    tag_to_category = {}
    active_category = None

    with open(txt_file, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            entry = raw_line.strip()

            # A header line switches the category for every tag that follows.
            if entry.endswith(':'):
                active_category = entry[:-1]
                continue

            if active_category and entry.startswith('#'):
                tag_to_category[entry] = active_category

    return tag_to_category

# Build the tag -> category lookup from the category definition file.
category_mapping = load_categories(categories_txt)

# Load the artifacts table that the rest of this cell categorizes.
data = pd.read_csv(input_csv)

def map_tags_to_categories(tags, mapping=None):
    """Translate an iterable of tags into the categories they belong to.

    Args:
        tags: Iterable of tag values. Tags that are not keys of the lookup
            are dropped.
        mapping: Optional ``{tag: category}`` dict. When omitted, the
            module-level ``category_mapping`` is used, which is the original
            behaviour; passing it explicitly removes the dependency on
            notebook state.

    Returns:
        A list with one category per recognised tag, in the order the tags
        were given. Duplicates are kept, so the list can be used for
        frequency counting.
    """
    if mapping is None:
        # Fall back to the lookup built at module level by load_categories.
        mapping = category_mapping
    return [mapping[tag] for tag in tags if tag in mapping]

# Parse every TAG cell and map its tags to categories.
# NOTE(review): this used eval(), which would execute any code embedded in the
# CSV. ast.literal_eval only accepts Python literals. This assumes each TAG cell
# holds the text of a plain list literal such as "['#a', '#b']" -- confirm
# against the data before relying on it.
data['CategoryList'] = data['TAG'].apply(lambda x: map_tags_to_categories(ast.literal_eval(x)))

# One row per (original row, category); rows with no recognised tag get NaN.
data = data.explode('CategoryList')

data = data.rename(columns={'CategoryList': 'Category'})

grouped = data.groupby('ArtifactID')

results = []

for artifact, group in grouped:
    # mode() ignores NaN and returns the most frequent values in sorted order,
    # so ties are resolved by taking the first value in that order. An empty
    # result means the artifact had no recognised tag and is left out.
    mode_value = group['Category'].mode()
    if not mode_value.empty:
        results.append({'ArtifactID': artifact, 'Category': mode_value.iloc[0]})

# Pin the columns so the CSV still has a header when no artifact was categorized.
result_df = pd.DataFrame(results, columns=['ArtifactID', 'Category'])

result_df.to_csv(output_csv, index=False)
print(f"File generated successfully: {output_csv}")


In [None]:
import csv
from collections import Counter

def contar_artifacts_por_categoria(input_file, output_file):
    """Count how many artifacts fall into each category and save the tally.

    Args:
        input_file: Path of a UTF-8 CSV whose first row is a header and whose
            second column holds the category of each artifact.
        output_file: Path of the CSV to write. It gets a ``category,amount``
            header followed by one row per category, most frequent first
            (ties keep the order in which categories were first seen).

    Rows with fewer than two columns (blank or truncated lines) are skipped,
    and an input file with no rows at all produces a header-only output
    instead of raising.
    """
    with open(input_file, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        # The default keeps a completely empty file from raising StopIteration.
        next(reader, None)
        categoria_contagem = Counter(row[1] for row in reader if len(row) >= 2)

    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['category', 'amount'])
        writer.writerows(categoria_contagem.most_common())


# NOTE(review): this reads 'categorized_artifacts_oficial_V1.csv', not the
# 'categorized_artifacts.csv' written by the first cell -- presumably a curated
# copy of that output; confirm where it comes from.
input_file = 'categorized_artifacts_oficial_V1.csv'
# Per-category tally written by contar_artifacts_por_categoria (columns: category, amount).
output_file = 'category_count.csv'

contar_artifacts_por_categoria(input_file, output_file)

print(f"File generated successfully: {output_file}")


In [None]:
import csv
import random

# Function to calculate percentage and perform stratified sampling
def analyze_and_sample(input_file, output_file, sample_size, seed=None):
    """Report each category's share of the total and its proportional sample size.

    Args:
        input_file: Path of a UTF-8 CSV with a header row, the category name
            in the first column and an integer artifact count in the second.
        output_file: Path of the CSV to write, with the columns
            ``category, amount, percentage, sampled``.
        sample_size: Total number of artifacts the sample should contain.
            Each category is allotted ``round(count / total * sample_size)``.
        seed: Optional seed for the shuffle. When omitted, the module-level
            ``random`` state is used, as before.

    Returns:
        None. The totals, percentages and per-category sample sizes are
        printed. If no valid count is found (including an empty input file),
        an error message is printed and no output file is written.

    Notes:
        * Rows with fewer than two columns, or a non-integer count, are skipped.
        * Counts of a category that appears on several rows are added up, so
          the percentages always refer to the same total.
        * Rounding can make the allotments add up to more than ``sample_size``.
          The pooled allotments are shuffled and cut to ``sample_size``; the
          ``sampled`` column says whether the category survived that cut.
    """
    category_count = {}
    total_count = 0

    # Read the input CSV and accumulate the total and per-category counts.
    with open(input_file, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        # Skip the header; the default avoids StopIteration on an empty file.
        next(reader, None)
        for row in reader:
            if len(row) < 2:
                continue
            category = row[0]
            try:
                count = int(row[1])
            except ValueError:
                print(f"Skipping row with invalid number: {row}")
                continue

            total_count += count
            # Accumulate rather than overwrite so repeated categories stay
            # consistent with total_count.
            category_count[category] = category_count.get(category, 0) + count

    # Guard against division by zero below.
    if total_count == 0:
        print("Error: Total count of artifacts is zero.")
        return

    print(f"Total count of artifacts: {total_count}")

    category_percentages = {category: (count / total_count) * 100 for category, count in category_count.items()}

    print("\nPercentages by category:")
    for category, percentage in category_percentages.items():
        print(f"{category}: {percentage:.2f}%")

    # Proportional allocation: one list entry per allotted sample slot.
    stratified_sample = []
    print("\nPerforming stratified sampling:")
    for category, count in category_count.items():
        sample_size_category = round((count / total_count) * sample_size)
        stratified_sample.extend([category] * sample_size_category)
        print(f"Category: {category} | Count: {count} | Sample size: {sample_size_category}")

    # Use a private generator only when a seed is given, so unseeded calls
    # keep drawing from the shared module-level state.
    rng = random.Random(seed) if seed is not None else random
    rng.shuffle(stratified_sample)
    # A set makes the per-category membership test below constant time.
    sampled_categories = set(stratified_sample[:sample_size])

    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['category', 'amount', 'percentage', 'sampled'])  # Header
        for category, count in category_count.items():
            writer.writerow([category, count, category_percentages[category], category in sampled_categories])

    print(f"\nAnalysis complete. Results saved to {output_file}")

# Input file with category and number of artifacts.
# NOTE(review): this used to read 'categoria_count.csv', which no cell visible
# in this notebook writes; the category-count cell writes 'category_count.csv'.
# Pointing at that file lets a fresh Restart & Run All use the tally just
# produced. Confirm no separately maintained 'categoria_count.csv' was intended.
input_file = 'category_count.csv'
output_file = 'analyzed_and_sampled.csv'  # Output CSV file path
sample_size = 378  # Sample size to select

analyze_and_sample(input_file, output_file, sample_size)

In [None]:
import csv
import random

# Function to perform stratified sampling and generate a new CSV with selected artifacts
def stratified_sampling(input_file, output_file, sample_size, category_samples, seed=None):
    """Randomly pick a fixed number of artifacts from each category.

    Args:
        input_file: Path of a UTF-8 CSV with a header row, the artifact in the
            first column and its category in the second.
        output_file: Path of the CSV to write, with the columns
            ``artifact, category``.
        sample_size: Not used by the selection (the per-category quotas decide
            how many artifacts are drawn); kept so existing calls keep working.
        category_samples: Dict mapping a category name to the number of
            artifacts to draw from it. Categories in the input that are not
            keys of this dict are ignored; names must match exactly.
        seed: Optional seed for reproducible draws. When omitted, the
            module-level ``random`` state is used, as before.

    Returns:
        None. The selection is written to ``output_file`` grouped by category
        in the order of ``category_samples``, and a summary line is printed.

    Rows with fewer than two columns (blank or truncated lines) are skipped.
    When a category has fewer artifacts than requested, all of them are taken
    and a warning is printed so the shortfall does not go unnoticed.
    """
    # Use a private generator only when a seed is given, so unseeded calls
    # keep drawing from the shared module-level state.
    rng = random.Random(seed) if seed is not None else random

    # Bucket the artifacts of every requested category.
    category_artifacts = {category: [] for category in category_samples}

    with open(input_file, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        # Skip the header; the default avoids StopIteration on an empty file.
        next(reader, None)
        for row in reader:
            if len(row) < 2:
                continue
            artifact, category = row[0], row[1]
            if category in category_artifacts:
                category_artifacts[category].append(artifact)

    # Draw each category's quota, capped by what is actually available.
    sampled_artifacts = []
    for category, sample_count in category_samples.items():
        available_artifacts = category_artifacts[category]
        take = min(sample_count, len(available_artifacts))
        if take < sample_count:
            print(
                f"Warning: category '{category}' has only {len(available_artifacts)} "
                f"artifact(s); {sample_count} requested."
            )
        for artifact in rng.sample(available_artifacts, take):
            sampled_artifacts.append((artifact, category))

    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['artifact', 'category'])  # Header
        writer.writerows(sampled_artifacts)

    print(f"Stratified sampling complete. {len(sampled_artifacts)} artifacts selected and saved to {output_file}")

# Input and output files
input_file = 'categorized_artifacts_oficial_V1.csv'  # Categorized artifacts CSV (artifact in the first column, category in the second)
output_file = 'stratified_sampled_artifacts.csv'  # Output CSV file path

# Number of artifacts to draw from each category. The values add up to 378,
# the sample_size used by the proportional-allocation cell, so these are
# presumably the per-category sample sizes that cell printed -- confirm.
# stratified_sampling matches these keys exactly against the category column.
category_samples = {
    "Others": 85,
    "Software Construction": 71,
    "Computing Foundations": 67,
    "Software Architecture": 44,
    "Software Configuration Management": 33,
    "Software Quality": 16,
    "Software Engineering Operations": 13,
    "Software Engineering Management": 12,
    "Software Utilities": 10,
    "Software Design": 8,
    "Software Security": 7,
    "Mathematical Foundations": 6,
    "Software Domain": 4,
    "Software Engineering Economics": 1,
    # NOTE(review): spelled differently from "Software Architecture" above;
    # presumably mirrors a misspelled category present in the data -- confirm.
    "Software Architeture": 1,
    "Software Maintenance": 0
}

sample_size = sum(category_samples.values())  # Total number of artifacts to sample

# Perform the stratified sampling and save the result to the output CSV
stratified_sampling(input_file, output_file, sample_size, category_samples)


In [None]:
import csv
import random

# Function to randomly select 5 rows from each category
def random_sample_per_category(input_file, output_file, sample_size=5, seed=None):
    """Randomly pick up to ``sample_size`` artifacts from every category.

    Args:
        input_file: Path of a UTF-8 CSV with a header row, the artifact in the
            first column and its category in the second.
        output_file: Path of the CSV to write, with the columns
            ``artifact, category``.
        sample_size: Maximum number of artifacts to keep per category.
            Categories with that many artifacts or fewer are kept whole, in
            file order.
        seed: Optional seed for reproducible draws. When omitted, the
            module-level ``random`` state is used, as before.

    Returns:
        None. The selection is written to ``output_file`` grouped by category
        in order of first appearance, and a summary line is printed.

    Rows with fewer than two columns (blank or truncated lines) are skipped,
    and an input file with no rows produces a header-only output.
    """
    # Use a private generator only when a seed is given, so unseeded calls
    # keep drawing from the shared module-level state.
    rng = random.Random(seed) if seed is not None else random

    # Group artifacts by category, preserving first-seen category order.
    category_artifacts = {}
    with open(input_file, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        # Skip the header; the default avoids StopIteration on an empty file.
        next(reader, None)
        for row in reader:
            if len(row) < 2:
                continue
            category_artifacts.setdefault(row[1], []).append(row[0])

    sampled_rows = []
    for category, artifacts in category_artifacts.items():
        if len(artifacts) <= sample_size:
            # Not enough artifacts to sample from: keep them all.
            sampled = artifacts
        else:
            sampled = rng.sample(artifacts, sample_size)
        sampled_rows.extend((artifact, category) for artifact in sampled)

    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['artifact', 'category'])  # Header
        writer.writerows(sampled_rows)

    print(f"Random sampling complete. {len(sampled_rows)} rows saved to {output_file}")

# Input and output file paths
input_file = 'categorized_artifacts_oficial_V1.csv'  # Categorized artifacts CSV (artifact in the first column, category in the second)
output_file = 'random_sampled_per_category.csv'  # Output CSV file path

# Perform random sampling with the function's default of 5 artifacts per category
random_sample_per_category(input_file, output_file)