In [5]:
import os
import pandas as pd

def combine_csv_files(folder_path, output_file):
    """
    Combines all CSV files in a given folder into a single CSV file.

    Files are read in sorted name order so the row order of the result is
    reproducible (``os.listdir`` returns entries in arbitrary order). If
    ``output_file`` itself lives inside ``folder_path`` it is skipped, so
    re-running the function does not fold the previous result back into
    the new one.

    Args:
        folder_path (str): Path to the folder containing CSV files.
        output_file (str): Path to the output CSV file (e.g., "thanos.csv").

    Raises:
        ValueError: If the folder contains no CSV files to combine.
    """
    # Resolve once so the comparison below works for relative paths too
    output_abs = os.path.abspath(output_file)

    csv_paths = []
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(folder_path, file_name)

        # A directory that merely ends in ".csv" cannot be read as a table
        if not os.path.isfile(file_path):
            continue

        # Never read our own (possibly stale) output back in
        if os.path.abspath(file_path) == output_abs:
            continue

        csv_paths.append(file_path)

    # pd.concat fails with an opaque message on an empty list; say what is wrong
    if not csv_paths:
        raise ValueError(f"No CSV files found in {folder_path!r}")

    # Concatenate all dataframes into a single DataFrame with a fresh 0..n-1 index
    combined_df = pd.concat(
        (pd.read_csv(path) for path in csv_paths),
        ignore_index=True,
    )

    # Save the combined dataframe to a single CSV file
    combined_df.to_csv(output_file, index=False)

# Example usage: combine every CSV found in "outsourced" into "thanos.csv".
# NOTE(review): both paths are relative, so they resolve against the kernel's
# current working directory — confirm the notebook is launched from the
# directory that contains the "outsourced" folder.
folder_path = "outsourced"  # Replace with the actual folder path
output_file = "thanos.csv"
combine_csv_files(folder_path, output_file)


In [6]:
import pandas as pd

def sort_and_remove_duplicates(csv_file):
    """
    Opens the given CSV file, removes duplicates based on the first column,
    sorts it by that column, and saves the cleaned file over the original.

    Duplicates are dropped *before* sorting so that the surviving row for
    each key is always the first one that appears in the file. Sorting first
    with the default (non-stable) algorithm would leave the choice of
    survivor unspecified.

    Args:
        csv_file (str): Path to the CSV file (e.g., "thanos.csv"). The file
            is overwritten in place.
    """
    # Read the CSV file
    df = pd.read_csv(csv_file)

    # The first column is the de-duplication / sort key (assumed to be 'index')
    key_column = df.columns[0]

    # Keep the first occurrence per key in file order, then order by key.
    # After de-duplication every key is unique, so the sort result is fully
    # determined regardless of the sorting algorithm used.
    df_clean = (
        df.drop_duplicates(subset=key_column, keep="first")
        .sort_values(by=key_column)
    )

    # Save the sorted and de-duplicated DataFrame back to the CSV file
    df_clean.to_csv(csv_file, index=False)

# Example usage: clean "thanos.csv" in place — the function writes its result
# back to the same path, so the original contents are overwritten.
csv_file = "thanos.csv"
sort_and_remove_duplicates(csv_file)


In [7]:
import pandas as pd

def merge_csvs(csv1_file, csv2_file, output_file):
    """
    Merges two CSV files based on the 'index' column and writes the result to a new CSV file.

    Args:
        csv1_file (str): Path to the first CSV file (with 'generated_output').
        csv2_file (str): Path to the second CSV file (with 'image_link', 'group_id', 'entity_name').
        output_file (str): Path to the output CSV file.
    """
    # Load the two CSV files into pandas DataFrames
    df1 = pd.read_csv(csv1_file)
    df2 = pd.read_csv(csv2_file)

    # Merge the two DataFrames on the 'index' column
    merged_df = pd.merge(df1, df2, on='index', how='inner')

    # Save the merged DataFrame to a new CSV file
    merged_df.to_csv(output_file, index=False)

# Example usage: inner-join "infinity.csv" with the test set on 'index'.
# NOTE(review): no cell visible here writes "infinity.csv" (the earlier cells
# produce "thanos.csv") — confirm where this file comes from before running.
csv1_file = "infinity.csv"  # Replace with the path to your first CSV
csv2_file = "resources/dataset/test.csv"  # Replace with the path to your second CSV
output_file = "resultant.csv"  # Output file path; rebinds the notebook-level `output_file` set by the combine cell

merge_csvs(csv1_file, csv2_file, output_file)
