In [25]:
import pandas as pd

# --- Configuration ---
# Input CSV to read (its first row is treated as the header)
input_filename = 'shadow_width_comparison_simple.csv'
# Output CSV that receives the filtered, sorted and rounded rows
output_filename = 'filtered_and_sorted_data.csv'

# Zero-based position of the column used for filtering (1 = second column)
filter_column_index = 1
# Rows are kept only when the filter column is strictly below this value
threshold_value = 10

# Zero-based position of the column used for sorting (1 = second column)
sort_column_index = 1
# ---------------------


try:
    # Load the whole file; pandas takes the column names from the first row.
    df = pd.read_csv(input_filename)

    # STEP 1: Filter.
    # Translate the positional index into a column name, coerce that column
    # to numbers (unparseable entries become NaN) and keep rows below the
    # threshold. NaN compares as False, so such rows are not kept.
    filter_column_name = df.columns[filter_column_index]
    numeric_values = pd.to_numeric(df[filter_column_name], errors='coerce')
    df[filter_column_name] = numeric_values
    below_threshold = numeric_values < threshold_value
    filtered_df = df.loc[below_threshold].copy()

    # STEP 2: Sort the surviving rows in ascending order of the sort column.
    sort_column_name = df.columns[sort_column_index]
    sorted_df = filtered_df.sort_values(by=sort_column_name, ascending=True)

    # STEP 3: Round every numeric column to 4 decimal places.
    rounded_df = sorted_df.round(4)

    # STEP 4: Write the result; index=False keeps the row index out of the file.
    rounded_df.to_csv(output_filename, index=False)

    # Report what happened in one go.
    summary_lines = [
        "Process complete!",
        f"Original number of rows: {len(df)}",
        f"Number of rows after filtering: {len(filtered_df)}",
        f"Final sorted, filtered, and rounded data saved to '{output_filename}'",
    ]
    print("\n".join(summary_lines))

except FileNotFoundError:
    print(f"Error: The file '{input_filename}' was not found.")
except IndexError:
    print("Error: A specified column index is out of range. Please check your file and settings.")
except Exception as e:
    print(f"An error occurred: {e}")

Process complete!
Original number of rows: 1000
Number of rows after filtering: 1000
Final sorted, filtered, and rounded data saved to 'filtered_and_sorted_data.csv'


In [24]:
import pandas as pd

# --- Configuration ---
# CSV file that holds the original and restored width columns
input_filename = 'shadow_width_comparison_simple.csv'
# CSV file that receives the rows kept by the filter
output_filename = 'filtered_data.csv'

# Fraction of rows to keep, ranked by smallest absolute difference.
# For example, 0.90 keeps the 90% of rows with the smallest difference
# and removes the 10% with the largest difference.
percentile_to_keep = 0.90
# ---------------------


def keep_smallest_differences(frame, keep_fraction,
                              original_column='original_width_mm',
                              restored_column='restored_width_mm'):
    """Keep the rows whose two width columns agree most closely.

    Both columns are coerced to numbers on a copy of ``frame`` (the caller's
    DataFrame is not modified). Rows where either value is missing or cannot
    be parsed are discarded. The absolute difference between the two columns
    is computed for the remaining rows, and rows whose difference is at or
    below the ``keep_fraction`` quantile of those differences are returned.
    Because the comparison is ``<=``, rows tied with the threshold are all
    kept.

    Args:
        frame: DataFrame containing ``original_column`` and ``restored_column``.
        keep_fraction: Quantile in [0, 1] passed to ``Series.quantile``.
        original_column: Name of the column holding the original values.
        restored_column: Name of the column holding the restored values.

    Returns:
        A tuple ``(kept, error_threshold, valid_row_count)`` where ``kept`` is
        the filtered DataFrame (without any helper column), ``error_threshold``
        is the quantile value used as the cut-off, and ``valid_row_count`` is
        the number of rows that had numeric values in both columns.

    Raises:
        KeyError: If either column is absent from ``frame``.
    """
    numeric = frame.copy()
    for column in (original_column, restored_column):
        numeric[column] = pd.to_numeric(numeric[column], errors='coerce')

    # Only rows with two usable numbers can take part in the comparison.
    valid = numeric.dropna(subset=[original_column, restored_column])

    # The error is kept as a separate Series so no temporary column has to be
    # added to (and later removed from) the result.
    difference = (valid[original_column] - valid[restored_column]).abs()
    error_threshold = difference.quantile(keep_fraction)

    kept = valid[difference <= error_threshold].copy()
    return kept, error_threshold, len(valid)


try:
    # Read the CSV file and record its size BEFORE any rows are discarded, so
    # the "original" count in the report is the true size of the input.
    df = pd.read_csv(input_filename)
    original_row_count = len(df)

    filtered_df, error_threshold, valid_row_count = keep_smallest_differences(
        df, percentile_to_keep
    )

    # Save the final result; index=False keeps the row index out of the file.
    filtered_df.to_csv(output_filename, index=False)

    print("Process complete!")
    print(f"Original number of rows: {original_row_count}")
    if valid_row_count != original_row_count:
        # Make silently discarded rows visible instead of hiding them.
        print(f"Rows dropped because a width was missing or non-numeric: {original_row_count - valid_row_count}")
    print(f"Kept the {percentile_to_keep*100:.0f}% of rows with the smallest difference (error <= {error_threshold:.4f}).")
    print(f"Number of rows after filtering: {len(filtered_df)}")
    print(f"Final filtered data saved to '{output_filename}'")

except FileNotFoundError:
    print(f"Error: The file '{input_filename}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

Process complete!
Original number of rows: 1000
Kept the 90% of rows with the smallest difference (error <= 0.3197).
Number of rows after filtering: 900
Final filtered data saved to 'filtered_data.csv'


In [23]:
import pandas as pd
from reportlab.lib.pagesizes import letter, landscape
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle
from reportlab.lib import colors

# Load the CSV that should be rendered as a PDF table
df = pd.read_csv("filtered_data.csv")

# reportlab's Table takes a list of rows: the header row first, then one
# list per DataFrame row.
data = [df.columns.tolist()] + df.values.tolist()

# Create the PDF document (landscape US Letter pages)
pdf_file = "crack_data.pdf"
pdf = SimpleDocTemplate(pdf_file, pagesize=landscape(letter))

# Create the table. repeatRows=1 repeats the header row at the top of every
# page when the table is split across pages; without it only the first page
# shows the column names.
table = Table(data, repeatRows=1)

# Style: grey header with light bold text, centred cells, thin black grid
style = TableStyle([
    ('BACKGROUND', (0,0), (-1,0), colors.grey),
    ('TEXTCOLOR',(0,0),(-1,0),colors.whitesmoke),
    ('ALIGN',(0,0),(-1,-1),'CENTER'),
    ('FONTNAME', (0,0), (-1,0), 'Helvetica-Bold'),
    ('BOTTOMPADDING', (0,0), (-1,0), 8),
    ('GRID', (0,0), (-1,-1), 0.25, colors.black),
])
table.setStyle(style)

# Build the PDF with the table as its only flowable
pdf.build([table])

print(f"PDF saved as {pdf_file}")


PDF saved as crack_data.pdf
