In [1]:
import pandas as pd
import re

def clean_product_announcements(input_file, output_file,
                                generic_product_names=None,
                                false_positives=None):
    """
    Clean the product announcements CSV file by removing false positives.

    The cleaning runs in four stages, each of which prints how many rows it
    removed:

    1. Drop rows whose ``product_description`` is missing or whitespace-only.
    2. Drop rows whose ``new_product`` is one of ``generic_product_names``.
    3. Drop rows whose ``new_product`` is one of ``false_positives``.
    4. Drop duplicate (``company_name``, ``new_product``) pairs, compared
       case-insensitively, keeping the row with the greatest ``filing_time``.

    Stages 2 and 3 use exact matching (case- and whitespace-sensitive).
    The result is sorted by ``filing_time`` in descending order, written to
    ``output_file`` without the index, and returned.

    Args:
        input_file (str): Path to the original CSV file. It must contain the
            columns ``product_description``, ``new_product``,
            ``company_name`` and ``filing_time``.
        output_file (str): Path to save the cleaned CSV file
        generic_product_names (list-like, optional): Product names treated as
            generic in stage 2. Defaults to the built-in list below.
        false_positives (list-like, optional): Product names treated as known
            false positives in stage 3. Defaults to the built-in list below.

    Returns:
        pandas.DataFrame: The cleaned records (original row labels are kept).

    Raises:
        KeyError: If one of the required columns is absent from the CSV.
    """
    if generic_product_names is None:
        generic_product_names = [
            'New Product', 'New Products', 'Acquisition', 'The Offering',
            'Fikes Acquisition', 'CryoWorks Acquisition',
            'emolument packages', 'June 18', 'New Brands', 'New Product Lines',
        ]
    if false_positives is None:
        false_positives = [
            "Eli Lilly", "Ad Tech Platform", "WINREVAIR in the U",
            "VOLT Plating System", "StarCoder2",
            "Workday, Inc. Executive Severance and Change in Control Policy, as amended",
            "Private Investment Institute",
            "five-year", "Optical-Connectivity Products for Generative AI",
            "New Operating Structure",
            "EA SPORTS College Football 25", "Square Kiosk", "QR Code Product",
            "Repurchase Transaction",
            "AirTouch N30i", "Highly Integrated Dual Eligible Special Needs Plan",
            "Credit Agreement",
            "convenient, low-cost pet and animal pharmacy solution", "TOP 10",
            "Health Payment Accounts",
            "Botox Cosmetic", "MGM Collection with Marriott Bonvoy",
            "Secondary Offering", "New Product Introductions",
            "Exchange Agreements", "New LNG Agreements", "Three New LNG Agreements",
        ]

    print(f"Reading file: {input_file}")

    # Read the CSV file
    df = pd.read_csv(input_file)

    # Print initial info
    original_count = len(df)
    print(f"Original dataset: {original_count} records")

    # 1. Remove entries with empty descriptions
    before_count = len(df)
    df = df.dropna(subset=['product_description'])
    # astype(str) keeps the .str accessor usable when the column was parsed
    # as a non-string dtype (e.g. float64 when every description is missing);
    # NaN values are already gone, so no 'nan' strings are introduced here.
    df = df[df['product_description'].astype(str).str.strip() != '']
    after_count = len(df)
    print(f"Removed {before_count - after_count} entries with empty descriptions")

    # 2. Remove entries with generic or non-specific product names
    before_count = len(df)
    df = df[~df['new_product'].isin(generic_product_names)]
    after_count = len(df)
    print(f"Removed {before_count - after_count} entries with generic product names")

    # 3. Remove specific false positives identified in the analysis
    before_count = len(df)
    df = df[~df['new_product'].isin(false_positives)]
    after_count = len(df)
    print(f"Removed {before_count - after_count} specific false positives")

    # 4. Remove duplicate entries (case-insensitive)
    before_count = len(df)

    # Sort by filing_time (most recent first) so the newest entry of each
    # duplicate group comes first. A stable sort makes the survivor among
    # rows with equal filing_time deterministic.
    df = df.sort_values('filing_time', ascending=False, kind='stable')

    # Build the lowercase comparison keys in a separate frame instead of
    # adding temporary columns to df, so an input column that happens to be
    # named like a helper column is never overwritten or dropped.
    dedupe_keys = pd.DataFrame({
        'company': df['company_name'].str.lower(),
        'product': df['new_product'].str.lower(),
    })
    df = df[~dedupe_keys.duplicated()]

    after_count = len(df)
    print(f"Removed {before_count - after_count} case-insensitive duplicate entries")

    # No second sort is needed: dropping rows preserves the descending
    # filing_time order established above.

    # Save the cleaned data to a new CSV file
    print(f"Saving cleaned data to: {output_file}")
    df.to_csv(output_file, index=False)

    print("Cleaning complete!")
    print(f"Started with {original_count} records, ended with {len(df)} records")
    print(f"Removed a total of {original_count - len(df)} entries")

    return df

if __name__ == "__main__":
    # Source CSV and destination for the cleaned copy; both are relative to
    # the current working directory.
    input_file, output_file = (
        "product_announcements.csv",
        "product_announcements_cleaned.csv",
    )

    # Run the cleaning pipeline and keep the resulting frame for inspection.
    cleaned_data = clean_product_announcements(input_file, output_file)

Reading file: product_announcements.csv
Original dataset: 250 records
Removed 15 entries with empty descriptions
Removed 13 entries with generic product names
Removed 27 specific false positives
Removed 60 case-insensitive duplicate entries
Saving cleaned data to: product_announcements_cleaned.csv
Cleaning complete!
Started with 250 records, ended with 135 records
Removed a total of 115 entries
