<a href="https://colab.research.google.com/github/RyuichiSaito1/inflation-reddit-usa/blob/main/notebooks/deduplicate_and_summarize_resultsets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the input and
# output paths used by main() all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import os
from pathlib import Path

def read_tsv_files(comments_path, submissions_path):
    """Load the comments and submissions TSV files into DataFrames.

    Both files are parsed as UTF-8, tab-separated text. The comments file is
    read first, then the submissions file; the shape of each frame is printed
    only after both reads have succeeded.

    Args:
        comments_path: Location of the comments TSV file.
        submissions_path: Location of the submissions TSV file.

    Returns:
        tuple: ``(comments_df, submissions_df)`` on success, or
        ``(None, None)`` when either file is missing or any other error
        occurs while loading (the error is printed, not raised).
    """
    tsv_options = {'sep': '\t', 'encoding': 'utf-8'}

    try:
        comments_frame, submissions_frame = [
            pd.read_csv(tsv_path, **tsv_options)
            for tsv_path in (comments_path, submissions_path)
        ]

        print(f"Comments dataset shape: {comments_frame.shape}")
        print(f"Submissions dataset shape: {submissions_frame.shape}")

        return comments_frame, submissions_frame

    except FileNotFoundError as missing:
        print(f"Error: File not found - {missing}")
    except Exception as problem:
        print(f"Error reading files: {problem}")

    # Reached only after one of the handlers above has reported a failure.
    return None, None

In [None]:
def deduplicate_by_body(df, dataset_name):
    """Drop rows whose 'body' text repeats an earlier row and report the counts.

    The first occurrence of each distinct 'body' value is kept. The input
    frame is not modified; surviving rows keep their original index labels.
    Note that pandas treats missing 'body' values as equal to one another, so
    only the first row with a missing body survives.

    Args:
        df: DataFrame containing a 'body' column.
        dataset_name: Label used as the prefix of the printed summary lines.

    Returns:
        DataFrame: A new frame with duplicate-body rows removed.
    """
    unique_rows = df.drop_duplicates(subset=['body'], keep='first')

    rows_before = df.shape[0]
    rows_after = unique_rows.shape[0]
    rows_dropped = rows_before - rows_after

    print(f"{dataset_name} - Initial records: {rows_before}")
    print(f"{dataset_name} - After deduplication: {rows_after}")
    print(f"{dataset_name} - Removed duplicates: {rows_dropped}")

    return unique_rows

In [None]:
def save_tsv_file(df, output_path, filename):
    """Write a DataFrame to ``output_path/filename`` as a UTF-8 TSV file.

    The output directory (including missing parents) is created when needed.
    The file is written tab-separated and without the index column.

    Failures are still reported on stdout rather than raised, but the outcome
    is now returned so a caller can tell a failed write from a successful one
    instead of continuing as if the file existed.

    Args:
        df: DataFrame to write.
        output_path: Directory that should contain the file.
        filename: Name of the file inside ``output_path``.

    Returns:
        bool: True when the file was written, False when an error was caught
        and printed.
    """
    try:
        # Create directory if it doesn't exist
        Path(output_path).mkdir(parents=True, exist_ok=True)

        full_path = os.path.join(output_path, filename)
        df.to_csv(full_path, sep='\t', index=False, encoding='utf-8')

    except Exception as e:
        print(f"Error saving file {filename}: {e}")
        return False

    print(f"Successfully saved: {full_path}")
    return True

In [None]:
def merge_and_analyze_users(comments_df, submissions_df):
    """Stack comments and submissions and summarise activity per author.

    The two frames are concatenated row-wise (comments first) with a fresh
    0..n-1 index. Distinct non-missing values of the 'author' column are
    counted, and the ten most frequent authors are printed together with the
    overall totals.

    Args:
        comments_df: DataFrame of comments; must contain an 'author' column.
        submissions_df: DataFrame of submissions; must contain an 'author'
            column.

    Returns:
        tuple: ``(merged_df, unique_users, user_stats)`` where ``user_stats``
        is a dict with keys 'total_records', 'unique_users' and
        'avg_posts_per_user' (0 when there are no users).
    """
    combined = pd.concat([comments_df, submissions_df], ignore_index=True)
    print(f"Merged dataset shape: {combined.shape}")

    authors = combined['author']
    n_records = len(combined)
    n_users = authors.nunique()

    # Guard the division: a frame with no (non-missing) authors has no average.
    if n_users > 0:
        mean_posts = n_records / n_users
    else:
        mean_posts = 0

    summary = {
        'total_records': n_records,
        'unique_users': n_users,
        'avg_posts_per_user': mean_posts,
    }

    print(f"\nUser Analysis:")
    print(f"Total records in merged dataset: {n_records}")
    print(f"Unique users: {n_users}")
    print(f"Average posts per user: {mean_posts:.2f}")

    print(f"\nTop 10 contributors:")
    # value_counts() sorts by frequency (descending), so head(10) is the top ten.
    for name, n_posts in authors.value_counts().head(10).items():
        print(f"  {name}: {n_posts} posts")

    return combined, n_users, summary

In [None]:
def main():
    """Run the full pipeline: read, deduplicate, save, merge and summarise.

    Steps, in order:
      1. Read the comments and submissions TSV files.
      2. Deduplicate each dataset on its 'body' column.
      3. Save both deduplicated datasets to the output directory.
      4. Merge the two datasets, print per-author statistics and save the
         merged result.

    Takes no arguments and returns None. If either input cannot be read, a
    message is printed and the function returns before any file is written.
    Lines marked with a star are the per-run settings to edit.
    """
    # ☆ Per-run settings: input TSV locations.
    comments_path = '/content/drive/MyDrive/world-inflation/result/tsv/food_comments_results_202508.tsv'
    submissions_path = '/content/drive/MyDrive/world-inflation/result/tsv/food_submissions_results.tsv'

    # Directory that receives every output file (modify as needed).
    output_directory = '/content/drive/MyDrive/world-inflation/result/tsv/'

    rule = "="*60

    print("Starting inflation sentiment analysis data processing...")
    print(rule)

    # Step 1: load both inputs; the reader signals failure with None values.
    print("\n1. Reading TSV files...")
    comments_raw, submissions_raw = read_tsv_files(comments_path, submissions_path)

    if any(frame is None for frame in (comments_raw, submissions_raw)):
        print("Failed to read input files. Exiting.")
        return

    # Only the comments columns are listed here.
    print(f"\nDataset columns: {list(comments_raw.columns)}")

    # Step 2: drop repeated bodies within each dataset separately.
    print("\n2. Deduplicating datasets...")
    comments_unique = deduplicate_by_body(comments_raw, "Comments")
    submissions_unique = deduplicate_by_body(submissions_raw, "Submissions")

    # ☆ Step 3: persist the deduplicated datasets.
    # NOTE(review): the 20260819 suffix does not match the 202508 input file
    # name — confirm the intended date before reusing these names.
    print("\n3. Saving deduplicated datasets...")
    save_tsv_file(comments_unique, output_directory, 'food_comments_final_results_20260819.tsv')
    save_tsv_file(submissions_unique, output_directory, 'food_submissions_final_results_20260819.tsv')

    # Step 4: combine both datasets and report per-author statistics.
    print("\n4. Merging datasets and analyzing users...")
    merged_df, unique_users, user_stats = merge_and_analyze_users(comments_unique, submissions_unique)

    # ☆ Persist the merged dataset.
    save_tsv_file(merged_df, output_directory, 'merged_food_results_20260819.tsv')

    # ☆ Optional (currently disabled): save the user statistics as a one-row TSV.
    # stats_df = pd.DataFrame([user_stats])
    # save_tsv_file(stats_df, output_directory, 'Frugal_users.tsv')

    print("\n" + rule)
    print("Data processing completed successfully!")
    print(f"Output directory: {output_directory}")
    print(f"Unique users identified: {unique_users}")

In [None]:
# Entry point: in a notebook cell __name__ is "__main__", so executing this
# cell runs the whole pipeline defined in main().
if __name__ == "__main__":
    # Run the main processing pipeline
    main()

Starting inflation sentiment analysis data processing...

1. Reading TSV files...
Comments dataset shape: (95731, 8)
Submissions dataset shape: (1639, 8)

Dataset columns: ['created_date', 'subreddit_id', 'id', 'author', 'parent_id', 'body', 'score', 'inflation']

2. Deduplicating datasets...
Comments - Initial records: 95731
Comments - After deduplication: 95729
Comments - Removed duplicates: 2
Submissions - Initial records: 1639
Submissions - After deduplication: 1639
Submissions - Removed duplicates: 0

3. Saving deduplicated datasets...
Successfully saved: /content/drive/MyDrive/world-inflation/result/tsv/food_comments_final_results_20260819.tsv
Successfully saved: /content/drive/MyDrive/world-inflation/result/tsv/food_submissions_final_results_20260819.tsv

4. Merging datasets and analyzing users...
Merged dataset shape: (97368, 8)

User Analysis:
Total records in merged dataset: 97368
Unique users: 60569
Average posts per user: 1.61

Top 10 contributors:
  FoodMod: 3396 posts
  S