In [1]:
import fitz  # PyMuPDF
import pandas as pd
import re
import os

# Path to the Latin America PDF file
pdf_path = r"C:\Users\clint\Desktop\RER\data\Remittance_3\Remitscope_LatAm_central_bank.pdf"

# Check if file exists
if os.path.exists(pdf_path):
    print(f"PDF found: {pdf_path}")
else:
    print(f"PDF not found at: {pdf_path}")
    print("Please check the file path.")

PDF found: C:\Users\clint\Desktop\RER\data\Remittance_3\Remitscope_LatAm_central_bank.pdf


In [2]:
import camelot
import pandas as pd
import re

# Try to extract tables using Camelot from pages 1-7
try:
    # Extract tables from first seven pages
    tables = camelot.read_pdf(pdf_path, pages='1-7', flavor='stream')
    print(f"Number of tables found: {len(tables)}")
    
    if len(tables) > 0:
        for i, table in enumerate(tables):
            print(f"\nTable {i+1}:")
            print(f"Shape: {table.df.shape}")
            print(table.df.head(10))
            print("="*50)
    else:
        print("No tables found with Camelot")
        
except Exception as e:
    print(f"Error with Camelot: {e}")
    print("Let's try manual parsing...")

Number of tables found: 7

Table 1:
Shape: (90, 6)
                                                   0                 1     2  \
0  Indicator:\tB\t-\tCentral\tBank\tremittance\ti...                           
1                                   Sending\tcountry  Receive\tcountry  Date   
2                                        Afghanistan           Ecuador  2021   
3                                                               Mexico  2022   
4                                                               Panama  2022   
5                                            Albania           Ecuador  2021   
6                                                               Mexico  2022   
7                                                               Panama  2022   
8                                    American\tSamoa           Ecuador  2021   
9                                             Angola            Brazil  2022   

             3              4                    5  
0              

In [3]:
# Test both lattice and stream approaches for LatAm PDF
import os

# Create output directory for LatAm data if it doesn't exist
output_dir = r"C:\Users\clint\Desktop\RER\_output\data\Remittance_3\LatAm"
os.makedirs(output_dir, exist_ok=True)

print("Testing both Camelot approaches for LatAm PDF: lattice and stream")
print("="*60)

# Approach 1: Stream flavor (better for tables without clear borders)
print("\n1. TESTING STREAM FLAVOR (Pages 1-7)")
print("-" * 30)
try:
    tables_stream = camelot.read_pdf(pdf_path, pages='1-7', flavor='stream')
    print(f"Stream - Number of tables found: {len(tables_stream)}")
    
    if len(tables_stream) > 0:
        for i, table in enumerate(tables_stream):
            print(f"\nStream Table {i+1}:")
            print(f"Shape: {table.df.shape}")
            print(f"Accuracy: {table.accuracy:.2f}")
            print("Preview (first 5 rows):")
            print(table.df.head())
            
            # Save to CSV
            csv_filename = f"latam_stream_table_{i+1}.csv"
            csv_path = os.path.join(output_dir, csv_filename)
            table.df.to_csv(csv_path, index=False)
            print(f"Saved to: {csv_path}")
            print("-" * 40)
    else:
        print("No tables found with stream flavor")
        
except Exception as e:
    print(f"Error with stream flavor: {e}")

print("\n" + "="*60)

# Approach 2: Lattice flavor (better for tables with clear borders)
print("\n2. TESTING LATTICE FLAVOR (Pages 1-7)")
print("-" * 30)
try:
    tables_lattice = camelot.read_pdf(pdf_path, pages='1-7', flavor='lattice')
    print(f"Lattice - Number of tables found: {len(tables_lattice)}")
    
    if len(tables_lattice) > 0:
        for i, table in enumerate(tables_lattice):
            print(f"\nLattice Table {i+1}:")
            print(f"Shape: {table.df.shape}")
            print(f"Accuracy: {table.accuracy:.2f}")
            print("Preview (first 5 rows):")
            print(table.df.head())
            
            # Save to CSV
            csv_filename = f"latam_lattice_table_{i+1}.csv"
            csv_path = os.path.join(output_dir, csv_filename)
            table.df.to_csv(csv_path, index=False)
            print(f"Saved to: {csv_path}")
            print("-" * 40)
    else:
        print("No tables found with lattice flavor")
        
except Exception as e:
    print(f"Error with lattice flavor: {e}")

print("\n" + "="*60)
print("SUMMARY:")
print(f"Output directory: {output_dir}")
print("Check the CSV files to compare the results from both approaches.")

Testing both Camelot approaches for LatAm PDF: lattice and stream

1. TESTING STREAM FLAVOR (Pages 1-7)
------------------------------
Stream - Number of tables found: 7

Stream Table 1:
Shape: (90, 6)
Accuracy: 99.90
Preview (first 5 rows):
                                                   0                 1     2  \
0  Indicator:\tB\t-\tCentral\tBank\tremittance\ti...                           
1                                   Sending\tcountry  Receive\tcountry  Date   
2                                        Afghanistan           Ecuador  2021   
3                                                               Mexico  2022   
4                                                               Panama  2022   

             3              4                    5  
0                                                   
1        Value           Unit               Source  
2   0.03859066  USD\tmillions  Central\tBank\tData  
3     0.295915  USD\tmillions  Central\tBank\tData  
4  0.0003874

In [4]:
# Load and clean the extracted LatAm data
print("ANALYZING EXTRACTED LATAM DATA")
print("="*50)

# Check for all extracted files in the LatAm directory
import glob

stream_files = glob.glob(os.path.join(output_dir, "latam_stream_table_*.csv"))
lattice_files = glob.glob(os.path.join(output_dir, "latam_lattice_table_*.csv"))

print(f"Found {len(stream_files)} stream tables and {len(lattice_files)} lattice tables")

# Function to clean the dataframes
def clean_latam_remittance_data(df):
    """Clean the extracted LatAm remittance data"""
    clean_df = df.copy()
    
    # Find where the actual data starts (after header rows)
    data_start = 0
    for i, row in clean_df.iterrows():
        # Look for common indicators of data start in remittance tables
        row_str = ' '.join([str(cell) for cell in row if pd.notna(cell)])
        if any(keyword in row_str.lower() for keyword in ['sending', 'country', 'recipient', 'remittance', 'million']):
            # Check if this looks like a header row
            if any(header in row_str.lower() for header in ['sending country', 'recipient country', 'remittances']):
                data_start = i + 1
                break
    
    if data_start > 0:
        clean_df = clean_df.iloc[data_start:].reset_index(drop=True)
    
    # Remove empty rows
    clean_df = clean_df.dropna(how='all').reset_index(drop=True)
    
    # Remove rows that are mostly empty (less than 2 non-null values)
    clean_df = clean_df[clean_df.count(axis=1) >= 2].reset_index(drop=True)
    
    return clean_df

# Process stream files
for i, file_path in enumerate(stream_files):
    print(f"\nProcessing Stream Table {i+1}:")
    df = pd.read_csv(file_path)
    print(f"Raw shape: {df.shape}")
    print("Raw data preview:")
    print(df.head())
    
    # Clean the data
    clean_df = clean_latam_remittance_data(df)
    print(f"\nCleaned shape: {clean_df.shape}")
    print("Cleaned data preview:")
    print(clean_df.head())
    
    # Save cleaned data
    clean_filename = f"latam_stream_table_{i+1}_cleaned.csv"
    clean_path = os.path.join(output_dir, clean_filename)
    clean_df.to_csv(clean_path, index=False)
    print(f"Cleaned data saved to: {clean_path}")
    print("-" * 50)

# Process lattice files if any
for i, file_path in enumerate(lattice_files):
    print(f"\nProcessing Lattice Table {i+1}:")
    df = pd.read_csv(file_path)
    print(f"Raw shape: {df.shape}")
    print("Raw data preview:")
    print(df.head())
    
    # Clean the data
    clean_df = clean_latam_remittance_data(df)
    print(f"\nCleaned shape: {clean_df.shape}")
    print("Cleaned data preview:")
    print(clean_df.head())
    
    # Save cleaned data
    clean_filename = f"latam_lattice_table_{i+1}_cleaned.csv"
    clean_path = os.path.join(output_dir, clean_filename)
    clean_df.to_csv(clean_path, index=False)
    print(f"Cleaned data saved to: {clean_path}")
    print("-" * 50)

print(f"\nAll LatAm files saved in: {output_dir}")

ANALYZING EXTRACTED LATAM DATA
Found 7 stream tables and 0 lattice tables

Processing Stream Table 1:
Raw shape: (90, 6)
Raw data preview:
                                                   0                 1     2  \
0  Indicator:\tB\t-\tCentral\tBank\tremittance\ti...               NaN   NaN   
1                                   Sending\tcountry  Receive\tcountry  Date   
2                                        Afghanistan           Ecuador  2021   
3                                                NaN            Mexico  2022   
4                                                NaN            Panama  2022   

             3              4                    5  
0          NaN            NaN                  NaN  
1        Value           Unit               Source  
2   0.03859066  USD\tmillions  Central\tBank\tData  
3     0.295915  USD\tmillions  Central\tBank\tData  
4  0.000387472  USD\tmillions                 INEC  

Cleaned shape: (89, 6)
Cleaned data preview:
                

In [5]:
# Summary of LatAm extraction results
print("LATAM PDF EXTRACTION SUMMARY")
print("="*50)
print("🇱🇦 Latin America Central Bank PDF Analysis")
print("📄 Source: Remitscope_LatAm_central_bank.pdf")
print("📊 Pages analyzed: 1-7")
print()

# Count the extracted files
stream_count = len(glob.glob(os.path.join(output_dir, "latam_stream_table_*.csv")))
lattice_count = len(glob.glob(os.path.join(output_dir, "latam_lattice_table_*.csv")))

print("✅ Extraction Results:")
print(f"   • Stream flavor: Found {stream_count} tables")
print(f"   • Lattice flavor: Found {lattice_count} tables")
print()

if stream_count > 0:
    print("📊 Stream Tables Summary:")
    for i in range(stream_count):
        file_path = os.path.join(output_dir, f"latam_stream_table_{i+1}.csv")
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            print(f"   • Table {i+1}: {df.shape[0]} rows × {df.shape[1]} columns")

if lattice_count > 0:
    print("📊 Lattice Tables Summary:")
    for i in range(lattice_count):
        file_path = os.path.join(output_dir, f"latam_lattice_table_{i+1}.csv")
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            print(f"   • Table {i+1}: {df.shape[0]} rows × {df.shape[1]} columns")

print()
print("💾 Files created in output directory:")
print("   • Raw extracted tables (latam_stream_table_*.csv)")
print("   • Cleaned tables (latam_stream_table_*_cleaned.csv)")
if lattice_count > 0:
    print("   • Lattice tables (latam_lattice_table_*.csv)")
print()
print(f"📁 Location: {output_dir}")
print()
print("🎯 Next Steps:")
print("   1. Review extracted tables for data quality")
print("   2. Combine tables if they contain related data")
print("   3. Analyze remittance flows for Latin America region")

LATAM PDF EXTRACTION SUMMARY
🇱🇦 Latin America Central Bank PDF Analysis
📄 Source: Remitscope_LatAm_central_bank.pdf
📊 Pages analyzed: 1-7

✅ Extraction Results:
   • Stream flavor: Found 14 tables
   • Lattice flavor: Found 0 tables

📊 Stream Tables Summary:
   • Table 1: 90 rows × 6 columns
   • Table 2: 92 rows × 6 columns
   • Table 3: 91 rows × 6 columns
   • Table 4: 91 rows × 6 columns
   • Table 5: 90 rows × 6 columns
   • Table 6: 91 rows × 6 columns
   • Table 7: 76 rows × 6 columns

💾 Files created in output directory:
   • Raw extracted tables (latam_stream_table_*.csv)
   • Cleaned tables (latam_stream_table_*_cleaned.csv)

📁 Location: C:\Users\clint\Desktop\RER\_output\data\Remittance_3\LatAm

🎯 Next Steps:
   1. Review extracted tables for data quality
   2. Combine tables if they contain related data
   3. Analyze remittance flows for Latin America region
   • Table 7: 76 rows × 6 columns

💾 Files created in output directory:
   • Raw extracted tables (latam_stream_table

In [8]:
# DETAILED ANALYSIS OF LATAM EXTRACTED DATA
print("DETAILED LATAM DATA ANALYSIS")
print("="*60)

# Load and compare the first extracted table (if available)
stream_files = glob.glob(os.path.join(output_dir, "latam_stream_table_*.csv"))

if len(stream_files) > 0:
    # Load first table
    first_table_path = stream_files[0]
    df_raw = pd.read_csv(first_table_path)
    
    # Load cleaned version
    clean_table_path = first_table_path.replace('.csv', '_cleaned.csv')
    if os.path.exists(clean_table_path):
        df_clean = pd.read_csv(clean_table_path)
    else:
        df_clean = clean_latam_remittance_data(df_raw)
    
    print("📊 COMPARISON: RAW vs CLEANED DATA")
    print("-" * 40)
    print("RAW DATA (First 10 rows):")
    print(df_raw.head(10))
    print(f"Raw Shape: {df_raw.shape}")
    print()
    
    print("CLEANED DATA (First 10 rows):")
    print(df_clean.head(10))
    print(f"Cleaned Shape: {df_clean.shape}")
    print()
    
    print("🔍 DATA QUALITY ANALYSIS:")
    print("-" * 40)
    print(f"1. Rows removed during cleaning: {df_raw.shape[0] - df_clean.shape[0]}")
    print(f"2. Columns in dataset: {df_clean.shape[1]}")
    print(f"3. Non-null values per column:")
    for i, col in enumerate(df_clean.columns):
        non_null_count = df_clean[col].count()
        print(f"   Column {i+1}: {non_null_count}/{len(df_clean)} ({non_null_count/len(df_clean)*100:.1f}%)")
    
    print()
    print("📈 SAMPLE DATA PREVIEW:")
    print("-" * 40)
    print("Middle section of cleaned data:")
    mid_point = len(df_clean) // 2
    print(df_clean.iloc[mid_point:mid_point+5])
    
    print()
    print("📋 COLUMN ANALYSIS:")
    print("-" * 40)
    for i, col in enumerate(df_clean.columns):
        sample_values = df_clean[col].dropna().head(3).tolist()
        print(f"Column {i+1} sample values: {sample_values}")

else:
    print("❌ No tables were extracted from the LatAm PDF")
    print("This could mean:")
    print("   • The PDF doesn't contain extractable tables")
    print("   • Tables are in image format (need OCR)")
    print("   • Different extraction parameters needed")
    print("   • Pages 1-7 don't contain the tables")

print()
print("🎯 RECOMMENDATIONS:")
print("-" * 40)
if len(stream_files) > 0:
    print("✅ Extraction successful! Consider:")
    print("   • Validating country names and remittance values")
    print("   • Checking for currency information")
    print("   • Combining multiple tables if they're related")
    print("   • Creating visualizations for regional analysis")
else:
    print("⚠️ Extraction needs adjustment:")
    print("   • Try different page ranges")
    print("   • Test with different camelot parameters")
    print("   • Consider alternative extraction methods")
    print("   • Check if PDF is text-based or image-based")

DETAILED LATAM DATA ANALYSIS
📊 COMPARISON: RAW vs CLEANED DATA
----------------------------------------
RAW DATA (First 10 rows):
                                                   0                 1     2  \
0  Indicator:\tB\t-\tCentral\tBank\tremittance\ti...               NaN   NaN   
1                                   Sending\tcountry  Receive\tcountry  Date   
2                                        Afghanistan           Ecuador  2021   
3                                                NaN            Mexico  2022   
4                                                NaN            Panama  2022   
5                                            Albania           Ecuador  2021   
6                                                NaN            Mexico  2022   
7                                                NaN            Panama  2022   
8                                    American\tSamoa           Ecuador  2021   
9                                             Angola            Brazil