In [1]:
import fitz  # PyMuPDF
import pandas as pd
import re
import os

# Path to the PDF file
pdf_path = r"C:\Users\clint\Desktop\RER\data\Remittance_3\Remitscope_Africa_central_bank.pdf"

# Check if file exists
if os.path.exists(pdf_path):
    print(f"PDF found: {pdf_path}")
else:
    print(f"PDF not found at: {pdf_path}")
    print("Please check the file path.")

PDF found: C:\Users\clint\Desktop\RER\data\Remittance_3\Remitscope_Africa_central_bank.pdf


In [2]:
# First, let's read the first two pages to understand the PDF structure
import fitz  # PyMuPDF

# Open the PDF and read first two pages
doc = fitz.open(pdf_path)
print(f"Total pages in PDF: {len(doc)}")

# Read first two pages
for page_num in range(min(2, len(doc))):
    page = doc.load_page(page_num)
    text = page.get_text()
    print(f"\n{'='*50}")
    print(f"PAGE {page_num + 1}")
    print(f"{'='*50}")
    print(text[:2000])  # Print first 2000 characters
    if len(text) > 2000:
        print("... (truncated)")

doc.close()

Total pages in PDF: 59

PAGE 1
Sending country
Receive cou.. Date
Value
Unit
Source
Algeria
Senegal
2021
0.183414825
USD millions
Australia
Ethiopia
2020
13.59617511
USD millions
Kenya
2024
184,497.099695719
USD millions
Uganda
2022
22
USD millions
Austria
Kenya
2024
13,169.065145833
USD millions
Bahamas
Kenya
2024
1,453.632639905
USD millions
Bahrain
Kenya
2024
5,004.769090469
USD millions
Morocco
2020
30.3408085
USD millions
Belgium
Kenya
2024
22,844.654997935
USD millions
Morocco
2020
387.629403
USD millions
Senegal
2021
88.247787762
USD millions
Benin
Senegal
2021
25.846190336
USD millions
Brazil
Senegal
2021
29.312422216
USD millions
Burkina Faso
Senegal
2021
35.216150094
USD millions
Cameroon
Senegal
2021
43.126214245
USD millions
Canada
Ethiopia
2020
33.89159808
USD millions
Kenya
2024
130,812.76972775
USD millions
Morocco
2020
99.802037
USD millions
Senegal
2021
2.866648143
USD millions
Uganda
2022
82.1
USD millions
Central African Republic Senegal
2021
11.097456712
USD million

In [3]:
# Let's get a smaller sample to see the structure
doc = fitz.open(pdf_path)
page1 = doc.load_page(0)  # First page
text_sample = page1.get_text()

# Print first 1000 characters to see structure
print("First 1000 characters of page 1:")
print(text_sample[:1000])
print("\n" + "="*50)

# Look for table patterns by splitting lines
lines = text_sample.split('\n')
print(f"Total lines on page 1: {len(lines)}")
print("First 20 lines:")
for i, line in enumerate(lines[:20]):
    if line.strip():  # Only print non-empty lines
        print(f"{i+1:2d}: {line.strip()}")

doc.close()

First 1000 characters of page 1:
Sending country
Receive cou.. Date
Value
Unit
Source
Algeria
Senegal
2021
0.183414825
USD millions
Australia
Ethiopia
2020
13.59617511
USD millions
Kenya
2024
184,497.099695719
USD millions
Uganda
2022
22
USD millions
Austria
Kenya
2024
13,169.065145833
USD millions
Bahamas
Kenya
2024
1,453.632639905
USD millions
Bahrain
Kenya
2024
5,004.769090469
USD millions
Morocco
2020
30.3408085
USD millions
Belgium
Kenya
2024
22,844.654997935
USD millions
Morocco
2020
387.629403
USD millions
Senegal
2021
88.247787762
USD millions
Benin
Senegal
2021
25.846190336
USD millions
Brazil
Senegal
2021
29.312422216
USD millions
Burkina Faso
Senegal
2021
35.216150094
USD millions
Cameroon
Senegal
2021
43.126214245
USD millions
Canada
Ethiopia
2020
33.89159808
USD millions
Kenya
2024
130,812.76972775
USD millions
Morocco
2020
99.802037
USD millions
Senegal
2021
2.866648143
USD millions
Uganda
2022
82.1
USD millions
Central African Republic Senegal
2021
11.097456712
USD milli

In [4]:
import camelot
import pandas as pd
import re

# Try to extract tables using Camelot
try:
    # Extract tables from first two pages
    tables = camelot.read_pdf(pdf_path, pages='1-2', flavor='stream')
    print(f"Number of tables found: {len(tables)}")
    
    if len(tables) > 0:
        for i, table in enumerate(tables):
            print(f"\nTable {i+1}:")
            print(f"Shape: {table.df.shape}")
            print(table.df.head(10))
            print("="*50)
    else:
        print("No tables found with Camelot")
        
except Exception as e:
    print(f"Error with Camelot: {e}")
    print("Let's try manual parsing...")

Number of tables found: 2

Table 1:
Shape: (90, 6)
                                                   0                    1  \
0  Indicator:\tB\t-\tCentral\tBank\tremittance\ti...                        
1                                   Sending\tcountry  Receive\tcou.. Date   
2                                            Algeria              Senegal   
3                                          Australia             Ethiopia   
4                                                                   Kenya   
5                                                                  Uganda   
6                                            Austria                Kenya   
7                                            Bahamas                Kenya   
8                                            Bahrain                Kenya   
9                                                                 Morocco   

      2                  3              4                             5  
0                          

In [5]:
# Manual parsing approach since the data is in vertical format
def parse_remittance_data(pdf_path, max_pages=2):
    doc = fitz.open(pdf_path)
    all_data = []
    
    for page_num in range(min(max_pages, len(doc))):
        page = doc.load_page(page_num)
        text = page.get_text()
        
        # Split into lines and clean
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        
        # Find the pattern: sending country, receiving country, date, value, unit, source
        i = 0
        while i < len(lines) - 5:  # Need at least 6 elements for a complete record
            
            # Skip headers
            if lines[i] in ['Sending country', 'Receive cou.. Date', 'Value', 'Unit', 'Source']:
                i += 1
                continue
                
            # Try to identify a complete record
            sending_country = lines[i]
            
            # Skip if this looks like a continuation or invalid
            if not sending_country or sending_country in ['USD millions', 'BCEAO', 'Central Bank of Kenya', 'National Bank of Ethiopia', 'Bank of Uganda', 'Bank Al-Maghrib']:
                i += 1
                continue
            
            # Look for receiving country in next few lines
            receiving_country = None
            date = None
            value = None
            unit = None
            source = None
            
            j = i + 1
            while j < len(lines) and j < i + 10:  # Look within next 10 lines
                line = lines[j]
                
                # Check if it's a year (date)
                if re.match(r'^\d{4}$', line):
                    date = int(line)
                    receiving_country = lines[j-1] if j > i else None
                    
                    # Next should be value
                    if j + 1 < len(lines):
                        value_line = lines[j + 1]
                        # Clean value - remove commas and convert to float
                        value_clean = re.sub(r'[,\s]', '', value_line)
                        try:
                            value = float(value_clean)
                        except:
                            value = value_line
                    
                    # Next should be unit
                    if j + 2 < len(lines):
                        unit = lines[j + 2]
                    
                    # Try to find source in next few lines
                    for k in range(j + 3, min(j + 8, len(lines))):
                        if lines[k] and lines[k] not in ['USD millions'] and not re.match(r'^\d', lines[k]):
                            source = lines[k]
                            break
                    
                    break
                j += 1
            
            # If we found a complete record, add it
            if all([sending_country, receiving_country, date, value, unit]):
                all_data.append({
                    'Sending country': sending_country,
                    'Receive country': receiving_country,
                    'Date': date,
                    'Value': value,
                    'Unit': unit,
                    'Source': source or ''
                })
                
                # Skip ahead to avoid duplicates
                i = j + 5
            else:
                i += 1
    
    doc.close()
    return all_data

# Parse the data
print("Parsing remittance data...")
data = parse_remittance_data(pdf_path, max_pages=2)
print(f"Found {len(data)} records")

# Convert to DataFrame
df = pd.DataFrame(data)
print("\nFirst 15 records:")
print(df.head(15).to_string(index=False))

# Save to CSV
output_file = r"C:\Users\clint\Desktop\RER\data\Remittance_3\extracted_remittance_data.csv"
df.to_csv(output_file, index=False)
print(f"\nData saved to: {output_file}")

Parsing remittance data...
Found 65 records

First 15 records:
Sending country                  Receive country  Date         Value         Unit           Source
        Algeria                          Senegal  2021      0.183415 USD millions        Australia
           2020                            Kenya  2024 184497.099696 USD millions           Uganda
             22                            Kenya  2024  13169.065146 USD millions          Bahamas
           2024                            Kenya  2024   5004.769090 USD millions          Morocco
     30.3408085                            Kenya  2024  22844.654998 USD millions          Morocco
     387.629403                          Senegal  2021     88.247788 USD millions            Benin
           2021                          Senegal  2021     29.312422 USD millions     Burkina Faso
           2021                          Senegal  2021     43.126214 USD millions           Canada
           2020                            Ken

In [6]:
# Improved parsing approach
def parse_remittance_data_v2(pdf_path, max_pages=2):
    doc = fitz.open(pdf_path)
    all_data = []
    
    # Known source patterns to help identify complete records
    source_patterns = [
        'BCEAO', 'Central Bank of Kenya', 'National Bank of Ethiopia', 
        'Bank of Uganda', 'Bank Al-Maghrib', 'Central Bank of Nigeria',
        'Bank of Ghana', 'Reserve Bank of Malawi', 'Bank of Tanzania'
    ]
    
    for page_num in range(min(max_pages, len(doc))):
        page = doc.load_page(page_num)
        text = page.get_text()
        
        # Split into lines and clean
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        
        # Remove header lines
        filtered_lines = []
        for line in lines:
            if line not in ['Sending country', 'Receive cou.. Date', 'Value', 'Unit', 'Source']:
                filtered_lines.append(line)
        
        # Process in groups of 5-6 lines (typical record structure)
        i = 0
        while i < len(filtered_lines) - 4:
            
            # Look for a source pattern to anchor our parsing
            source_found = False
            source_idx = -1
            
            # Check next few lines for a source
            for j in range(i, min(i + 8, len(filtered_lines))):
                for source in source_patterns:
                    if source in filtered_lines[j]:
                        source_found = True
                        source_idx = j
                        break
                if source_found:
                    break
            
            if source_found and source_idx >= i + 4:  # Need at least 5 elements before source
                # Work backwards from source to identify the record
                source = filtered_lines[source_idx]
                unit = filtered_lines[source_idx - 1] if source_idx > 0 else ''
                
                # Find value (should be a number)
                value = None
                value_idx = -1
                for k in range(source_idx - 2, max(i - 1, source_idx - 5), -1):
                    try:
                        # Clean and convert value
                        value_clean = re.sub(r'[,\s]', '', filtered_lines[k])
                        value = float(value_clean)
                        value_idx = k
                        break
                    except:
                        continue
                
                if value is not None and value_idx >= i + 2:
                    # Date should be before value
                    date = None
                    date_idx = -1
                    for k in range(value_idx - 1, max(i - 1, value_idx - 3), -1):
                        if re.match(r'^\d{4}$', filtered_lines[k]):
                            date = int(filtered_lines[k])
                            date_idx = k
                            break
                    
                    if date and date_idx >= i + 1:
                        # Receiving country should be before date
                        receiving_country = filtered_lines[date_idx - 1] if date_idx > i else ''
                        
                        # Sending country should be before receiving country
                        sending_country = filtered_lines[date_idx - 2] if date_idx > i + 1 else ''
                        
                        # Validate that we have all required fields
                        if all([sending_country, receiving_country, date, value, unit]):
                            all_data.append({
                                'Sending country': sending_country,
                                'Receive country': receiving_country,
                                'Date': date,
                                'Value': value,
                                'Unit': unit,
                                'Source': source
                            })
                            
                            # Move index past this record
                            i = source_idx + 1
                            continue
            
            i += 1
    
    doc.close()
    return all_data

# Parse with improved algorithm
print("Parsing with improved algorithm...")
data_v2 = parse_remittance_data_v2(pdf_path, max_pages=2)
print(f"Found {len(data_v2)} records")

if len(data_v2) > 0:
    df_v2 = pd.DataFrame(data_v2)
    print("\nFirst 15 records:")
    print(df_v2.head(15).to_string(index=False))
    
    # Save improved results
    output_file_v2 = r"C:\Users\clint\Desktop\RER\data\Remittance_3\extracted_remittance_data_v2.csv"
    df_v2.to_csv(output_file_v2, index=False)
    print(f"\nImproved data saved to: {output_file_v2}")
else:
    print("No records found with improved algorithm")

Parsing with improved algorithm...
Found 2 records

First 15 records:
Sending country Receive country  Date      Value         Unit                    Source
        Somalia          Uganda  2022   30.00000 USD millions                     BCEAO
         Zambia           Kenya  2024 8091.51077 USD millions National Bank of Ethiopia

Improved data saved to: C:\Users\clint\Desktop\RER\data\Remittance_3\extracted_remittance_data_v2.csv


In [7]:
# Let's analyze the structure more carefully
doc = fitz.open(pdf_path)
page = doc.load_page(0)
text = page.get_text()
lines = [line.strip() for line in text.split('\n') if line.strip()]

# Print first 50 lines to understand the pattern
print("First 50 lines to understand pattern:")
for i, line in enumerate(lines[:50]):
    print(f"{i+1:2d}: {line}")

doc.close()

First 50 lines to understand pattern:
 1: Sending country
 2: Receive cou.. Date
 3: Value
 4: Unit
 5: Source
 6: Algeria
 7: Senegal
 8: 2021
 9: 0.183414825
10: USD millions
11: Australia
12: Ethiopia
13: 2020
14: 13.59617511
15: USD millions
16: Kenya
17: 2024
18: 184,497.099695719
19: USD millions
20: Uganda
21: 2022
22: 22
23: USD millions
24: Austria
25: Kenya
26: 2024
27: 13,169.065145833
28: USD millions
29: Bahamas
30: Kenya
31: 2024
32: 1,453.632639905
33: USD millions
34: Bahrain
35: Kenya
36: 2024
37: 5,004.769090469
38: USD millions
39: Morocco
40: 2020
41: 30.3408085
42: USD millions
43: Belgium
44: Kenya
45: 2024
46: 22,844.654997935
47: USD millions
48: Morocco
49: 2020
50: 387.629403


In [8]:
# Final parsing approach based on the clear pattern
def parse_remittance_data_final(pdf_path, max_pages=2):
    doc = fitz.open(pdf_path)
    all_data = []
    
    for page_num in range(min(max_pages, len(doc))):
        page = doc.load_page(page_num)
        text = page.get_text()
        
        # Split into lines and clean
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        
        # Skip header lines
        start_idx = 0
        for i, line in enumerate(lines):
            if line == 'Source':
                start_idx = i + 1
                break
        
        # Process lines starting after headers
        i = start_idx
        while i < len(lines) - 4:  # Need at least 5 lines for a complete record
            
            # Check if we have a valid record structure
            sending_country = lines[i]
            receiving_country = lines[i + 1]
            
            # Check if next line is a year
            if i + 2 < len(lines) and re.match(r'^\d{4}$', lines[i + 2]):
                date = int(lines[i + 2])
                
                # Check if next line is a number (value)
                if i + 3 < len(lines):
                    try:
                        value_str = lines[i + 3]
                        # Clean value - remove commas
                        value_clean = value_str.replace(',', '')
                        value = float(value_clean)
                        
                        # Check if next line is unit
                        if i + 4 < len(lines) and 'USD millions' in lines[i + 4]:
                            unit = lines[i + 4]
                            
                            # Try to find source in the next few lines or use default
                            source = "Not specified"
                            if i + 5 < len(lines):
                                next_line = lines[i + 5]
                                # If next line doesn't look like a country name, it might be source
                                if not re.match(r'^[A-Z][a-zA-Z\s]+$', next_line) or any(bank in next_line for bank in ['Bank', 'BCEAO']):
                                    source = next_line
                                    i += 6  # Skip past this record including source
                                else:
                                    i += 5  # Skip past this record without source
                            else:
                                i += 5
                            
                            # Add the record
                            all_data.append({
                                'Sending country': sending_country,
                                'Receive country': receiving_country,
                                'Date': date,
                                'Value': value,
                                'Unit': unit,
                                'Source': source
                            })
                            
                            continue
                    except ValueError:
                        pass
            
            # If we couldn't parse a complete record, move to next line
            i += 1
    
    doc.close()
    return all_data

# Parse with final algorithm
print("Parsing with final algorithm...")
data_final = parse_remittance_data_final(pdf_path, max_pages=2)
print(f"Found {len(data_final)} records")

if len(data_final) > 0:
    df_final = pd.DataFrame(data_final)
    
    # Display first 15 records
    print("\nFirst 15 records:")
    print(df_final.head(15).to_string(index=False))
    
    # Display some statistics
    print(f"\nData Summary:")
    print(f"Total records: {len(df_final)}")
    print(f"Date range: {df_final['Date'].min()} - {df_final['Date'].max()}")
    print(f"Unique sending countries: {df_final['Sending country'].nunique()}")
    print(f"Unique receiving countries: {df_final['Receive country'].nunique()}")
    
    # Save final results
    output_file_final = r"C:\Users\clint\Desktop\RER\data\Remittance_3\extracted_remittance_data_final.csv"
    df_final.to_csv(output_file_final, index=False)
    print(f"\nFinal data saved to: {output_file_final}")
    
    # Show the format requested by user
    print(f"\nData in requested format (first 10 rows):")
    print(df_final.head(10).to_string(index=False, float_format='%.20f'))
    
else:
    print("No records found with final algorithm")

Parsing with final algorithm...
Found 87 records

First 15 records:
Sending country Receive country  Date        Value         Unit        Source
        Algeria         Senegal  2021     0.183415 USD millions Not specified
      Australia        Ethiopia  2020    13.596175 USD millions Not specified
   USD millions          Uganda  2022    22.000000 USD millions Not specified
        Austria           Kenya  2024 13169.065146 USD millions Not specified
        Bahamas           Kenya  2024  1453.632640 USD millions Not specified
        Bahrain           Kenya  2024  5004.769090 USD millions Not specified
        Belgium           Kenya  2024 22844.654998 USD millions Not specified
   USD millions         Senegal  2021    88.247788 USD millions Not specified
          Benin         Senegal  2021    25.846190 USD millions Not specified
         Brazil         Senegal  2021    29.312422 USD millions Not specified
   Burkina Faso         Senegal  2021    35.216150 USD millions Not specif

In [None]:
# Add proper source mapping based on receiving countries
def add_proper_sources(df):
    """Add proper source information based on receiving countries"""
    
    source_mapping = {
        'Kenya': 'Central Bank of Kenya',
        'Ethiopia': 'National Bank of Ethiopia', 
        'Uganda': 'Bank of Uganda',
        'Morocco': 'Bank Al-Maghrib',
        'Senegal': 'BCEAO',
        'Mali': 'BCEAO',
        'Burkina Faso': 'BCEAO',
        'Benin': 'BCEAO',
        'Niger': 'BCEAO',
        'Côte d\'Ivoire': 'BCEAO',
        'Guinea-Bissau': 'BCEAO',
        'Togo': 'BCEAO',
        'Nigeria': 'Central Bank of Nigeria',
        'Ghana': 'Bank of Ghana',
        'Tanzania': 'Bank of Tanzania',
        'Malawi': 'Reserve Bank of Malawi'
    }
    
    # Apply source mapping
    df['Source'] = df['Receive country'].map(source_mapping).fillna('Not specified')
    return df

# Apply source mapping to our final data
if 'df_final' in locals() and len(df_final) > 0:
    df_final_with_sources = add_proper_sources(df_final.copy())
    
    print("Data with proper sources (first 15 rows):")
    print(df_final_with_sources.head(15).to_string(index=False))
    
    # Save with proper sources
    output_file_sources = r"C:\Users\clint\Desktop\RER\data\Remittance_3\extracted_remittance_data_with_sources.csv"
    df_final_with_sources.to_csv(output_file_sources, index=False)
    print(f"\nData with proper sources saved to: {output_file_sources}")
    
    # Show exact format as requested
    print("\nExact format as requested:")
    for idx, row in df_final_with_sources.head(10).iterrows():
        print(f"{row['Sending country']}\t{row['Receive country']}\t{row['Date']}\t{row['Value']:.20f}\t{row['Unit']}\t{row['Source']}")
else:
    print("No data to process")

In [9]:
# Let's read more pages to find where the sources actually appear
def explore_pdf_structure(pdf_path, max_pages=10):
    doc = fitz.open(pdf_path)
    print(f"Total pages in PDF: {len(doc)}")
    
    for page_num in range(min(max_pages, len(doc))):
        page = doc.load_page(page_num)
        text = page.get_text()
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        
        print(f"\n{'='*50}")
        print(f"PAGE {page_num + 1} - Total lines: {len(lines)}")
        print(f"{'='*50}")
        
        # Look for source patterns
        source_indicators = ['BCEAO', 'Central Bank', 'Bank of', 'Reserve Bank', 'National Bank']
        sources_found = []
        
        for i, line in enumerate(lines):
            for indicator in source_indicators:
                if indicator in line:
                    sources_found.append(f"Line {i+1}: {line}")
        
        if sources_found:
            print("SOURCES FOUND:")
            for source in sources_found[:10]:  # Show first 10 sources
                print(source)
        else:
            print("No sources found on this page")
        
        # Show a sample of the content
        print(f"\nFirst 30 lines of page {page_num + 1}:")
        for i, line in enumerate(lines[:30]):
            print(f"{i+1:2d}: {line}")
        
        if len(lines) > 30:
            print("... (truncated)")
    
    doc.close()

# Explore the PDF structure
explore_pdf_structure(pdf_path, max_pages=5)

Total pages in PDF: 59

PAGE 1 - Total lines: 490
SOURCES FOUND:
Line 403: BCEAO
Line 404: National Bank of Ethiopia
Line 404: National Bank of Ethiopia
Line 405: Central Bank of Kenya
Line 405: Central Bank of Kenya
Line 406: Bank of Uganda
Line 407: Central Bank of Kenya
Line 407: Central Bank of Kenya
Line 408: Central Bank of Kenya
Line 408: Central Bank of Kenya

First 30 lines of page 1:
 1: Sending country
 2: Receive cou.. Date
 3: Value
 4: Unit
 5: Source
 6: Algeria
 7: Senegal
 8: 2021
 9: 0.183414825
10: USD millions
11: Australia
12: Ethiopia
13: 2020
14: 13.59617511
15: USD millions
16: Kenya
17: 2024
18: 184,497.099695719
19: USD millions
20: Uganda
21: 2022
22: 22
23: USD millions
24: Austria
25: Kenya
26: 2024
27: 13,169.065145833
28: USD millions
29: Bahamas
30: Kenya
... (truncated)

PAGE 2 - Total lines: 228
SOURCES FOUND:
Line 187: National Bank of Ethiopia
Line 187: National Bank of Ethiopia
Line 188: Central Bank of Kenya
Line 188: Central Bank of Kenya
Line 189

In [10]:
# Comprehensive parser that finds actual sources throughout the PDF
def parse_complete_remittance_data(pdf_path):
    doc = fitz.open(pdf_path)
    all_text_lines = []
    
    # First, collect all text from all pages
    print("Reading all pages...")
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text = page.get_text()
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        all_text_lines.extend(lines)
        print(f"Page {page_num + 1}: {len(lines)} lines")
    
    print(f"Total lines collected: {len(all_text_lines)}")
    doc.close()
    
    # Now parse the data systematically
    all_data = []
    
    # Find where the data starts (after headers)
    start_idx = 0
    for i, line in enumerate(all_text_lines):
        if line == 'Source':
            start_idx = i + 1
            break
    
    print(f"Data starts at line {start_idx}")
    
    # Parse records
    i = start_idx
    while i < len(all_text_lines) - 2:
        
        # Look for the pattern: Country1, Country2, Year, Value, Unit
        sending_country = all_text_lines[i]
        
        # Skip obvious non-country lines
        if (sending_country in ['USD millions', 'BCEAO', 'Central Bank of Kenya', 'National Bank of Ethiopia'] or
            re.match(r'^\d+(\.\d+)?$', sending_country) or 
            re.match(r'^\d{4}$', sending_country)):
            i += 1
            continue
        
        if i + 4 < len(all_text_lines):
            receiving_country = all_text_lines[i + 1]
            
            # Check if the third item is a year
            if re.match(r'^\d{4}$', all_text_lines[i + 2]):
                date = int(all_text_lines[i + 2])
                
                # Check if fourth item is a numeric value
                try:
                    value_str = all_text_lines[i + 3]
                    value_clean = value_str.replace(',', '')
                    value = float(value_clean)
                    
                    # Check if fifth item is unit
                    if i + 4 < len(all_text_lines) and 'USD millions' in all_text_lines[i + 4]:
                        unit = all_text_lines[i + 4]
                        
                        # Now look for source in the following lines
                        source = "Not found"
                        for j in range(i + 5, min(i + 15, len(all_text_lines))):
                            line = all_text_lines[j]
                            # Check if this looks like a source (contains "Bank", "BCEAO", etc.)
                            if (any(indicator in line for indicator in ['Bank', 'BCEAO']) and 
                                not re.match(r'^\d', line) and 
                                line not in ['USD millions']):
                                source = line
                                break
                            # Stop if we hit another country or data pattern
                            elif (re.match(r'^[A-Z][a-z]', line) and 
                                  line not in ['USD millions'] and
                                  not any(indicator in line for indicator in ['Bank', 'BCEAO'])):
                                break
                        
                        # Add the record
                        all_data.append({
                            'Sending country': sending_country,
                            'Receive country': receiving_country,
                            'Date': date,
                            'Value': value,
                            'Unit': unit,
                            'Source': source
                        })
                        
                        i += 5  # Move past this record
                        continue
                        
                except ValueError:
                    pass
        
        i += 1
    
    return all_data

# Parse all data with sources
print("Parsing complete remittance data with actual sources...")
complete_data = parse_complete_remittance_data(pdf_path)
print(f"Found {len(complete_data)} complete records")

if len(complete_data) > 0:
    df_complete = pd.DataFrame(complete_data)
    
    print("\nFirst 20 records with actual sources:")
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    print(df_complete.head(20).to_string(index=False))
    
    # Show statistics
    print(f"\nData Summary:")
    print(f"Total records: {len(df_complete)}")
    print(f"Records with sources found: {len(df_complete[df_complete['Source'] != 'Not found'])}")
    print(f"Unique sources: {df_complete['Source'].nunique()}")
    print(f"Sources found: {df_complete['Source'].unique()}")
    
    # Save complete data
    output_file_complete = r"C:\Users\clint\Desktop\RER\data\Remittance_3\complete_remittance_data.csv"
    df_complete.to_csv(output_file_complete, index=False)
    print(f"\nComplete data saved to: {output_file_complete}")

else:
    print("No complete records found")

Parsing complete remittance data with actual sources...
Reading all pages...
Page 1: 490 lines
Page 2: 228 lines
Page 3: 300 lines
Page 4: 251 lines
Page 5: 105 lines
Page 6: 1 lines
Page 7: 2 lines
Page 8: 1 lines
Page 9: 1 lines
Page 10: 2 lines
Page 11: 1 lines
Page 12: 5 lines
Page 13: 3 lines
Page 14: 1 lines
Page 15: 6 lines
Page 16: 1 lines
Page 17: 5 lines
Page 18: 3 lines
Page 19: 5 lines
Page 20: 3 lines
Page 21: 6 lines
Page 22: 10 lines
Page 23: 6 lines
Page 24: 51 lines
Page 25: 1 lines
Page 26: 2 lines
Page 27: 1 lines
Page 28: 67 lines
Page 29: 5 lines
Page 30: 123 lines
Page 31: 124 lines
Page 32: 11 lines
Page 33: 1 lines
Page 34: 1 lines
Page 35: 1 lines
Page 36: 2 lines
Page 37: 1 lines
Page 38: 1 lines
Page 39: 1 lines
Page 40: 1 lines
Page 41: 2 lines
Page 42: 1 lines
Page 43: 16 lines
Page 44: 1 lines
Page 45: 1 lines
Page 46: 1 lines
Page 47: 6 lines
Page 48: 1 lines
Page 49: 1 lines
Page 50: 6 lines
Page 51: 1 lines
Page 52: 1 lines
Page 53: 6 lines
Page 54: 1 l