In [36]:
import camelot
import pandas as pd
import os

# Define the PDF file paths
africa_pdf = r"C:\Users\clint\Desktop\RER\data\Remittance_4\Africa.pdf"
latam_pdf = r"C:\Users\clint\Desktop\RER\data\Remittance_4\LatAm.pdf"

# Check if files exist
print(f"Africa PDF exists: {os.path.exists(africa_pdf)}")
print(f"LatAm PDF exists: {os.path.exists(latam_pdf)}")

# Extract tables from Africa.pdf (first page only)
print("\n=== Extracting tables from Africa.pdf (page 1) ===")
try:
    africa_tables = camelot.read_pdf(africa_pdf, pages='1', flavor='stream')
    print(f"Number of tables found in Africa.pdf: {len(africa_tables)}")
    
    # Convert tables to DataFrames
    africa_dfs = []
    for i, table in enumerate(africa_tables):
        df = table.df
        africa_dfs.append(df)
        print(f"\nAfrica Table {i+1} shape: {df.shape}")
        print(f"Africa Table {i+1} preview:")
        print(df.head())
        
except Exception as e:
    print(f"Error reading Africa.pdf: {e}")
    africa_dfs = []

# Extract tables from LatAm.pdf (first page only)
print("\n=== Extracting tables from LatAm.pdf (page 1) ===")
try:
    latam_tables = camelot.read_pdf(latam_pdf, pages='1', flavor='stream')
    print(f"Number of tables found in LatAm.pdf: {len(latam_tables)}")
    
    # Convert tables to DataFrames
    latam_dfs = []
    for i, table in enumerate(latam_tables):
        df = table.df
        latam_dfs.append(df)
        print(f"\nLatAm Table {i+1} shape: {df.shape}")
        print(f"LatAm Table {i+1} preview:")
        print(df.head())
        
except Exception as e:
    print(f"Error reading LatAm.pdf: {e}")
    latam_dfs = []

print(f"\n=== Summary ===")
print(f"Africa DataFrames created: {len(africa_dfs)}")
print(f"LatAm DataFrames created: {len(latam_dfs)}")

Africa PDF exists: True
LatAm PDF exists: True

=== Extracting tables from Africa.pdf (page 1) ===
Number of tables found in Africa.pdf: 1

Africa Table 1 shape: (130, 6)
Africa Table 1 preview:
                                                   0                    1  \
0  Indicator:\tB\t-\tCentral\tBank\tremittance\ti...                        
1                                   Sending\tcountry  Receive\tcou.. Date   
2                                            Algeria              Senegal   
3                                          Australia             Ethiopia   
4                                                                   Kenya   

      2                  3              4                             5  
0                                                                        
1                    Value           Unit                        Source  
2  2021        0.183414825  USD\tmillions                         BCEAO  
3  2020        13.59617511  USD\tmillions  Nat

In [37]:
africa_dfs

[                                                     0                    1  \
 0    Indicator:\tB\t-\tCentral\tBank\tremittance\ti...                        
 1                                     Sending\tcountry  Receive\tcou.. Date   
 2                                              Algeria              Senegal   
 3                                            Australia             Ethiopia   
 4                                                                     Kenya   
 ..                                                 ...                  ...   
 125                                                                   Kenya   
 126                                                                 Morocco   
 127                                                                 Senegal   
 128                                                                  Uganda   
 129                                             Zambia                Kenya   
 
         2                   3        

In [38]:
# Clean and structure the DataFrames
def clean_remittance_df(df, region_name):
    """Clean and structure the remittance DataFrame"""
    # Make a copy to avoid modifying the original
    cleaned_df = df.copy()
    
    # Set proper column names from row 1 (which contains the headers)
    if len(cleaned_df) > 1:
        # Extract headers from row 1
        headers = cleaned_df.iloc[1].tolist()
        # Clean up the headers
        headers = [str(h).replace('\t', ' ').strip() if pd.notna(h) and str(h).strip() else f'Column_{i}' 
                  for i, h in enumerate(headers)]
        cleaned_df.columns = headers
        
        # Remove the first two rows (metadata and headers)
        cleaned_df = cleaned_df.iloc[2:].reset_index(drop=True)
        
        # Clean up the data - remove tab characters and extra spaces
        for col in cleaned_df.columns:
            if cleaned_df[col].dtype == 'object':
                cleaned_df[col] = cleaned_df[col].astype(str).str.replace('\t', ' ', regex=False)
                cleaned_df[col] = cleaned_df[col].str.strip()
                # Replace 'nan' strings with actual NaN
                cleaned_df[col] = cleaned_df[col].replace('nan', pd.NA)
        
        # Add region identifier
        cleaned_df['Region'] = region_name
        
    return cleaned_df

# Clean the Africa DataFrame
if africa_dfs:
    africa_df_clean = clean_remittance_df(africa_dfs[0], 'Africa')
    print("=== Cleaned Africa DataFrame ===")
    print(f"Shape: {africa_df_clean.shape}")
    print(f"Columns: {list(africa_df_clean.columns)}")
    print("\nFirst 5 rows:")
    print(africa_df_clean.head())
    print("\nData types:")
    print(africa_df_clean.dtypes)

# Clean the LatAm DataFrame  
if latam_dfs:
    latam_df_clean = clean_remittance_df(latam_dfs[0], 'Latin America')
    print("\n=== Cleaned LatAm DataFrame ===")
    print(f"Shape: {latam_df_clean.shape}")
    print(f"Columns: {list(latam_df_clean.columns)}")
    print("\nFirst 5 rows:")
    print(latam_df_clean.head())
    print("\nData types:")
    print(latam_df_clean.dtypes)

=== Cleaned Africa DataFrame ===
Shape: (128, 7)
Columns: ['Sending country', 'Receive cou.. Date', 'Column_2', 'Value', 'Unit', 'Source', 'Region']

First 5 rows:
  Sending country Receive cou.. Date Column_2              Value  \
0         Algeria            Senegal     2021        0.183414825   
1       Australia           Ethiopia     2020        13.59617511   
2                              Kenya     2024  184,497.099695719   
3                             Uganda     2022                 22   
4         Austria              Kenya     2024   13,169.065145833   

           Unit                     Source  Region  
0  USD millions                      BCEAO  Africa  
1  USD millions  National Bank of Ethiopia  Africa  
2  USD millions      Central Bank of Kenya  Africa  
3  USD millions             Bank of Uganda  Africa  
4  USD millions      Central Bank of Kenya  Africa  

Data types:
Sending country       object
Receive cou.. Date    object
Column_2              object
Value    

In [39]:
# Combine both DataFrames if they exist
combined_df = None
if africa_dfs and latam_dfs:
    combined_df = pd.concat([africa_df_clean, latam_df_clean], ignore_index=True)
    print("=== Combined DataFrame ===")
    print(f"Total shape: {combined_df.shape}")
    print(f"Africa rows: {len(africa_df_clean)}")
    print(f"LatAm rows: {len(latam_df_clean)}")
    
    # Show some summary statistics
    print(f"\nUnique sending countries: {combined_df['Sending country'].nunique()}")
    print(f"Unique receiving countries: {combined_df['Receive country'].nunique()}")
    
    # Handle Date column more carefully
    try:
        # Filter out non-numeric dates and convert to numeric
        date_series = pd.to_numeric(combined_df['Date'], errors='coerce')
        date_series = date_series.dropna()
        if len(date_series) > 0:
            print(f"Date range: {date_series.min()} to {date_series.max()}")
        else:
            print("Date range: No valid numeric dates found")
    except Exception as e:
        print(f"Could not determine date range: {e}")
    
    # Display sample data
    print("\nSample of combined data:")
    print(combined_df.sample(min(10, len(combined_df))))

# Summary of available DataFrames
print("\n" + "="*50)
print("AVAILABLE DATAFRAMES:")
print("="*50)
if africa_dfs:
    print(f"• africa_df_clean: {africa_df_clean.shape[0]} rows × {africa_df_clean.shape[1]} columns")
if latam_dfs:
    print(f"• latam_df_clean: {latam_df_clean.shape[0]} rows × {latam_df_clean.shape[1]} columns")
if combined_df is not None:
    print(f"• combined_df: {combined_df.shape[0]} rows × {combined_df.shape[1]} columns")
print("\nYou can now use these DataFrames for further analysis!")

=== Combined DataFrame ===
Total shape: (728, 9)
Africa rows: 128
LatAm rows: 600

Unique sending countries: 207
Unique receiving countries: 16
Date range: 2019.0 to 2022.0

Sample of combined data:
    Sending country Receive cou.. Date Column_2               Value  \
28                             Senegal     2021        89.989410749   
541                                NaN      NaN            0.276779   
378                                NaN      NaN           3.7173961   
123                              Kenya     2024  2,637,772.27432718   
559                                NaN      NaN        78.256812047   
364                                NaN      NaN           29.728509   
3                               Uganda     2022                  22   
233                                NaN      NaN        14.375961662   
218        Bulgaria                NaN      NaN          0.09197525   
481      Martinique                NaN      NaN          0.00524036   

             Unit  

In [40]:
africa_dfs

[                                                     0                    1  \
 0    Indicator:\tB\t-\tCentral\tBank\tremittance\ti...                        
 1                                     Sending\tcountry  Receive\tcou.. Date   
 2                                              Algeria              Senegal   
 3                                            Australia             Ethiopia   
 4                                                                     Kenya   
 ..                                                 ...                  ...   
 125                                                                   Kenya   
 126                                                                 Morocco   
 127                                                                 Senegal   
 128                                                                  Uganda   
 129                                             Zambia                Kenya   
 
         2                   3        

In [41]:
# Create a clean DataFrame for Data Wrangler compatibility
# This will have proper column names and clean data structure

# Clean the raw Africa DataFrame
if africa_dfs:
    raw_df = africa_dfs[0]
    
    # Extract the header row (row 1) and clean it
    headers = raw_df.iloc[1].tolist()
    clean_headers = []
    for i, header in enumerate(headers):
        if pd.notna(header) and str(header).strip():
            clean_name = str(header).replace('\t', '_').replace(' ', '_').strip()
            clean_headers.append(clean_name)
        else:
            clean_headers.append(f'Column_{i}')
    
    # Create clean DataFrame starting from row 2 (skip metadata and headers)
    africa_clean = raw_df.iloc[2:].copy()
    africa_clean.columns = clean_headers
    
    # Clean the data values - remove tabs and extra spaces
    for col in africa_clean.columns:
        if africa_clean[col].dtype == 'object':
            africa_clean[col] = africa_clean[col].astype(str).str.replace('\t', ' ', regex=False)
            africa_clean[col] = africa_clean[col].str.strip()
            # Replace 'nan' with actual NaN
            africa_clean[col] = africa_clean[col].replace('nan', None)
    
    # Reset index
    africa_clean = africa_clean.reset_index(drop=True)
    
    print("Clean Africa DataFrame for Data Wrangler:")
    print(f"Shape: {africa_clean.shape}")
    print(f"Columns: {list(africa_clean.columns)}")
    print("\nThis DataFrame should work properly with Data Wrangler!")
    
    # Display the cleaned DataFrame
    africa_clean

Clean Africa DataFrame for Data Wrangler:
Shape: (128, 6)
Columns: ['Sending_country', 'Receive_cou.._Date', 'Column_2', 'Value', 'Unit', 'Source']

This DataFrame should work properly with Data Wrangler!


In [42]:
africa_clean

Unnamed: 0,Sending_country,Receive_cou.._Date,Column_2,Value,Unit,Source
0,Algeria,Senegal,2021,0.183414825,USD millions,BCEAO
1,Australia,Ethiopia,2020,13.59617511,USD millions,National Bank of Ethiopia
2,,Kenya,2024,184497.099695719,USD millions,Central Bank of Kenya
3,,Uganda,2022,22,USD millions,Bank of Uganda
4,Austria,Kenya,2024,13169.065145833,USD millions,Central Bank of Kenya
...,...,...,...,...,...,...
123,,Kenya,2024,2637772.27432718,USD millions,Central Bank of Kenya
124,,Morocco,2020,422.388121,USD millions,Bank Al-Maghrib
125,,Senegal,2021,479.695927280,USD millions,BCEAO
126,,Uganda,2022,184.5,USD millions,Bank of Uganda


In [43]:
# Create clean DataFrames with proper column names
desired_columns = ['Sending Country', 'Receiving Country', 'Year', 'Value', 'Unit', 'Source']

# Clean Africa DataFrame
if africa_dfs:
    raw_df = africa_dfs[0]
    
    # Create clean DataFrame starting from row 2 (skip metadata and headers)
    africa_final = raw_df.iloc[2:].copy()
    
    # Set the desired column names
    africa_final.columns = desired_columns
    
    # Clean the data values - remove tabs and extra spaces
    for col in africa_final.columns:
        if africa_final[col].dtype == 'object':
            africa_final[col] = africa_final[col].astype(str).str.replace('\t', ' ', regex=False)
            africa_final[col] = africa_final[col].str.strip()
            # Replace 'nan' with actual NaN
            africa_final[col] = africa_final[col].replace('nan', None)
    
    # Reset index
    africa_final = africa_final.reset_index(drop=True)
    
    print("Africa DataFrame with proper column names:")
    print(f"Shape: {africa_final.shape}")
    print(f"Columns: {list(africa_final.columns)}")
    print("\nFirst 5 rows:")
    print(africa_final.head())

# Clean LatAm DataFrame
if latam_dfs:
    raw_df = latam_dfs[0]
    
    # Create clean DataFrame starting from row 2 (skip metadata and headers)
    latam_final = raw_df.iloc[2:].copy()
    
    # Set the desired column names
    latam_final.columns = desired_columns
    
    # Clean the data values - remove tabs and extra spaces
    for col in latam_final.columns:
        if latam_final[col].dtype == 'object':
            latam_final[col] = latam_final[col].astype(str).str.replace('\t', ' ', regex=False)
            latam_final[col] = latam_final[col].str.strip()
            # Replace 'nan' with actual NaN
            latam_final[col] = latam_final[col].replace('nan', None)
    
    # Reset index
    latam_final = latam_final.reset_index(drop=True)
    
    print("\nLatAm DataFrame with proper column names:")
    print(f"Shape: {latam_final.shape}")
    print(f"Columns: {list(latam_final.columns)}")
    print("\nFirst 5 rows:")
    print(latam_final.head())

print("\nThese DataFrames should work perfectly with Data Wrangler!")

Africa DataFrame with proper column names:
Shape: (128, 6)
Columns: ['Sending Country', 'Receiving Country', 'Year', 'Value', 'Unit', 'Source']

First 5 rows:
  Sending Country Receiving Country  Year              Value          Unit  \
0         Algeria           Senegal  2021        0.183414825  USD millions   
1       Australia          Ethiopia  2020        13.59617511  USD millions   
2                             Kenya  2024  184,497.099695719  USD millions   
3                            Uganda  2022                 22  USD millions   
4         Austria             Kenya  2024   13,169.065145833  USD millions   

                      Source  
0                      BCEAO  
1  National Bank of Ethiopia  
2      Central Bank of Kenya  
3             Bank of Uganda  
4      Central Bank of Kenya  

LatAm DataFrame with proper column names:
Shape: (600, 6)
Columns: ['Sending Country', 'Receiving Country', 'Year', 'Value', 'Unit', 'Source']

First 5 rows:
  Sending Country Receiving

In [44]:
# Forward fill missing values for africa_final - handle both NaN and empty strings
if 'africa_final' in locals():
    print("Before forward fill:")
    print(africa_final.head(10))
    print("\nMissing values per column before forward fill:")
    print(africa_final.isnull().sum())
    
    # Check for empty strings and blanks too
    print("\nEmpty/blank strings per column:")
    for col in africa_final.columns:
        empty_count = (africa_final[col] == '').sum() + (africa_final[col] == ' ').sum()
        print(f"{col}: {empty_count}")

    # First, replace empty strings and whitespace-only strings with NaN
    africa_final_clean = africa_final.copy()
    for col in africa_final_clean.columns:
        if africa_final_clean[col].dtype == 'object':  # Only for text columns
            # Replace empty strings, single spaces, and whitespace-only strings with NaN
            africa_final_clean[col] = africa_final_clean[col].replace(['', ' ', '  ', '   '], pd.NA)
            # Also replace strings that are just whitespace
            africa_final_clean[col] = africa_final_clean[col].apply(
                lambda x: pd.NA if isinstance(x, str) and x.strip() == '' else x
            )

    print("\nAfter converting empty strings to NaN:")
    print("Missing values per column:")
    print(africa_final_clean.isnull().sum())

    # Now apply forward fill to replace missing values
    africa_final_filled = africa_final_clean.ffill()

    print("\n" + "="*50)
    print("After forward fill:")
    print(africa_final_filled.head(10))
    print("\nMissing values per column after forward fill:")
    print(africa_final_filled.isnull().sum())

    # Update africa_final with the filled version
    africa_final = africa_final_filled
    print("\nAfrica DataFrame has been updated with forward-filled values!")
    
    # Display the final result
    africa_final

Before forward fill:
  Sending Country Receiving Country  Year              Value          Unit  \
0         Algeria           Senegal  2021        0.183414825  USD millions   
1       Australia          Ethiopia  2020        13.59617511  USD millions   
2                             Kenya  2024  184,497.099695719  USD millions   
3                            Uganda  2022                 22  USD millions   
4         Austria             Kenya  2024   13,169.065145833  USD millions   
5         Bahamas             Kenya  2024    1,453.632639905  USD millions   
6         Bahrain             Kenya  2024    5,004.769090469  USD millions   
7                           Morocco  2020         30.3408085  USD millions   
8         Belgium             Kenya  2024   22,844.654997935  USD millions   
9                           Morocco  2020         387.629403  USD millions   

                      Source  
0                      BCEAO  
1  National Bank of Ethiopia  
2      Central Bank of Keny

In [45]:
africa_final

Unnamed: 0,Sending Country,Receiving Country,Year,Value,Unit,Source
0,Algeria,Senegal,2021,0.183414825,USD millions,BCEAO
1,Australia,Ethiopia,2020,13.59617511,USD millions,National Bank of Ethiopia
2,Australia,Kenya,2024,184497.099695719,USD millions,Central Bank of Kenya
3,Australia,Uganda,2022,22,USD millions,Bank of Uganda
4,Austria,Kenya,2024,13169.065145833,USD millions,Central Bank of Kenya
...,...,...,...,...,...,...
123,United States,Kenya,2024,2637772.27432718,USD millions,Central Bank of Kenya
124,United States,Morocco,2020,422.388121,USD millions,Bank Al-Maghrib
125,United States,Senegal,2021,479.695927280,USD millions,BCEAO
126,United States,Uganda,2022,184.5,USD millions,Bank of Uganda


In [46]:
# Complete processing for latam_final - apply all the same steps as africa_final
if 'latam_final' in locals():
    print("=== COMPLETE LATAM PROCESSING ===")
    
    # Step 1: Show original state
    print("Step 1: Original LatAm DataFrame:")
    print(f"Shape: {latam_final.shape}")
    print(f"Columns: {list(latam_final.columns)}")
    print("\nFirst 10 rows:")
    print(latam_final.head(10))
    
    # Step 2: Check for missing values and empty strings
    print("\nStep 2: Missing values per column (NaN):")
    print(latam_final.isnull().sum())
    
    print("\nEmpty/blank strings per column:")
    for col in latam_final.columns:
        empty_count = (latam_final[col] == '').sum() + (latam_final[col] == ' ').sum()
        print(f"{col}: {empty_count}")

    # Step 3: Convert empty strings to NaN
    print("\nStep 3: Converting empty strings and blanks to NaN...")
    latam_final_clean = latam_final.copy()
    for col in latam_final_clean.columns:
        if latam_final_clean[col].dtype == 'object':  # Only for text columns
            # Replace empty strings, single spaces, and whitespace-only strings with NaN
            latam_final_clean[col] = latam_final_clean[col].replace(['', ' ', '  ', '   '], pd.NA)
            # Also replace strings that are just whitespace
            latam_final_clean[col] = latam_final_clean[col].apply(
                lambda x: pd.NA if isinstance(x, str) and x.strip() == '' else x
            )

    print("After converting empty strings to NaN:")
    print("Missing values per column:")
    print(latam_final_clean.isnull().sum())

    # Step 4: Apply forward fill
    print("\nStep 4: Applying forward fill...")
    latam_final_filled = latam_final_clean.ffill()

    print("After forward fill:")
    print("Missing values per column:")
    print(latam_final_filled.isnull().sum())

    # Step 5: Additional data cleaning (same as done for africa)
    print("\nStep 5: Additional data cleaning...")
    
    # Clean text columns further
    for col in latam_final_filled.columns:
        if latam_final_filled[col].dtype == 'object':
            # Remove any remaining tabs and normalize spaces
            latam_final_filled[col] = latam_final_filled[col].astype(str).str.replace('\t', ' ', regex=False)
            latam_final_filled[col] = latam_final_filled[col].str.strip()
            # Replace 'nan' strings with actual NaN
            latam_final_filled[col] = latam_final_filled[col].replace(['nan', 'None'], pd.NA)

    # Step 6: Final result
    print("\nStep 6: Final LatAm DataFrame:")
    print(f"Shape: {latam_final_filled.shape}")
    print(f"Columns: {list(latam_final_filled.columns)}")
    print("\nFirst 10 rows after complete processing:")
    print(latam_final_filled.head(10))
    print("\nFinal missing values per column:")
    print(latam_final_filled.isnull().sum())

    # Update latam_final with the fully processed version
    latam_final = latam_final_filled
    print("\n" + "="*60)
    print("LatAm DataFrame has been completely processed!")
    print("All the same cleaning steps applied to Africa have been applied to LatAm.")
    print("="*60)
    
    # Display the final result
    latam_final

=== COMPLETE LATAM PROCESSING ===
Step 1: Original LatAm DataFrame:
Shape: (600, 6)
Columns: ['Sending Country', 'Receiving Country', 'Year', 'Value', 'Unit', 'Source']

First 10 rows:
  Sending Country Receiving Country  Year        Value          Unit  \
0     Afghanistan           Ecuador  2021   0.03859066  USD millions   
1                            Mexico  2022     0.295915  USD millions   
2                            Panama  2022  0.000387472  USD millions   
3         Albania           Ecuador  2021   1.89796382  USD millions   
4                            Mexico  2022     0.023831  USD millions   
5                            Panama  2022  0.038935399  USD millions   
6  American Samoa           Ecuador  2021    0.0553291  USD millions   
7          Angola            Brazil  2022   8.32422068  USD millions   
8                           Ecuador  2021   0.00924998  USD millions   
9                            Mexico  2022     0.021185  USD millions   

              Source  

In [47]:
latam_final

Unnamed: 0,Sending Country,Receiving Country,Year,Value,Unit,Source
0,Afghanistan,Ecuador,2021,0.03859066,USD millions,Central Bank Data
1,Afghanistan,Mexico,2022,0.295915,USD millions,Central Bank Data
2,Afghanistan,Panama,2022,0.000387472,USD millions,INEC
3,Albania,Ecuador,2021,1.89796382,USD millions,Central Bank Data
4,Albania,Mexico,2022,0.023831,USD millions,Central Bank Data
...,...,...,...,...,...,...
595,Yemen,Mexico,2022,0.005701,USD millions,Central Bank Data
596,Zambia,Ecuador,2021,0.01060072,USD millions,Central Bank Data
597,Zambia,Mexico,2022,0.00067,USD millions,Central Bank Data
598,Zimbabwe,Ecuador,2021,0.001468,USD millions,Central Bank Data


In [48]:
# Combine the processed Africa and LatAm DataFrames into one comprehensive dataset
if 'africa_final' in locals() and 'latam_final' in locals():
    print("=== COMBINING AFRICA AND LATAM DATAFRAMES ===")
    
    # Add region identifier to each DataFrame before combining
    africa_with_region = africa_final.copy()
    africa_with_region['Region'] = 'Africa'
    
    latam_with_region = latam_final.copy()
    latam_with_region['Region'] = 'Latin America'
    
    # Combine both DataFrames
    combined_final = pd.concat([africa_with_region, latam_with_region], ignore_index=True)
    
    print("Combined DataFrame Information:")
    print(f"Total shape: {combined_final.shape}")
    print(f"Africa rows: {len(africa_with_region)}")
    print(f"LatAm rows: {len(latam_with_region)}")
    print(f"Total rows: {len(combined_final)}")
    
    print(f"\nColumns: {list(combined_final.columns)}")
    
    # Show summary statistics
    print(f"\nSummary Statistics:")
    print(f"Unique sending countries: {combined_final['Sending Country'].nunique()}")
    print(f"Unique receiving countries: {combined_final['Receiving Country'].nunique()}")
    print(f"Year range: {combined_final['Year'].min()} to {combined_final['Year'].max()}")
    
    print(f"\nData by Region:")
    print(combined_final['Region'].value_counts())
    
    print(f"\nMissing values per column:")
    print(combined_final.isnull().sum())
    
    print(f"\nFirst 5 rows from each region:")
    print("\nAfrica sample:")
    print(combined_final[combined_final['Region'] == 'Africa'].head())
    print("\nLatin America sample:")
    print(combined_final[combined_final['Region'] == 'Latin America'].head())
    
    print("\n" + "="*60)
    print("SUCCESS! Combined DataFrame 'combined_final' is ready!")
    print("This DataFrame contains all processed remittance data from both regions.")
    print("Perfect for Data Wrangler and comprehensive analysis!")
    print("="*60)
    
    # Display the combined DataFrame
    combined_final

else:
    print("Error: Both africa_final and latam_final DataFrames need to be available for combining.")

=== COMBINING AFRICA AND LATAM DATAFRAMES ===
Combined DataFrame Information:
Total shape: (728, 7)
Africa rows: 128
LatAm rows: 600
Total rows: 728

Columns: ['Sending Country', 'Receiving Country', 'Year', 'Value', 'Unit', 'Source', 'Region']

Summary Statistics:
Unique sending countries: 206
Unique receiving countries: 20
Year range: 2019 to 2024

Data by Region:
Region
Latin America    600
Africa           128
Name: count, dtype: int64

Missing values per column:
Sending Country      0
Receiving Country    0
Year                 0
Value                0
Unit                 0
Source               0
Region               0
dtype: int64

First 5 rows from each region:

Africa sample:
  Sending Country Receiving Country  Year              Value          Unit  \
0         Algeria           Senegal  2021        0.183414825  USD millions   
1       Australia          Ethiopia  2020        13.59617511  USD millions   
2       Australia             Kenya  2024  184,497.099695719  USD millio

In [49]:
combined_final

Unnamed: 0,Sending Country,Receiving Country,Year,Value,Unit,Source,Region
0,Algeria,Senegal,2021,0.183414825,USD millions,BCEAO,Africa
1,Australia,Ethiopia,2020,13.59617511,USD millions,National Bank of Ethiopia,Africa
2,Australia,Kenya,2024,184497.099695719,USD millions,Central Bank of Kenya,Africa
3,Australia,Uganda,2022,22,USD millions,Bank of Uganda,Africa
4,Austria,Kenya,2024,13169.065145833,USD millions,Central Bank of Kenya,Africa
...,...,...,...,...,...,...,...
723,Yemen,Mexico,2022,0.005701,USD millions,Central Bank Data,Latin America
724,Zambia,Ecuador,2021,0.01060072,USD millions,Central Bank Data,Latin America
725,Zambia,Mexico,2022,0.00067,USD millions,Central Bank Data,Latin America
726,Zimbabwe,Ecuador,2021,0.001468,USD millions,Central Bank Data,Latin America


In [50]:
# Save the combined_final DataFrame as CSV
if 'combined_final' in locals():
    # Define the file path
    csv_file_path = r"C:\Users\clint\Desktop\RER\code\12.csv"
    
    # Save to CSV
    combined_final.to_csv(csv_file_path, index=False)
    
    print(f"✅ Successfully saved combined_final DataFrame to: {csv_file_path}")
    print(f"📊 File contains {len(combined_final)} rows and {len(combined_final.columns)} columns")
    print(f"📁 File size: {os.path.getsize(csv_file_path)} bytes")
    
    # Verify the file was created
    if os.path.exists(csv_file_path):
        print(f"✅ File verified: {csv_file_path} exists")
        
        # Show a quick preview of what was saved
        print(f"\n📋 Quick preview of saved data:")
        print(f"Columns: {list(combined_final.columns)}")
        print(f"Shape: {combined_final.shape}")
        print(f"Regions: {combined_final['Region'].value_counts().to_dict()}")
    else:
        print(f"❌ Error: File was not created at {csv_file_path}")
        
else:
    print("❌ Error: combined_final DataFrame not found. Please run the previous cells first.")

✅ Successfully saved combined_final DataFrame to: C:\Users\clint\Desktop\RER\code\12.csv
📊 File contains 728 rows and 7 columns
📁 File size: 56000 bytes
✅ File verified: C:\Users\clint\Desktop\RER\code\12.csv exists

📋 Quick preview of saved data:
Columns: ['Sending Country', 'Receiving Country', 'Year', 'Value', 'Unit', 'Source', 'Region']
Shape: (728, 7)
Regions: {'Latin America': 600, 'Africa': 128}
