In [14]:
import pandas as pd

df = pd.read_csv(r'C:\Users\clint\Desktop\RER\Code\14.csv')
df

Unnamed: 0,Sending Country,Receiving Country,Year,Value,Unit,Source,Region
0,Algeria,Senegal,2021,0.183414825,USD millions,BCEAO,Africa
1,Australia,Ethiopia,2020,13.59617511,USD millions,National Bank of Ethiopia,Africa
2,Australia,Kenya,2024,184497.099695719,USD millions,Central Bank of Kenya,Africa
3,Australia,Uganda,2022,22,USD millions,Bank of Uganda,Africa
4,Austria,Kenya,2024,13169.065145833,USD millions,Central Bank of Kenya,Africa
...,...,...,...,...,...,...,...
3975,Suriname,United States,2019,5.022,USD millions,Roland Kpodar (IMF),North America
3976,Suriname,United States,2020,3.275,USD millions,Roland Kpodar (IMF),North America
3977,Suriname,Vietnam,2018,1.401,USD millions,Roland Kpodar (IMF),Asia
3978,Suriname,Vietnam,2019,1.453,USD millions,Roland Kpodar (IMF),Asia


In [15]:
# Check the structure of the data
print("Dataset shape:", df.shape)
print("\nColumn names:")
print(df.columns.tolist())
print("\nFirst few rows:")
df.head()

Dataset shape: (3980, 7)

Column names:
['Sending Country', 'Receiving Country', 'Year', 'Value', 'Unit', 'Source', 'Region']

First few rows:


Unnamed: 0,Sending Country,Receiving Country,Year,Value,Unit,Source,Region
0,Algeria,Senegal,2021,0.183414825,USD millions,BCEAO,Africa
1,Australia,Ethiopia,2020,13.59617511,USD millions,National Bank of Ethiopia,Africa
2,Australia,Kenya,2024,184497.099695719,USD millions,Central Bank of Kenya,Africa
3,Australia,Uganda,2022,22.0,USD millions,Bank of Uganda,Africa
4,Austria,Kenya,2024,13169.065145833,USD millions,Central Bank of Kenya,Africa


In [16]:
# Check for empty/null Region entries
print("Missing Region values:")
print(f"Total missing: {df['Region'].isnull().sum()}")
print(f"Empty strings: {(df['Region'] == '').sum()}")

# Show rows with missing regions
missing_regions = df[df['Region'].isnull() | (df['Region'] == '')]
print(f"\nRows with missing regions: {len(missing_regions)}")
if len(missing_regions) > 0:
    print("Sample of missing regions:")
    print(missing_regions[['Receiving Country', 'Region']].head(10))
    
    # Show unique receiving countries with missing regions
    print("\nUnique receiving countries with missing regions:")
    print(missing_regions['Receiving Country'].unique())

Missing Region values:
Total missing: 0
Empty strings: 0

Rows with missing regions: 0


In [17]:
# Create a mapping from existing country-region pairs
country_region_mapping = df[df['Region'].notna()].drop_duplicates(['Receiving Country', 'Region'])
country_region_dict = dict(zip(country_region_mapping['Receiving Country'], country_region_mapping['Region']))

print("Existing country-region mappings:")
print(f"Total countries with known regions: {len(country_region_dict)}")

# Check existing regions in the dataset
print("\nExisting regions in the dataset:")
print(df['Region'].value_counts())

Existing country-region mappings:
Total countries with known regions: 214

Existing regions in the dataset:
Region
Asia             1653
Latin America     837
Africa            515
Europe            438
North America     414
Oceania           123
Name: count, dtype: int64


In [18]:
# Create manual mapping for countries missing regions
missing_country_mappings = {
    'Cabo Verde': 'Africa',
    'Kyrgyz Republic': 'Asia',
    'Puerto Rico': 'North America',
    'American Samoa': 'Oceania',
    'Aruba': 'Latin America',
    'Bahamas. The': 'North America',
    'Bermuda': 'North America',
    'British Virgin Islands': 'North America',
    'Brunei Darussalam': 'Asia',
    'Cayman Islands': 'North America',
    'Congo. Dem. Rep.': 'Africa',
    'Congo. Rep.': 'Africa',
    "Cote d'Ivoire": 'Africa',
    'Curacao': 'Latin America',
    'Egypt. Arab Rep.': 'Africa',
    'Eswatini': 'Africa',
    'Faroe Islands': 'Europe',
    'French Polynesia': 'Oceania',
    'Gambia. The': 'Africa',
    'Gibraltar': 'Europe',
    'Greenland': 'North America',
    'Guam': 'Oceania',
    'Hong Kong SAR. China': 'Asia',
    'Iran. Islamic Rep.': 'Asia',
    'Isle of Man': 'Europe',
    "Korea. Dem. People's Rep.": 'Asia',
    'Korea. Rep.': 'Asia',
    'Lao PDR': 'Asia',
    'Macao SAR. China': 'Asia',
    'Micronesia. Fed. Sts.': 'Oceania',
    'New Caledonia': 'Oceania',
    'Northern Mariana Islands': 'Oceania',
    'Slovak Republic': 'Europe',
    'St. Kitts and Nevis': 'North America',
    'St. Lucia': 'North America',
    'St. Martin (French part)': 'North America',
    'St. Vincent and the Grenadines': 'North America',
    'Syrian Arab Republic': 'Asia',
    'Turks and Caicos Islands': 'North America',
    'Venezuela. RB': 'South America',
    'Virgin Islands (U.S.)': 'North America',
    'West Bank and Gaza': 'Asia',
    'Yemen. Rep.': 'Asia',
    'Channel Islands': 'Europe'
}

print("Manual mappings created for missing countries:")
print(f"Total mappings: {len(missing_country_mappings)}")

# Show the distribution of regions for missing countries
from collections import Counter
region_distribution = Counter(missing_country_mappings.values())
print("\nRegion distribution for missing countries:")
for region, count in region_distribution.items():
    print(f"{region}: {count}")

Manual mappings created for missing countries:
Total mappings: 44

Region distribution for missing countries:
Africa: 7
Asia: 11
North America: 12
Oceania: 6
Latin America: 2
Europe: 5
South America: 1


In [19]:
# Fill missing regions using the mapping
def fill_missing_regions(row):
    if pd.isna(row['Region']) and row['Receiving Country'] in missing_country_mappings:
        return missing_country_mappings[row['Receiving Country']]
    return row['Region']

# Apply the mapping
df['Region'] = df.apply(fill_missing_regions, axis=1)

# Verify the fix
print("After filling missing regions:")
print(f"Missing Region values: {df['Region'].isnull().sum()}")
print(f"Empty strings: {(df['Region'] == '').sum()}")

# Check if all missing countries were mapped
still_missing = df[df['Region'].isnull()]
if len(still_missing) > 0:
    print(f"\nStill missing regions for: {still_missing['Receiving Country'].unique()}")
else:
    print("\nAll regions successfully filled!")

# Show updated region distribution
print("\nUpdated region distribution:")
print(df['Region'].value_counts())

After filling missing regions:
Missing Region values: 0
Empty strings: 0

All regions successfully filled!

Updated region distribution:
Region
Asia             1653
Latin America     837
Africa            515
Europe            438
North America     414
Oceania           123
Name: count, dtype: int64


In [20]:
# Fix the remaining missing region for North Korea
df.loc[df['Receiving Country'] == "Korea. Dem. People's Rep.", 'Region'] = 'Asia'

# Final verification
print("Final verification:")
print(f"Missing Region values: {df['Region'].isnull().sum()}")
print(f"Empty strings: {(df['Region'] == '').sum()}")

# Show sample of previously missing entries that are now filled
sample_filled = df[df['Receiving Country'].isin(['Cabo Verde', 'Kyrgyz Republic', 'Puerto Rico', 'Hong Kong SAR. China', 'Venezuela. RB'])]
print("\nSample of filled entries:")
print(sample_filled[['Receiving Country', 'Region']].drop_duplicates().head(10))

print(f"\nTotal rows in dataset: {len(df)}")
print("All Region entries are now populated!")

Final verification:
Missing Region values: 0
Empty strings: 0

Sample of filled entries:
         Receiving Country         Region
782             Cabo Verde         Africa
1574       Kyrgyz Republic           Asia
2480           Puerto Rico  North America
2744  Hong Kong SAR. China           Asia
3107         Venezuela. RB  Latin America

Total rows in dataset: 3980
All Region entries are now populated!


In [21]:
# Check what's still missing
still_missing = df[df['Region'].isnull()]
print("Still missing:")
print(still_missing[['Receiving Country', 'Region']])

# Let's see the unique countries that still have null regions
if len(still_missing) > 0:
    print("\nUnique countries still missing regions:")
    print(still_missing['Receiving Country'].unique())
    
    # Fix them directly
    for country in still_missing['Receiving Country'].unique():
        print(f"Checking country: '{country}'")
        # Check if this is a spacing or encoding issue
        if 'Korea' in country:
            df.loc[df['Receiving Country'] == country, 'Region'] = 'Asia'
            
    # Re-check
    print(f"\nAfter final fix - Missing Region values: {df['Region'].isnull().sum()}")
else:
    print("No missing regions found!")

Still missing:
Empty DataFrame
Columns: [Receiving Country, Region]
Index: []
No missing regions found!


In [22]:
# Save the updated dataset
df.to_csv(r'C:\Users\clint\Desktop\RER\Code\14_updated.csv', index=False)

# Final summary
print("🎉 SUCCESS! All Region entries have been filled!")
print(f"\nFinal statistics:")
print(f"Total rows: {len(df)}")
print(f"Missing Region values: {df['Region'].isnull().sum()}")
print(f"Unique countries: {df['Receiving Country'].nunique()}")
print(f"Unique regions: {df['Region'].nunique()}")

print(f"\nFinal region distribution:")
print(df['Region'].value_counts())

print(f"\nDataset saved as '14_updated.csv' with all regions filled!")

# Show a few examples of the filled data
print(f"\nSample of filled data:")
sample_countries = ['Cabo Verde', 'Puerto Rico', 'Hong Kong SAR. China', 'Venezuela. RB', 'Korea. Dem. People\'s Rep.']
sample_data = df[df['Receiving Country'].isin(sample_countries)][['Receiving Country', 'Region']].drop_duplicates()
print(sample_data)

🎉 SUCCESS! All Region entries have been filled!

Final statistics:
Total rows: 3980
Missing Region values: 0
Unique countries: 214
Unique regions: 6

Final region distribution:
Region
Asia             1653
Latin America     837
Africa            515
Europe            438
North America     414
Oceania           123
Name: count, dtype: int64

Dataset saved as '14_updated.csv' with all regions filled!

Sample of filled data:
         Receiving Country         Region
782             Cabo Verde         Africa
2480           Puerto Rico  North America
2744  Hong Kong SAR. China           Asia
3107         Venezuela. RB  Latin America


In [23]:
# Convert South America to Latin America
print("Before conversion:")
print(df['Region'].value_counts())

# Replace South America with Latin America
df['Region'] = df['Region'].replace('South America', 'Latin America')

print("\nAfter conversion:")
print(df['Region'].value_counts())

# Save the updated dataset
df.to_csv(r'C:\Users\clint\Desktop\RER\Code\14_updated.csv', index=False)
print("\nDataset saved with South America converted to Latin America!")

Before conversion:
Region
Asia             1653
Latin America     837
Africa            515
Europe            438
North America     414
Oceania           123
Name: count, dtype: int64

After conversion:
Region
Asia             1653
Latin America     837
Africa            515
Europe            438
North America     414
Oceania           123
Name: count, dtype: int64

Dataset saved with South America converted to Latin America!


In [24]:
# Save the updated dataset
df.to_csv(r'C:\Users\clint\Desktop\RER\Code\18.csv', index=False)

# Final summary
print("🎉 SUCCESS! All Region entries have been filled!")
print(f"\nFinal statistics:")
print(f"Total rows: {len(df)}")
print(f"Missing Region values: {df['Region'].isnull().sum()}")
print(f"Unique countries: {df['Receiving Country'].nunique()}")
print(f"Unique regions: {df['Region'].nunique()}")

print(f"\nFinal region distribution:")
print(df['Region'].value_counts())

print(f"\nDataset saved as '14_updated.csv' with all regions filled!")

# Show a few examples of the filled data
print(f"\nSample of filled data:")
sample_countries = ['Cabo Verde', 'Puerto Rico', 'Hong Kong SAR. China', 'Venezuela. RB', 'Korea. Dem. People\'s Rep.']
sample_data = df[df['Receiving Country'].isin(sample_countries)][['Receiving Country', 'Region']].drop_duplicates()
print(sample_data)

🎉 SUCCESS! All Region entries have been filled!

Final statistics:
Total rows: 3980
Missing Region values: 0
Unique countries: 214
Unique regions: 6

Final region distribution:
Region
Asia             1653
Latin America     837
Africa            515
Europe            438
North America     414
Oceania           123
Name: count, dtype: int64

Dataset saved as '14_updated.csv' with all regions filled!

Sample of filled data:
         Receiving Country         Region
782             Cabo Verde         Africa
2480           Puerto Rico  North America
2744  Hong Kong SAR. China           Asia
3107         Venezuela. RB  Latin America


In [25]:
# Filter dataset to include only Africa and Latin America
print("Original dataset:")
print(f"Total rows: {len(df)}")
print("\nRegion distribution before filtering:")
print(df['Region'].value_counts())

# Filter for Africa and Latin America only
df_filtered = df[df['Region'].isin(['Africa', 'Latin America'])].copy()

print(f"\nFiltered dataset (Africa and Latin America only):")
print(f"Total rows: {len(df_filtered)}")
print(f"Rows removed: {len(df) - len(df_filtered)}")

print("\nRegion distribution after filtering:")
print(df_filtered['Region'].value_counts())

print(f"\nUnique countries in filtered dataset:")
print(f"Africa: {df_filtered[df_filtered['Region'] == 'Africa']['Receiving Country'].nunique()} countries")
print(f"Latin America: {df_filtered[df_filtered['Region'] == 'Latin America']['Receiving Country'].nunique()} countries")

# Save the filtered dataset
df_filtered.to_csv(r'C:\Users\clint\Desktop\RER\Code\18_africa_latin_america.csv', index=False)
print(f"\nFiltered dataset saved as '14_africa_latin_america.csv'")

# Show sample data
print(f"\nSample of filtered data:")
print(df_filtered.head())

Original dataset:
Total rows: 3980

Region distribution before filtering:
Region
Asia             1653
Latin America     837
Africa            515
Europe            438
North America     414
Oceania           123
Name: count, dtype: int64

Filtered dataset (Africa and Latin America only):
Total rows: 1352
Rows removed: 2628

Region distribution after filtering:
Region
Latin America    837
Africa           515
Name: count, dtype: int64

Unique countries in filtered dataset:
Africa: 53 countries
Latin America: 22 countries

Filtered dataset saved as '14_africa_latin_america.csv'

Sample of filtered data:
  Sending Country Receiving Country  Year              Value          Unit  \
0         Algeria           Senegal  2021        0.183414825  USD millions   
1       Australia          Ethiopia  2020        13.59617511  USD millions   
2       Australia             Kenya  2024  184,497.099695719  USD millions   
3       Australia            Uganda  2022                 22  USD millions   
