In [36]:
import pandas as pd

countries = pd.read_csv("Countries1.csv")
country_codes = countries["Alpha2"].tolist() # Get country code column

In [37]:
import requests
from pathlib import Path

session = requests.Session()

base_url = "https://data.countrydata.iatistandard.org/output/web/xlsx/en"
out_dir = Path("data/country_xlsx")
out_dir.mkdir(parents=True, exist_ok=True)

failed = []

for code in country_codes:
    url = f"{base_url}/{code}.xlsx"
    out_path = out_dir / f"{code}.xlsx"

    if out_path.exists():
        continue

    try:
        r = session.get(url, timeout=60)
        r.raise_for_status()

        with open(out_path, "wb") as f:
            f.write(r.content)

    except requests.exceptions.HTTPError:
        failed.append(code)

print("Done.")
print("Failed codes:", failed)


Done.
Failed codes: ['JG', nan]


In [38]:
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Configuration
INPUT_DIR = Path("data/country_xlsx")
OUTPUT_DIR = Path("data/cleaned_country_xlsx")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Columns to remove
COLUMNS_TO_REMOVE = ['Humanitarian', 'Multi Country', 'URL', 'Value (EUR)', 'Value (Local currrency)']

# Expected date format for database
DATE_FORMAT = '%Y-%m-%d'


def clean_text_field(text): # Standardize text fields: trim whitespace, normalize encoding

    if pd.isna(text):
        return text

    text = str(text).strip()

    # Handle 'No data' or similar placeholders
    if text.lower() in ['no data', 'n/a', 'na', 'null', '']:
        return None

    return text


def normalize_country_name(country):
    # Normalize country spellings and names
    # Common corrections for misspellings

    if pd.isna(country):
        return country

    country = str(country).strip()

    # Dictionary of common misspellings/variations
    corrections = {
        'Turkiye': 'Turkey',
        'Türkiye': 'Turkey',
        'Ivory Coast': "Côte d'Ivoire",
        'Congo DRC': 'Democratic Republic of the Congo',
        'Congo Republic': 'Republic of the Congo',
    }

    for wrong, correct in corrections.items():
        if wrong.lower() in country.lower():
            return correct

    return country


def validate_numeric_values(df):
    # Validate that numeric fields have acceptable values
    # Remove/correct impossible values (e.g., negative values where positive expected)

    # Value columns should not be negative (assuming all transactions are positive)
    value_columns = ['Value (USD)']

    for col in value_columns:
        if col in df.columns:
            # Flag negative values
            negative_mask = df[col] < 0
            if negative_mask.any():
                print(f"  WARNING: Found {negative_mask.sum()} negative values in {col}")
                # Option 1: Set to absolute value
                # df.loc[negative_mask, col] = df.loc[negative_mask, col].abs()
                # Option 2: Set to NaN
                df.loc[negative_mask, col] = np.nan

    return df


def remove_duplicates(df): #Remove duplicate rows based on key columns

    # Define key columns that should be unique
    # Rows that are duplicated in all these columns will be considered duplicates
    key_columns = [
        'IATI Identifier',
        'Transaction Type',
        'Calendar Year',
        'Calendar Quarter',
        'Value (USD)'
    ]

    # Check columns exist
    existing_keys = [col for col in key_columns if col in df.columns]

    initial_count = len(df)
    df = df.drop_duplicates(subset=existing_keys, keep='first')
    removed = initial_count - len(df)

    if removed > 0:
        print(f"  Removed {removed} duplicate rows")

    return df


def clean_and_transform_data(file_path):
    # Main cleaning and transformation function for a single country file

    country_code = file_path.stem  # e.g., 'AF' from 'AF.xlsx'
    print(f"\nProcessing: {country_code}.xlsx")

    try:
        # Read the Excel file
        df = pd.read_excel(file_path)
        print(f"  Initial rows: {len(df)}")

        # Remove specified columns
        columns_to_drop = [col for col in COLUMNS_TO_REMOVE if col in df.columns]
        if columns_to_drop:
            df = df.drop(columns=columns_to_drop)
            print(f"  Removed columns: {', '.join(columns_to_drop)}")

        # Clean text fields
        text_columns = df.select_dtypes(include=['object']).columns
        for col in text_columns:
            df[col] = df[col].apply(clean_text_field)

        # Normalize country names
        if 'Recipient Country or Region' in df.columns:
            df['Recipient Country or Region'] = df['Recipient Country or Region'].apply(
                normalize_country_name
            )

        # Validate numeric values
        df = validate_numeric_values(df)

        # Remove duplicates
        df = remove_duplicates(df)

        # Handle missing values in key fields
        critical_fields = ['IATI Identifier', 'Transaction Type', 'Calendar Year']
        before_drop = len(df)
        df = df.dropna(subset=critical_fields)
        dropped = before_drop - len(df)
        if dropped > 0:
            print(f"Dropped {dropped} rows with missing critical fields")

        # Sort by date and identifier
        sort_columns = ['Calendar Year and Quarter', 'IATI Identifier']
        existing_sort = [col for col in sort_columns if col in df.columns]
        if existing_sort:
            df = df.sort_values(by=existing_sort)

        # Reset index
        df = df.reset_index(drop=True)

        # Save cleaned data
        output_file = OUTPUT_DIR / f"{country_code}_cleaned.csv"
        df.to_csv(output_file, index=False, encoding='utf-8')
        print(f"  Final rows: {len(df)}")
        print(f"  Saved to: {output_file.name}")

        return {
            'country': country_code,
            'initial_rows': len(df),
            'final_rows': len(df),
            'success': True
        }

    except Exception as e:
        print(f"  ERROR: {str(e)}")
        return {
            'country': country_code,
            'success': False,
            'error': str(e)
        }

def main():
    # Main execution function

    print("="*60)
    print("IATI DATA CLEANING AND TRANSFORMATION")
    print("="*60)
    print(f"Input directory: {INPUT_DIR}")
    print(f"Output directory: {OUTPUT_DIR}")

    # Get all Excel files (country files)
    excel_files = list(INPUT_DIR.glob("*.xlsx"))

    if not excel_files:
        print(f"\nERROR: File not found: {excel_files}")
        return
    

    for file_path in excel_files:
        # Build expected cleaned filename
        cleaned_name = file_path.stem + "_cleaned.csv"
        cleaned_path = OUTPUT_DIR / cleaned_name

        if cleaned_path.exists():
            continue

        print(f"Processing file: {file_path.name}")
        clean_and_transform_data(file_path)

    print("\n" + "="*60)
    print("PROCESSING COMPLETE!")
    print("="*60)


if __name__ == "__main__":
    main()

IATI DATA CLEANING AND TRANSFORMATION
Input directory: data/country_xlsx
Output directory: data/cleaned_country_xlsx

PROCESSING COMPLETE!


In [39]:
# Configuration
INPUT_DIR = Path("data/cleaned_country_xlsx")
DIMENSION_DIR = INPUT_DIR / "dimensions"
FACT_DIR = INPUT_DIR / "facts"

# Create directories
DIMENSION_DIR.mkdir(parents=True, exist_ok=True)
FACT_DIR.mkdir(parents=True, exist_ok=True)

print("="*70)
print("BUILDING DIMENSIONAL MODEL FROM CLEANED DATA")
print("="*70)
print(f"Input: {INPUT_DIR}")
print(f"Dimensions output: {DIMENSION_DIR}")
print(f"Facts output: {FACT_DIR}\n")

# LOAD ALL CLEANED CSV FILES

print("="*70)
print("LOADING CLEANED CSV FILES")
print("="*70)

# Find all cleaned country CSV files
cleaned_files = list(INPUT_DIR.glob("*_cleaned.csv"))

if not cleaned_files:
    print("\nERROR: No cleaned CSV files found!")
    print("Please run the cleaning script first.")
    exit(1)

print(f"\nFound {len(cleaned_files)} cleaned files")

# Load all cleaned files into a list
all_cleaned_data = []
for file in cleaned_files[:10]:
    df = pd.read_csv(file)
    all_cleaned_data.append(df)
    print(f"  Loaded {file.name}: {len(df):,} rows")

print(f"\n Loaded {len(all_cleaned_data)} files successfully")

# Rest of the code is exactly the same as document 2...
# (Copy all the dimension creation functions and main() from document 2)

# DIMENSION TABLE CREATION FUNCTIONS

def create_dim_time(all_data_list):
    print("\nCreating dimTime DataFrame...")
    dates = pd.date_range(start='2000-01-01', end='2030-12-31', freq='D')

    dim_time = pd.DataFrame({
        'Time_Key': dates.strftime('%Y%m%d').astype(int),
        'Calendar_Year': dates.year,
        'Calendar_Quarter': 'Q' + dates.quarter.astype(str),
        'Year_Quarter_Label': dates.year.astype(str) + ' Q' + dates.quarter.astype(str)
    })

    output_file = DIMENSION_DIR / "dimTime.csv"
    dim_time.to_csv(output_file, index=False)
    print(f"   Created DataFrame with {len(dim_time):,} rows")
    return dim_time


def create_dim_aid_type(all_data_list):
    print("\nCreating dimAidType DataFrame...")
    combined = pd.concat(all_data_list, ignore_index=True)

    dim_aid_type = combined[['Aid Type']].drop_duplicates()
    dim_aid_type = dim_aid_type[dim_aid_type['Aid Type'].notna()]
    dim_aid_type = dim_aid_type.rename(columns={'Aid Type': 'Aid_Type'})
    dim_aid_type['Aid_Type_Key'] = range(1, len(dim_aid_type) + 1)
    dim_aid_type = dim_aid_type[['Aid_Type_Key', 'Aid_Type']]

    output_file = DIMENSION_DIR / "dimAidType.csv"
    dim_aid_type.to_csv(output_file, index=False)
    print(f"   Created DataFrame with {len(dim_aid_type):,} rows")
    return dim_aid_type


def create_dim_finance_type(all_data_list):
    print("\nCreating dimFinanceType DataFrame...")
    combined = pd.concat(all_data_list, ignore_index=True)

    dim_finance_type = combined[['Finance Type']].drop_duplicates()
    dim_finance_type = dim_finance_type[dim_finance_type['Finance Type'].notna()]
    dim_finance_type = dim_finance_type.rename(columns={'Finance Type': 'Finance_Category'})
    dim_finance_type['Finance_Type_Key'] = range(1, len(dim_finance_type) + 1)
    dim_finance_type = dim_finance_type[['Finance_Type_Key', 'Finance_Category']]

    output_file = DIMENSION_DIR / "dimFinanceType.csv"
    dim_finance_type.to_csv(output_file, index=False)
    print(f"   Created DataFrame with {len(dim_finance_type):,} rows")
    return dim_finance_type


def create_dim_flow_type(all_data_list):
    print("\nCreating dimFlowType DataFrame...")
    combined = pd.concat(all_data_list, ignore_index=True)

    dim_flow_type = combined[['Flow Type']].drop_duplicates()
    dim_flow_type = dim_flow_type[dim_flow_type['Flow Type'].notna()]
    dim_flow_type = dim_flow_type.rename(columns={'Flow Type': 'Flow_Category'})
    dim_flow_type['Flow_Type_Key'] = range(1, len(dim_flow_type) + 1)
    dim_flow_type = dim_flow_type[['Flow_Type_Key', 'Flow_Category']]

    output_file = DIMENSION_DIR / "dimFlowType.csv"
    dim_flow_type.to_csv(output_file, index=False)
    print(f"   Created DataFrame with {len(dim_flow_type):,} rows")
    return dim_flow_type


def create_dim_task(all_data_list):
    print("\nCreating dimTask DataFrame...")
    combined = pd.concat(all_data_list, ignore_index=True)

    dim_task = combined[['IATI Identifier', 'Title']].drop_duplicates()
    dim_task = dim_task[dim_task['IATI Identifier'].notna()]
    dim_task = dim_task.rename(columns={
        'IATI Identifier': 'IATI_Identifier',
        'Title': 'Title'
    })
    dim_task['Task_Key'] = range(1, len(dim_task) + 1)
    dim_task = dim_task[['Task_Key', 'Title', 'IATI_Identifier']]

    output_file = DIMENSION_DIR / "dimTask.csv"
    dim_task.to_csv(output_file, index=False)
    print(f"   Created DataFrame with {len(dim_task):,} rows")
    return dim_task


def create_dim_provider_org(all_data_list):
    print("\nCreating dimProviderOrg DataFrame...")
    combined = pd.concat(all_data_list, ignore_index=True)

    dim_provider_org = combined[['Provider Organisation', 'Provider Organisation Type']].drop_duplicates()
    dim_provider_org = dim_provider_org[dim_provider_org['Provider Organisation'].notna()]
    dim_provider_org = dim_provider_org.rename(columns={
        'Provider Organisation': 'Provider_Org',
        'Provider Organisation Type': 'Provider_Org_Type'
    })
    dim_provider_org['Provider_Org_Key'] = range(1, len(dim_provider_org) + 1)
    dim_provider_org = dim_provider_org[['Provider_Org_Key', 'Provider_Org', 'Provider_Org_Type']]

    output_file = DIMENSION_DIR / "dimProviderOrg.csv"
    dim_provider_org.to_csv(output_file, index=False)
    print(f"   Created DataFrame with {len(dim_provider_org):,} rows")
    return dim_provider_org


def create_dim_reporting_org(all_data_list):
    print("\nCreating dimReportingOrg DataFrame...")
    combined = pd.concat(all_data_list, ignore_index=True)

    dim_reporting_org = combined[[
        'Reporting Organisation',
        'Reporting Organisation Type',
        'Reporting Organisation Group'
    ]].drop_duplicates()
    dim_reporting_org = dim_reporting_org[dim_reporting_org['Reporting Organisation'].notna()]
    dim_reporting_org = dim_reporting_org.rename(columns={
        'Reporting Organisation': 'Reporting_Org',
        'Reporting Organisation Type': 'Reporting_Org_Type',
        'Reporting Organisation Group': 'Reporting_Org_Group'
    })
    dim_reporting_org['Reporting_Org_Key'] = range(1, len(dim_reporting_org) + 1)
    dim_reporting_org = dim_reporting_org[[
        'Reporting_Org_Key',
        'Reporting_Org',
        'Reporting_Org_Type',
        'Reporting_Org_Group'
    ]]

    output_file = DIMENSION_DIR / "dimReportingOrg.csv"
    dim_reporting_org.to_csv(output_file, index=False)
    print(f"   Created DataFrame with {len(dim_reporting_org):,} rows")
    return dim_reporting_org


def create_dim_recipient_country(all_data_list):
    print("\nCreating dimRecipientCountry DataFrame...")
    combined = pd.concat(all_data_list, ignore_index=True)

    dim_country = (
        combined[['Recipient Country or Region']]
        .drop_duplicates()
        .dropna()
        .rename(columns={'Recipient Country or Region': 'Country_Name'})
    )

    dim_country['iso_alpha2'] = None  # placeholder if not available

    dim_country['Population'] = None
    dim_country['Life_Expectancy'] = None
    dim_country['GDP_USD'] = None

    dim_country['Recipient_Country_Key'] = range(1, len(dim_country) + 1)

    dim_country = dim_country[[
        'Recipient_Country_Key',
        'Country_Name',
        'iso_alpha2',
        'Population',
        'Life_Expectancy',
        'GDP_USD'
    ]]

    output_file = DIMENSION_DIR / "dimRecipientCountry.csv"
    dim_country.to_csv(output_file, index=False)
    print(f"   Created DataFrame with {len(dim_country):,} rows")

    return dim_country



def create_dim_recipient_org(all_data_list, dim_country):
    print("\nCreating dimRecipientOrg DataFrame...")
    combined = pd.concat(all_data_list, ignore_index=True)

    dim_recipient_org = combined[[
        'Receiver Organisation',
        'Receiver Organisation Type',
        'Recipient Country or Region'
    ]].drop_duplicates()

    dim_recipient_org = dim_recipient_org[dim_recipient_org['Receiver Organisation'].notna()]
    dim_recipient_org = dim_recipient_org.rename(columns={
        'Receiver Organisation': 'Recipient_Org',
        'Receiver Organisation Type': 'Recipient_Org_Type'
    })

    dim_recipient_org = dim_recipient_org.merge(
        dim_country[['Recipient_Country_Key', 'Country_Name']],
        left_on='Recipient Country or Region',
        right_on='Country_Name',
        how='left'
    ).drop(columns=['Recipient Country or Region', 'Country_Name'])

    dim_recipient_org['Recipient_Org_Key'] = range(1, len(dim_recipient_org) + 1)
    dim_recipient_org = dim_recipient_org[[
        'Recipient_Org_Key',
        'Recipient_Org',
        'Recipient_Org_Type',
        'Recipient_Country_Key'
    ]]

    output_file = DIMENSION_DIR / "dimRecipientOrg.csv"
    dim_recipient_org.to_csv(output_file, index=False)
    print(f"   Created DataFrame with {len(dim_recipient_org):,} rows")
    return dim_recipient_org


def create_dim_sector(all_data_list):
    print("\nCreating dimSector DataFrame...")
    combined = pd.concat(all_data_list, ignore_index=True)

    dim_sector = combined[['Sector Category']].drop_duplicates()
    dim_sector = dim_sector[dim_sector['Sector Category'].notna()]
    dim_sector = dim_sector.rename(columns={'Sector Category': 'Sector_Category'})
    dim_sector['Sector_Name'] = dim_sector['Sector_Category'].str.extract(r'^(\d+)')

    dim_sector['Sector_Key'] = range(1, len(dim_sector) + 1)
    dim_sector = dim_sector[['Sector_Key', 'Sector_Name', 'Sector_Category']]

    output_file = DIMENSION_DIR / "dimSector.csv"
    dim_sector.to_csv(output_file, index=False)
    print(f"   Created DataFrame with {len(dim_sector):,} rows")
    return dim_sector


def create_dim_sub_sector(all_data_list, dim_sector):
    print("\nCreating dimSubSector DataFrame...")
    combined = pd.concat(all_data_list, ignore_index=True)

    dim_sub_sector = combined[['Sector', 'Sector Category']].drop_duplicates()
    dim_sub_sector = dim_sub_sector[dim_sub_sector['Sector'].notna()]
    dim_sub_sector = dim_sub_sector.rename(columns={
        'Sector': 'Sub_Sector_Name',
        'Sector Category': 'Sub_Sector_Category'
    })

    dim_sub_sector = dim_sub_sector.merge(
        dim_sector[['Sector_Key', 'Sector_Category']],
        left_on='Sub_Sector_Category',
        right_on='Sector_Category',
        how='left'
    ).drop(columns=['Sector_Category'])

    dim_sub_sector['Sub_Sector_Key'] = range(1, len(dim_sub_sector) + 1)
    dim_sub_sector = dim_sub_sector[[
        'Sub_Sector_Key',
        'Sub_Sector_Name',
        'Sub_Sector_Category',
        'Sector_Key'
    ]]

    output_file = DIMENSION_DIR / "dimSubSector.csv"
    dim_sub_sector.to_csv(output_file, index=False)
    print(f"   Created DataFrame with {len(dim_sub_sector):,} rows")
    return dim_sub_sector


def create_fact_aid_transactions(all_data_list, dimensions):
    print("\n" + "="*70)
    print("CREATING FACT TABLE: factAidTransactions")
    print("="*70)

    fact_table = pd.concat(all_data_list, ignore_index=True)
    print(f"\nCombined {len(fact_table):,} transactions")

    # Split year and quarter
    year_q = fact_table['Calendar Year and Quarter'].str.split(" ", expand=True)

    fact_table['Time_Key'] = (
        year_q[0] + year_q[1].str.replace("Q", "")
    ).astype(int)

    print("\nMapping foreign keys...")

    fact_table = fact_table.merge(
        dimensions['aid_type'][['Aid_Type_Key', 'Aid_Type']],
        left_on='Aid Type', right_on='Aid_Type', how='left'
    ).drop(columns=['Aid_Type'])

    fact_table = fact_table.merge(
        dimensions['finance_type'][['Finance_Type_Key', 'Finance_Category']],
        left_on='Finance Type', right_on='Finance_Category', how='left'
    ).drop(columns=['Finance_Category'])

    fact_table = fact_table.merge(
        dimensions['flow_type'][['Flow_Type_Key', 'Flow_Category']],
        left_on='Flow Type', right_on='Flow_Category', how='left'
    ).drop(columns=['Flow_Category'])

    fact_table = fact_table.merge(
        dimensions['task'][['Task_Key', 'IATI_Identifier']],
        left_on='IATI Identifier', right_on='IATI_Identifier', how='left'
    ).drop(columns=['IATI_Identifier'])

    fact_table = fact_table.merge(
        dimensions['provider_org'][['Provider_Org_Key', 'Provider_Org']],
        left_on='Provider Organisation', right_on='Provider_Org', how='left'
    ).drop(columns=['Provider_Org'])

    fact_table = fact_table.merge(
        dimensions['recipient_org'][['Recipient_Org_Key', 'Recipient_Org']],
        left_on='Receiver Organisation', right_on='Recipient_Org', how='left'
    ).drop(columns=['Recipient_Org'])

    fact_table = fact_table.merge(
        dimensions['reporting_org'][['Reporting_Org_Key', 'Reporting_Org']],
        left_on='Reporting Organisation', right_on='Reporting_Org', how='left'
    ).drop(columns=['Reporting_Org'])

    fact_table = fact_table.merge(
        dimensions['sub_sector'][['Sub_Sector_Key', 'Sub_Sector_Name']],
        left_on='Sector', right_on='Sub_Sector_Name', how='left'
    ).drop(columns=['Sub_Sector_Name'])

    fact_table['Aid_Fact_Key'] = range(1, len(fact_table) + 1)

    fact_table = fact_table[[
        'Aid_Fact_Key',
        'Sub_Sector_Key',
        'Task_Key',
        'Reporting_Org_Key',
        'Provider_Org_Key',
        'Recipient_Org_Key',
        'Aid_Type_Key',
        'Finance_Type_Key',
        'Flow_Type_Key',
        'Time_Key',
        'Value (USD)'
    ]]

    fact_table = fact_table.rename(columns={'Value (USD)': 'Value_USD'})

    output_file = FACT_DIR / "factAidTransactions.csv"
    fact_table.to_csv(output_file, index=False)

    print(f"\n Created fact table DataFrame with {len(fact_table):,} rows")
    return fact_table


# MAIN EXECUTION

print("\n" + "="*70)
print("CREATING DIMENSION TABLES")
print("="*70)

dimensions = {}

dimensions['time'] = create_dim_time(all_cleaned_data)
dimensions['aid_type'] = create_dim_aid_type(all_cleaned_data)
dimensions['finance_type'] = create_dim_finance_type(all_cleaned_data)
dimensions['flow_type'] = create_dim_flow_type(all_cleaned_data)
dimensions['task'] = create_dim_task(all_cleaned_data)
dimensions['provider_org'] = create_dim_provider_org(all_cleaned_data)
dimensions['reporting_org'] = create_dim_reporting_org(all_cleaned_data)
dimensions['country'] = create_dim_recipient_country(all_cleaned_data)
dimensions['recipient_org'] = create_dim_recipient_org(all_cleaned_data, dimensions['country'])
dimensions['sector'] = create_dim_sector(all_cleaned_data)
dimensions['sub_sector'] = create_dim_sub_sector(all_cleaned_data, dimensions['sector'])

print(f"\n Created {len(dimensions)} dimension tables")

combined_data = pd.concat(all_cleaned_data, ignore_index=True)

# Create fact table
fact_table = create_fact_aid_transactions(all_cleaned_data, dimensions)

print("\n" + "="*70)
print("COMPLETE!")
print("="*70)
print(f"\n All tables created in:")
print(f"  Dimensions: {DIMENSION_DIR}")
print(f"  Facts: {FACT_DIR}")

BUILDING DIMENSIONAL MODEL FROM CLEANED DATA
Input: data/cleaned_country_xlsx
Dimensions output: data/cleaned_country_xlsx/dimensions
Facts output: data/cleaned_country_xlsx/facts

LOADING CLEANED CSV FILES

Found 230 cleaned files
  Loaded UY_cleaned.csv: 18,455 rows
  Loaded CN_cleaned.csv: 144,561 rows
  Loaded LC_cleaned.csv: 11,282 rows
  Loaded JP_cleaned.csv: 2,803 rows
  Loaded IM_cleaned.csv: 24 rows
  Loaded TJ_cleaned.csv: 54,464 rows
  Loaded MP_cleaned.csv: 329 rows
  Loaded PW_cleaned.csv: 4,884 rows
  Loaded GL_cleaned.csv: 219 rows
  Loaded JO_cleaned.csv: 105,701 rows

 Loaded 10 files successfully

CREATING DIMENSION TABLES

Creating dimTime DataFrame...
   Created DataFrame with 11,323 rows

Creating dimAidType DataFrame...
   Created DataFrame with 18 rows

Creating dimFinanceType DataFrame...
   Created DataFrame with 12 rows

Creating dimFlowType DataFrame...
   Created DataFrame with 7 rows

Creating dimTask DataFrame...
   Created DataFrame with 19,579 rows

Cre

In [40]:
%pip install sqlalchemy pyodbc
!brew install unixodbc
!brew install msodbcsql18

#import the necessary library
from sqlalchemy import create_engine


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
To reinstall 2.3.14, run:
  brew reinstall unixodbc
[34m==>[0m [1mSearching for similarly named formulae and casks...[0m
[31mError:[0m No formulae or casks found for msodbcsql18.


In [41]:
from sqlalchemy import create_engine

username = 'sa'
password = 'MyStr0ngPwd!2026'
host = 'localhost'
port = 1433
database = 'financial_aid_db'

connection_url = f"mssql+pymssql://{username}:{password}@{host}:{port}/{database}"

engine = create_engine(connection_url)

try:
    with engine.connect() as conn:
        print("Connection successful!")

        dimensions['time'].to_sql('dim_time', con=conn, if_exists='replace', index=False)
        dimensions['aid_type'].to_sql('dim_aid_type', con=conn, if_exists='replace', index=False)
        dimensions['finance_type'].to_sql('dim_finance_type', con=conn, if_exists='replace', index=False)
        dimensions['flow_type'].to_sql('dim_flow_type', con=conn, if_exists='replace', index=False)
        dimensions['task'].to_sql('dim_task', con=conn, if_exists='replace', index=False)
        dimensions['provider_org'].to_sql('dim_provider_org', con=conn, if_exists='replace', index=False)
        dimensions['reporting_org'].to_sql('dim_reporting_org', con=conn, if_exists='replace', index=False)
        dimensions['country'].to_sql('dim_recipient_country', con=conn, if_exists='replace', index=False)
        dimensions['recipient_org'].to_sql('dim_recipient_org', con=conn, if_exists='replace', index=False)
        dimensions['sector'].to_sql('dim_sector', con=conn, if_exists='replace', index=False)
        dimensions['sub_sector'].to_sql('dim_sub_sector', con=conn, if_exists='replace', index=False)

        fact_table.to_sql('fact_aid_transactions', con=conn, if_exists='replace', index=False)

        print("All tables exported successfully!")

except Exception as e:
    print(f"Connection failed: {e}")


Connection successful!
All tables exported successfully!


In [42]:
len(fact_table)

2227873