# Data Cleaning - Interest Rates (Taux d'intérêt)

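This notebook inspects and cleans the interest rate data for 38 countries, then applies the same interpolation-based cleaning to the inflation data and the remaining economic indicators, and finally converts the credit ratings to a numerical scale.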

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [None]:
# Read the raw interest-rate CSV into a DataFrame
raw_interest_path = '../data/raw/taux_interet_38_pays.csv'
df_interest = pd.read_csv(raw_interest_path)

# Preview the first ten rows to check the layout of the table
df_interest.head(n=10)

In [None]:
# Quick structural overview of the raw frame: dimensions, column labels, dtypes
frame_shape = df_interest.shape
column_names = df_interest.columns.tolist()

print("Dataset shape (rows, columns):", frame_shape)
print("\nColumn names:")
print(column_names)
print("\nData types:")
print(df_interest.dtypes)

In [None]:
# Count missing entries column by column once, then reuse that Series
# for the per-column listing, the grand total and the percentage view
missing_per_column = df_interest.isnull().sum()

print("Missing values per column:")
print(missing_per_column)
print("\nTotal missing values:", missing_per_column.sum())

# Share of rows that are missing in each column, expressed in percent
missing_share = missing_per_column / len(df_interest) * 100
print("\nPercentage of missing values:")
print(missing_share.round(2))

In [None]:
# Summary statistics produced by DataFrame.describe() on the raw frame
print("Descriptive statistics (all numeric columns):")
interest_summary = df_interest.describe()
interest_summary

In [None]:
# Keep only the rows (countries) in which at least one cell is missing
print("Countries with missing values:")
has_gap = df_interest.isnull().any(axis=1)
countries_with_missing = df_interest.loc[has_gap]
countries_with_missing

## Step 1: Apply Interpolation to Fill Missing Values

In [None]:
# Create a copy of the dataframe to work with, so the raw frame `df_interest`
# stays untouched and can still be compared against the cleaned result
df_interest_cleaned = df_interest.copy()

# Record which rows have gaps BEFORE filling them, so the check below always
# reflects the data actually loaded instead of a hand-maintained list of names.
# The first column is assumed to hold the country label; every later column
# is treated as a value column (same assumption as the iloc slice below).
rows_with_gaps = df_interest.iloc[:, 1:].isnull().any(axis=1)

# Apply linear interpolation for each country (row by row)
# axis=1 means we interpolate across columns (years)
# limit_direction='both' also fills gaps at the beginning and end of a row;
# there pandas repeats the nearest observed value instead of extrapolating a trend
df_interest_cleaned.iloc[:, 1:] = df_interest_cleaned.iloc[:, 1:].interpolate(axis=1, method='linear', limit_direction='both')

# Display countries that had missing values to see the interpolation results
print("Countries that had missing values (now filled):")
countries_to_check = df_interest.loc[rows_with_gaps, 'Country'].tolist()

df_interest_cleaned[df_interest_cleaned['Country'].isin(countries_to_check)]

In [None]:
# Count whatever gaps are still left in the cleaned frame (computed once)
remaining_missing = df_interest_cleaned.isnull().sum().sum()

print("Missing values after interpolation:")
print(remaining_missing)

# Confirmation line, printed only when nothing is left to fill
if remaining_missing == 0:
    print("\n✓ All missing values have been filled!")

## Step 2: Save Cleaned Data

In [None]:
import os

# Output locations for the cleaned interest-rate table
processed_dir = '../data/processed'
cleaned_interest_path = '../data/processed/taux_interet_cleaned.csv'

# Make sure the target folder exists (no error if it is already there)
os.makedirs(processed_dir, exist_ok=True)

# Write the cleaned frame without the pandas index column
df_interest_cleaned.to_csv(cleaned_interest_path, index=False)
print("✓ Cleaned data saved to: data/processed/taux_interet_cleaned.csv")

---
# Inflation Data Cleaning

In [None]:
# Read the raw inflation table
df_inflation = pd.read_csv('../data/raw/inflation_38_pays.csv')

inflation_missing_before = df_inflation.isnull().sum().sum()
print("Missing values before interpolation:", inflation_missing_before)

# Fill gaps on a copy: linear interpolation along each row, applied to every
# column except the first one, in both directions
df_inflation_cleaned = df_inflation.copy()
inflation_values = df_inflation_cleaned.iloc[:, 1:]
df_inflation_cleaned.iloc[:, 1:] = inflation_values.interpolate(
    axis=1,
    method='linear',
    limit_direction='both',
)

inflation_missing_after = df_inflation_cleaned.isnull().sum().sum()
print("Missing values after interpolation:", inflation_missing_after)

# Persist the cleaned table without the pandas index column
df_inflation_cleaned.to_csv('../data/processed/inflation_cleaned.csv', index=False)
print("✓ Cleaned inflation data saved")

---
# Remaining Economic Indicators Cleaning

In [None]:
# Clean remaining CSV files in batch
# Each file gets the same treatment as the interest-rate and inflation tables:
# row-wise linear interpolation over every column except the first one.
files = [
    'chomage_38_pays.csv',
    'croissance_pib_38_pays.csv',
    'dette_publique_38_pays.csv',
    'solde_budgetaire_38_pays.csv',
    'balance_compte_courant_38_pays.csv',
    'reserves_change_38_pays.csv'
]

for file in files:
    # Load CSV
    df = pd.read_csv(f'../data/raw/{file}')
    missing_before = df.isnull().sum().sum()

    # Apply interpolation on a copy so the loaded frame stays as read from disk
    df_cleaned = df.copy()
    df_cleaned.iloc[:, 1:] = df_cleaned.iloc[:, 1:].interpolate(axis=1, method='linear', limit_direction='both')
    missing_after = df_cleaned.isnull().sum().sum()

    # Save cleaned file ('<indicator>_38_pays.csv' -> '<indicator>_cleaned.csv')
    output_name = file.replace('_38_pays.csv', '_cleaned.csv')
    df_cleaned.to_csv(f'../data/processed/{output_name}', index=False)

    print(f'✓ {file} cleaned and saved')
    # Report the effect of the interpolation, as the other cleaning cells do
    print(f'  missing values: {missing_before} before, {missing_after} after interpolation')

    # Interpolation cannot fill everything (e.g. a row without any observed
    # value, or a gap in the first column), so surface leftovers explicitly
    if missing_after > 0:
        print(f'  ⚠ {missing_after} missing values remain in {output_name}')

print('\n✓ All economic indicators cleaned!')

---
# Credit Ratings Cleaning and Conversion

In [None]:
# Load credit ratings
df_ratings = pd.read_csv('../data/raw/notations_credit_38_pays.csv')

print("Missing values:", df_ratings.isnull().sum().sum())

# Create mapping: AAA=1 (best) to CC=20 (worst)
# Lower number = better rating; any label not listed here has no numeric value
rating_map = {
    'AAA': 1, 'AA+': 2, 'AA': 3, 'AA-': 4,
    'A+': 5, 'A': 6, 'A-': 7,
    'BBB+': 8, 'BBB': 9, 'BBB-': 10,
    'BB+': 11, 'BB': 12, 'BB-': 13,
    'B+': 14, 'B': 15, 'B-': 16,
    'CCC+': 17, 'CCC': 18, 'CCC-': 19, 'CC': 20
}

# Save text version (written exactly as loaded)
df_ratings.to_csv('../data/processed/notations_credit_cleaned.csv', index=False)

# Convert to numerical version
# Every column after the first one is treated as a column of rating labels
df_ratings_numerical = df_ratings.copy()
unmapped_labels = set()
for col in df_ratings_numerical.columns[1:]:
    # Strip stray whitespace so that e.g. ' AA+' or 'BBB ' still matches the map
    labels = df_ratings_numerical[col].map(lambda value: value.strip() if isinstance(value, str) else value)
    numeric = labels.map(rating_map)

    # Series.map yields NaN for labels missing from rating_map; collect those
    # labels so the loss of information is visible instead of silent
    unmapped_labels.update(labels[labels.notna() & numeric.isna()].unique())

    df_ratings_numerical[col] = numeric

df_ratings_numerical.to_csv('../data/processed/notations_credit_numerical.csv', index=False)

if unmapped_labels:
    print("⚠ Ratings not found in rating_map (saved as NaN):", sorted(str(label) for label in unmapped_labels))

print("✓ Credit ratings cleaned and converted to numbers")
print("✓ Text version: notations_credit_cleaned.csv")
print("✓ Numerical version: notations_credit_numerical.csv")

In [None]:
# Display the full dataset to see all values
# pandas shortens large frames under its default display options, so lift the
# row/column limits for this one output only (the global options stay unchanged).
# `display` is the rich-output helper that Jupyter/IPython provides in every cell.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df_interest)