In [None]:
import pandas as pd

# Read the electricity indicators CSV (path is relative to the working directory)
elec_df = pd.read_csv('electricity_production.csv')

# Structural overview: dimensions, a preview of the top rows, and every value
# of the 'Indicator' column. Each header and its value share one print call,
# separated by a newline.
print("Dataset shape:", elec_df.shape)
print("\nFirst few rows:", elec_df.head(), sep="\n")
print("\nAll indicators:", elec_df['Indicator'].tolist(), sep="\n")

In [None]:
import pandas as pd

# Read the electricity indicators CSV again inside this cell
# (same file and same call as the first cell).
elec_df = pd.read_csv(
    filepath_or_buffer='electricity_production.csv',
)

# The 54 African countries to look for, written in the short spellings that act
# as canonical names in this cell (alternative spellings live in
# `name_variations`). Order is preserved by the matching loop below.
all_african_countries = [
    # A-B
    "Algeria", "Angola", "Benin", "Botswana", "Burkina Faso", "Burundi",
    # C-D
    "Cabo Verde", "Cameroon", "Central African Republic", "Chad", "Comoros",
    "Congo", "Congo DR", "Côte d'Ivoire", "Djibouti",
    # E-G
    "Egypt", "Equatorial Guinea", "Eritrea", "Eswatini", "Ethiopia",
    "Gabon", "Gambia", "Ghana", "Guinea", "Guinea-Bissau",
    # K-M
    "Kenya", "Lesotho", "Liberia", "Libya",
    "Madagascar", "Malawi", "Mali", "Mauritania", "Mauritius",
    "Morocco", "Mozambique",
    # N-S
    "Namibia", "Niger", "Nigeria", "Rwanda",
    "São Tomé and Príncipe", "Senegal", "Seychelles", "Sierra Leone",
    "Somalia", "South Africa", "South Sudan", "Sudan",
    # T-Z
    "Tanzania", "Togo", "Tunisia", "Uganda", "Zambia", "Zimbabwe",
]

# Alternative spellings under which a country may appear in the dataset's
# column headers. Keys are the canonical names from `all_african_countries`;
# the matching loop accepts a dataset name when it equals the key or any alias
# listed for that key.
#
# Every alias that was already present is kept. Added on top:
#   * unaccented / older spellings for names that contain diacritics or were
#     renamed (São Tomé and Príncipe, Cabo Verde), mirroring the existing
#     Côte d'Ivoire and Eswatini entries;
#   * long-form names parallel to the existing 'Democratic Republic of the
#     Congo' and 'Federal Republic of Somalia' entries.
# Aliases are matched by exact equality, so extra entries can only add matches
# for columns carrying exactly that name.
name_variations = {
    "Côte d'Ivoire": ["Cote d'Ivoire", "Côte d'Ivoire"],
    'Egypt': ['Egypt', 'Egypt, Arab Rep.'],
    'Congo': ['Congo', 'Congo, Rep.', 'Republic of the Congo'],
    'Congo DR': ['Congo, Dem. Rep.', 'Democratic Republic of the Congo'],
    'Eswatini': ['Eswatini', 'Swaziland'],
    'Somalia': ['Somalia', 'Federal Republic of Somalia'],
    'Gambia': ['Gambia', 'Gambia, The', 'The Gambia'],
    'Cabo Verde': ['Cabo Verde', 'Cape Verde'],
    'São Tomé and Príncipe': ['São Tomé and Príncipe', 'Sao Tome and Principe'],
    'Tanzania': ['Tanzania', 'United Republic of Tanzania'],
}

# Collect (country_name, column_header) pairs for every column whose header
# ends in a parenthesised suffix, e.g. "Kenya (2018)" -> ("Kenya", "Kenya (2018)").
#
# The header is split at the LAST " (" rather than the first. A country name
# that itself contains parentheses, such as
# "Congo (Democratic Republic of the) (2013)", therefore keeps its full name
# instead of being cut down to "Congo" and matched to a different country.
# Headers that merely contain parentheses somewhere, without a trailing
# " (...)" suffix, are skipped.
all_columns = elec_df.columns.tolist()
dataset_countries = []
for col in all_columns:
    country_name, separator, suffix = col.rpartition(' (')
    # `separator` is empty when " (" does not occur in the header at all
    if separator and suffix.endswith(')'):
        dataset_countries.append((country_name, col))

# Match each African country to a dataset column.
# For a given country, the winner is the first entry of `dataset_countries`
# (i.e. first in column order) whose name is either the canonical spelling or
# one of the aliases listed in `name_variations`. Countries with no such column
# are simply left out of both result lists.
found_countries = []
found_columns = []

for african_country in all_african_countries:
    accepted_names = {african_country, *name_variations.get(african_country, [])}
    matching_column = next(
        (
            column_name
            for dataset_country, column_name in dataset_countries
            if dataset_country in accepted_names
        ),
        None,
    )
    if matching_column is not None:
        found_countries.append(african_country)
        found_columns.append(matching_column)

# Report which countries were matched, which were not, and a preview of the
# matched column headers.

# Derived from the list instead of a hard-coded 54, so the summary stays
# correct if `all_african_countries` is ever edited.
total_countries = len(all_african_countries)

print("=== AFRICAN COUNTRIES FOUND IN DATASET ===")
print(f"Total: {len(found_countries)} out of {total_countries}")
print("\nCountries:")
for i, country in enumerate(sorted(found_countries), 1):
    print(f"{i}. {country}")

print("\n=== MISSING AFRICAN COUNTRIES ===")
found_lookup = set(found_countries)  # set membership instead of repeated list scans
missing = [c for c in all_african_countries if c not in found_lookup]
print(f"Total: {len(missing)}")
print(sorted(missing))

# Preview only: nothing is written to disk here, and just the first 10 headers
# are printed. The complete list remains available in `found_columns`.
print("\n=== COLUMN NAMES TO USE ===")
print(found_columns[:10], "...")

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import altair as alt

# ============================================================================
# STEP 1: Load data and get African countries
# ============================================================================

# Read the electricity indicators CSV again so this cell has its own copy
elec_df = pd.read_csv(
    filepath_or_buffer='electricity_production.csv',
)

# The 46 dataset column headers, each of the form "<country> (<year>)", that
# are used as the African sample. Hard-coded here; presumably copied from the
# output of the matching cell above - re-check if the CSV changes.
african_columns = [
    "Angola (2024)",
    "Benin (2024)",
    "Botswana (2023)",
    "Burkina Faso (2024)",
    "Burundi (2025)",
    "Cabo Verde (2024)",
    "Cameroon (2024)",
    "Chad (2023)",
    "Comoros (2025)",
    "Cote d'Ivoire (2023)",
    "Djibouti (2013)",
    "Equatorial Guinea (2024)",
    "Eritrea (2009)",
    "Eswatini (2024)",
    "Ethiopia (2015)",
    "Gabon (2009)",
    "Ghana (2023)",
    "Guinea (2025)",
    "Guinea-Bissau (2025)",
    "Kenya (2018)",
    "Lesotho (2023)",
    "Liberia (2025)",
    "Madagascar (2022)",
    "Malawi (2025)",
    "Mali (2024)",
    "Mauritania (2014)",
    "Mauritius (2023)",
    "Morocco (2023)",
    "Mozambique (2018)",
    "Namibia (2024)",
    "Niger (2017)",
    "Nigeria (2025)",
    "Rwanda (2023)",
    "Senegal (2024)",
    "Seychelles (2023)",
    "Sierra Leone (2023)",
    "Federal Republic of Somalia (2025)",
    "South Africa (2020)",
    "South Sudan (2024)",
    "Sudan (2014)",
    "Tanzania (2023)",
    "Togo (2023)",
    "Tunisia (2024)",
    "Uganda (2013)",
    "Zambia (2019)",
    "Zimbabwe (2016)",
]

print(f"Using {len(african_columns)} African countries")

# ============================================================================
# STEP 2: Extract data for each country
# ============================================================================

# Feature name -> positional row of `elec_df` the value is read from.
# NOTE(review): this relies on the first five CSV rows holding these indicators
# in exactly this order - confirm against the 'Indicator' column.
indicator_rows = {
    'outages_pct': 0,
    'outages_num': 1,
    'outages_duration': 2,
    'losses_pct': 3,
    'generator_pct': 4,
}

# One record per country column; the country label is the header text before " ("
data = [
    {
        'country': country_col.split(' (')[0],
        **{
            feature: elec_df.iloc[position][country_col]
            for feature, position in indicator_rows.items()
        },
    }
    for country_col in african_columns
]

df = pd.DataFrame(data)

# Values that cannot be parsed as numbers become NaN ...
for feature in indicator_rows:
    df[feature] = pd.to_numeric(df[feature], errors='coerce')

# ... and any country with at least one NaN is dropped entirely
df = df.dropna()

print(f"Complete data for {len(df)} countries")

# ============================================================================
# STEP 3: Prepare for Machine Learning
# ============================================================================

# The four outage/loss columns are the predictors; `generator_pct` is the target
predictor_cols = ['outages_pct', 'outages_num', 'outages_duration', 'losses_pct']
X = df[predictor_cols].values
y = df['generator_pct'].values

# Separate scalers for predictors and target; `scaler_y` is reused later to map
# predictions back to the target's original units
scaler_X, scaler_y = StandardScaler(), StandardScaler()

X_scaled = scaler_X.fit_transform(X)
# The scaler expects a 2-D array, so the target is made a column and then flattened again
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1)).ravel()

# ============================================================================
# STEP 4: Train Model
# ============================================================================

# Ordinary least squares on the standardised predictors and target
model = LinearRegression().fit(X_scaled, y_scaled)

# Predict for the same rows, then undo the target scaling to get percentages back
y_pred_scaled = model.predict(X_scaled)
y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

# NOTE(review): the score is computed on the rows the model was fitted on
# (in-sample), so it describes goodness of fit rather than out-of-sample
# predictive performance.
r2 = r2_score(y, y_pred)

print(f"\n=== MODEL PERFORMANCE ===", f"R² Score: {r2:.3f}", sep="\n")

# Keep the predictions next to the actual values for plotting and export
df['predicted_generator_pct'] = y_pred

# ============================================================================
# STEP 5: Create Visualization with Hover Effect
# ============================================================================

# Hover selection: holds the point under the pointer, is empty (selects nothing)
# otherwise, and is cleared when the pointer leaves the point.
#
# Altair 5 deprecated `selection_single` / `add_selection` in favour of
# `selection_point` / `add_params`. The newer API is used when the installed
# Altair provides it; otherwise the original calls are used, so the cell runs
# on either major version without deprecation warnings.
if hasattr(alt, 'selection_point'):
    hover = alt.selection_point(on='mouseover', empty=False, clear='mouseout')
else:
    hover = alt.selection_single(on='mouseover', empty='none', clear='mouseout')

# Actual vs. predicted scatter; the hovered point is drawn larger and fully opaque
scatter = alt.Chart(df).mark_circle().encode(
    x=alt.X('generator_pct:Q',
            title='Actual Generator Ownership (%)',
            scale=alt.Scale(domain=[0, 100])),
    y=alt.Y('predicted_generator_pct:Q',
            title='Predicted Generator Ownership (%)',
            scale=alt.Scale(domain=[0, 100])),
    color=alt.value('#C75DAB'),
    size=alt.condition(
        hover,
        alt.value(300),
        alt.value(100)
    ),
    opacity=alt.condition(
        hover,
        alt.value(1),
        alt.value(0.7)
    ),
    tooltip=[
        alt.Tooltip('country:N', title='Country'),
        alt.Tooltip('generator_pct:Q', title='Actual (%)', format='.1f'),
        alt.Tooltip('predicted_generator_pct:Q', title='Predicted (%)', format='.1f'),
        alt.Tooltip('outages_pct:Q', title='Firms that experience Outages (%)', format='.1f')
    ]
)

# Register the selection with whichever method this Altair version offers
if hasattr(scatter, 'add_params'):
    scatter = scatter.add_params(hover)
else:
    scatter = scatter.add_selection(hover)

scatter = scatter.properties(
    width=400,
    height=400,
    title={
        "text": "Machine Learning: Predicting Generator Ownership from Grid Reliability",
        "subtitle": f"Linear Regression (R² = {r2:.3f}) | {len(df)} African Countries | Source: World Bank Enterprise Surveys (2024)"
    }
)

# Dashed grey y = x reference from (0, 0) to (100, 100): a point lying on it
# has a predicted value equal to its actual value
line_data = pd.DataFrame({'x': [0, 100], 'y': [0, 100]})
line = (
    alt.Chart(line_data)
    .mark_line(strokeDash=[5, 5], color='gray')
    .encode(x='x:Q', y='y:Q')
)

# Layer the reference line underneath the scatter, then apply chart-wide
# styling (background colour and title/subtitle appearance)
final_chart = (
    (line + scatter)
    .configure(background='#FFFBF5')
    .configure_title(
        anchor='start',
        offset=30,
        subtitleColor='black',
        subtitleFontWeight='lighter',
    )
)

# Write the chart and the per-country results (actual + predicted) to the
# working directory
final_chart.save('chart_ml_africa.json')
df.to_csv('ml_results_africa.csv', index=False)

for saved_message in (f"\n✓ Saved: chart_ml_africa.json", f"✓ Saved: ml_results_africa.csv"):
    print(saved_message)

# Last expression of the cell, so the notebook renders the chart inline
final_chart