In [1]:

##🐍 Python Data Cleaning Pipeline

import pandas as pd
import numpy as np

# 1) Load dataset
df = pd.read_csv("world_economics.csv")
df
print("Shape:", df.shape)
print(df.head())


##2) Standardize column names


df.columns = (
    df.columns.str.strip()         # remove extra spaces
             .str.lower()          # lowercase
             .str.replace(" ", "_")
             .str.replace(r"[%()/]", "", regex=True)  # remove %, (), /
)

print(df.columns)
print(df.tail())



Shape: (173, 19)
            name              currency           capital   languages  \
0        Tunisia        Tunisian dinar             Tunis      Arabic   
1        Vietnam       Vietnamese đồng             Hanoi  Vietnamese   
2        Ecuador  United States dollar             Quito     Spanish   
3    Puerto Rico  United States dollar          San Juan     English   
4  United States  United States dollar  Washington, D.C.     English   

    latitude   longitude       area    region           subregion  \
0  34.000000    9.000000   163610.0    Africa     Northern Africa   
1  16.166667  107.833333   331212.0      Asia  South-Eastern Asia   
2  -2.000000  -77.500000   276841.0  Americas       South America   
3  18.250000  -66.500000     8870.0  Americas           Caribbean   
4  38.000000  -97.000000  9372610.0  Americas       North America   

                 borders    GDP  GDP Growth  Interest Rate  Inflation Rate  \
0         ['DZA', 'LBY']     53        1.80            7.

In [2]:

##
# 3) Clean numeric columns

# Columns expected to be numeric
num_cols = ["gdp", "gdp_growth", "interest_rate", "inflation_rate",
            "jobless_rate", "gov_budget", "debt_gdp",
            "current_account", "population", "area", "latitude", "longitude"]

for col in num_cols:
    if col in df.columns:
        df[col] = (df[col]
                     .astype(str)              # convert to string
                     .str.replace(",", "", regex=False)  # remove commas
                     .str.replace("%", "", regex=False)  # remove %
                     .str.replace("$", "", regex=False)  # remove $
                     .str.strip()
                     .replace(["", "nan", "None", "NaN", "N/A"], np.nan) # unify nulls
                  )
        df[col] = pd.to_numeric(df[col], errors="coerce")


In [3]:
##4) Handle missing values


# Replace empty strings with NaN
df = df.replace(r'^\s*$', np.nan, regex=True)

# Check null counts
print(df.isnull().sum())


name                0
currency            0
capital             1
languages           0
latitude            0
longitude           0
area                0
region              0
subregion           0
borders            24
gdp                 0
gdp_growth         72
interest_rate      11
inflation_rate      1
jobless_rate        2
gov._budget         7
debtgdp             8
current_account     6
population          0
dtype: int64


In [4]:
##5) Remove duplicates

df = df.drop_duplicates(subset=["name"])  # country name assumed unique


In [5]:
##6) Standardize text columns


text_cols = ["name", "currency", "capital", "languages", "region", "subregion", "borders"]

for col in text_cols:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip().str.title().replace("Nan", np.nan)


In [6]:
##7) Add derived column → GDP per capita


df["gdp_per_capita"] = np.where(
    (df["population"].notnull()) & (df["population"] > 0),
    df["gdp"] / df["population"],
    np.nan
)


In [7]:
##8) Final check


print(df.info())
print(df.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173 entries, 0 to 172
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             173 non-null    object 
 1   currency         173 non-null    object 
 2   capital          172 non-null    object 
 3   languages        173 non-null    object 
 4   latitude         173 non-null    float64
 5   longitude        173 non-null    float64
 6   area             173 non-null    float64
 7   region           173 non-null    object 
 8   subregion        173 non-null    object 
 9   borders          149 non-null    object 
 10  gdp              173 non-null    int64  
 11  gdp_growth       101 non-null    float64
 12  interest_rate    162 non-null    float64
 13  inflation_rate   172 non-null    float64
 14  jobless_rate     171 non-null    float64
 15  gov._budget      166 non-null    float64
 16  debtgdp          165 non-null    float64
 17  current_account 

In [8]:
##9) Save cleaned dataset


df.to_csv("cleaned_world_economics.csv", index=False)
print("✅ Cleaned file saved: cleaned_world_economics.csv")


✅ Cleaned file saved: cleaned_world_economics.csv
