# Machine Learning Analysis of World Bank Dataset

In [None]:

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, accuracy_score

# Location of the World Bank CSV export analysed in the cells below
file_path = '/mnt/data/API_8_DS2_en_csv_v2_3654.csv'

# The first four lines of the file are skipped so that parsing starts
# at the line pandas should treat as the header row.
data = pd.read_csv(
    file_path,
    skiprows=4,
)

# Preview the first five rows to confirm the columns parsed as expected
data.head(n=5)
    

## Step 1: Dataset Description

In [None]:

# Dataset Overview: dimensions, then per-column dtypes and non-null counts
print("Dataset Shape:", data.shape)
print("Dataset Info:")
data.info()

# Check for missing values, keeping only the columns that actually have some
missing_values = data.isnull().sum()
missing_values = missing_values[missing_values > 0]
# The line break has to be written as an escape sequence: a raw newline
# inside a single-line string literal is a SyntaxError.
print("Missing Values:\n", missing_values)

# Drop irrelevant columns.  Match on the "Unnamed" prefix pandas assigns to
# header-less columns instead of hard-coding 'Unnamed: 68', so this cell does
# not raise KeyError when the export has a different number of columns.
unnamed_columns = [col for col in data.columns if str(col).startswith("Unnamed")]
data_cleaned = data.drop(columns=unnamed_columns)
data_cleaned.head()
    

## Step 2: Exploratory Data Analysis (EDA)

In [None]:

# Visualize missing data: one cell per (row, column), highlighted where null
sns.heatmap(data_cleaned.isnull(), cbar=False, cmap='viridis')
plt.title("Missing Data Heatmap")
plt.show()

# Analyze a single indicator
indicator = "SP.REG.BRTH.ZS"  # Completeness of birth registration (%)
indicator_data = data_cleaned[data_cleaned["Indicator Code"] == indicator]
indicator_data = indicator_data.set_index("Country Name")

# Focus on years.  Select the year columns by name rather than by position:
# once "Country Name" is the index only three metadata columns remain, so a
# positional slice of 4:-1 silently discards the first and the last year.
year_columns = [col for col in indicator_data.columns if str(col).isdigit()]
indicator_data = indicator_data[year_columns]

# Plot indicator trends: transpose so years run along the x-axis and each
# country becomes one line
indicator_data.T.plot(figsize=(15, 6))
plt.title("Completeness of Birth Registration (%) Over Time")
plt.xlabel("Year")
plt.ylabel("% Completeness")
# Replace the per-country legend with an empty, frameless one
plt.legend([], frameon=False)
plt.show()
    

## Step 3: Prediction Using Regression (Linear Regression)

In [None]:

# Prepare data for regression
# Drop countries that have no observations at all first: a single all-NaN row
# makes every year column "incomplete", so the column-wise dropna below would
# otherwise remove every year.  When no such row exists this is a no-op.
indicator_data = indicator_data.dropna(axis=0, how='all')
indicator_data = indicator_data.dropna(axis=1, how='any')  # Remove years with missing data

# Fail with an explicit message instead of an opaque IndexError / sklearn
# error when too little complete data is left to build features and a target.
n_countries, n_years = indicator_data.shape
if n_countries < 2 or n_years < 2:
    raise ValueError(
        "Not enough complete data for regression: "
        f"{n_countries} countries x {n_years} years remain after removing "
        "missing values (need at least 2 of each)."
    )

X = indicator_data.iloc[:, :-1].values  # All years except the last
y = indicator_data.iloc[:, -1].values  # The last year

# Train-test split (80% train / 20% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression Model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predictions on the held-out rows, scored with mean squared error
y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Linear Regression Mean Squared Error:", mse)
    

## Step 4: Clustering Analysis (k-Means)

In [None]:

# Prepare data for clustering
# Remove countries with missing data.  The explicit .copy() makes this an
# independent frame, so adding the "Cluster" column below is unambiguous and
# cannot trigger pandas' SettingWithCopyWarning.
clustering_data = indicator_data.dropna(axis=0, how='any').copy()

# Normalize data (z-score per column).  A column whose values are identical
# in every row has a standard deviation of 0, and 0/0 would put NaN into the
# scaled frame, which KMeans rejects.  Dividing by 1 instead leaves such a
# column at all zeros after centering.
column_std = clustering_data.std().replace(0, 1)
clustering_data_scaled = (clustering_data - clustering_data.mean()) / column_std

# Apply k-Means.  n_init is set explicitly because its default changed from
# 10 to 'auto' in scikit-learn 1.4; pinning it keeps results consistent
# across versions and silences the related FutureWarning.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(clustering_data_scaled)
clustering_data["Cluster"] = clusters

# Visualize Clusters: pairwise scatter plots of the unscaled values,
# coloured by assigned cluster
sns.pairplot(clustering_data, diag_kind='kde', hue="Cluster")
plt.show()
    

## Step 5: Classification Using Decision Trees

In [None]:

# Prepare data for classification
# Assume we classify based on completeness of birth registration
classification_data = indicator_data.copy()

# Bin the last column into three equal-sized groups.  pd.qcut raises
# ValueError ("Bin edges must be unique") when many rows share the same value
# and two quantile edges coincide.  In that case bin the ranks instead:
# rank(method="first") breaks ties by order of appearance, so the edges are
# always distinct.  When the direct call succeeds the result is unchanged.
latest_values = classification_data.iloc[:, -1]
category_labels = ["Low", "Medium", "High"]
try:
    categories = pd.qcut(latest_values, q=3, labels=category_labels)
except ValueError:
    categories = pd.qcut(latest_values.rank(method="first"), q=3, labels=category_labels)
classification_data["Category"] = categories

# Excluding the last two columns leaves out both "Category" and the column it
# was derived from, so the label cannot leak into the features.
X_class = classification_data.iloc[:, :-2].values  # Features: all years except the last two columns
y_class = classification_data["Category"].values  # Target: categories

# Train-test split (80% train / 20% test, fixed seed for reproducibility)
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)

# Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_class, y_train_class)

# Predictions on the held-out rows, scored by the fraction labelled correctly
y_pred_class = dt_model.predict(X_test_class)
accuracy = accuracy_score(y_test_class, y_pred_class)
print("Decision Tree Accuracy:", accuracy)
    

## Step 6: Summary of Results

In [None]:

# Summary of Results: repeat the headline metric from each modelling step
print("Linear Regression MSE:", mse)
print("Decision Tree Classification Accuracy:", accuracy)
# The line break has to be written as an escape sequence: a raw newline
# inside a single-line string literal is a SyntaxError.
print("Clustering Results:\n", clustering_data["Cluster"].value_counts())
    