In [1]:
# 📘 Capstone Project: Gender Differences in NCD Mortality Rates
# Analysis Notebook (Google Colab Compatible)

# ✅ 1. IMPORT LIBRARIES
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Apply seaborn's "whitegrid" style globally so every figure drawn below uses it.
sns.set(style="whitegrid")


In [None]:
# ✅ 2. LOAD DATA
# Input CSV location, named once so it is easy to spot and change.
DATA_PATH = "ncd_gender_cleaned.csv"  # Replace with your actual file path if needed

df = pd.read_csv(DATA_PATH)
# Preview the first rows as the cell output.
df.head()

In [None]:
# ✅ 3. CLEANING & TRANSFORMATION
# Raw sex codes -> readable labels (values that are already labels are left untouched).
SEX_LABELS = {'SEX_MLE': 'Male', 'SEX_FMLE': 'Female', 'SEX_BTSX': 'Both'}

# Only these columns are used by the rest of the notebook.
cols = ['Country', 'Year', 'Sex', 'NCD_Mortality_Rate']

# One pipeline: relabel Sex, keep the needed columns, drop rows with any
# missing value among them, then store Year as an integer.
df = (
    df.assign(Sex=lambda frame: frame['Sex'].replace(SEX_LABELS))
      [cols]
      .dropna()
      .astype({'Year': int})
)
df.head()

In [None]:
# ✅ 4. EXPLORATORY DATA ANALYSIS
# Summary statistics of the mortality rate, one row per Sex value
rate_by_sex = df.groupby("Sex")["NCD_Mortality_Rate"]
print(rate_by_sex.describe())

# Mortality rate over the years, one line per Sex value
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(x="Year", y="NCD_Mortality_Rate", hue="Sex", data=df, ax=ax)
ax.set_title("Trend of NCD Mortality Rate by Sex")
plt.show()

# Gender Difference by Country for Latest Year
latest = df['Year'].max()
df_latest = df[df['Year'] == latest]

# pivot_table (mean aggregation) is used instead of pivot: plain pivot raises
# ValueError as soon as a (Country, Sex) pair occurs more than once in the
# latest year, while pivot_table averages such duplicates. This is the same
# reshaping call the clustering step applies to the full data. When every pair
# is unique, both calls produce the same table.
# dropna() keeps only countries that have a value for every Sex column.
pivot = df_latest.pivot_table(index='Country', columns='Sex', values='NCD_Mortality_Rate').dropna()
# Positive gap = higher male rate; negative gap = higher female rate.
pivot['Difference'] = pivot['Male'] - pivot['Female']
pivot_sorted = pivot.sort_values('Difference', ascending=False)

# Bar Chart of Gender Difference (countries ordered from largest to smallest gap)
plt.figure(figsize=(12,6))
sns.barplot(x=pivot_sorted.index, y=pivot_sorted['Difference'])
# Right-align the rotated labels so each one ends under its own bar.
plt.xticks(rotation=45, ha='right')
plt.title(f"Gender Gap in NCD Mortality ({latest})")
plt.ylabel("Male - Female")
plt.show()

In [None]:
# ✅ 5. MODELING: KMeans Clustering
# Prepare data for clustering: one row per country, one column per Sex value,
# each cell being the mean rate over all years (pivot_table's default
# aggregation). Countries missing any Sex column are dropped.
# NOTE(review): if the data contains 'Both' rows, that column is used as a
# clustering feature alongside Male and Female — confirm this is intended.
pivot_kmeans = df.pivot_table(index='Country', columns='Sex', values='NCD_Mortality_Rate').dropna()

# Standardise the features so no column dominates the distance computation.
# This happens before the 'Cluster' column is added, so labels never leak
# into the features.
scaler = StandardScaler()
scaled = scaler.fit_transform(pivot_kmeans)

# n_init is pinned explicitly: scikit-learn's default changed between versions
# (10 restarts, later 'auto'), so relying on it makes the clustering depend on
# the installed version and triggers a FutureWarning on some releases.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
pivot_kmeans['Cluster'] = kmeans.fit_predict(scaled)

# Silhouette Score (computed on the scaled features the model was fitted on)
score = silhouette_score(scaled, pivot_kmeans['Cluster'])
print(f"Silhouette Score: {score:.2f}")

# Scatter Plot of the unscaled Male vs Female rates, coloured by cluster label
plt.figure(figsize=(8,6))
sns.scatterplot(data=pivot_kmeans, x='Male', y='Female', hue='Cluster', palette='Set1')
plt.title("Country Clustering by Gendered NCD Mortality")
plt.show()


In [None]:
# ✅ 6. EXPORT CLEANED DATA (Optional)
# Write to a separate file: the load step reads "ncd_gender_cleaned.csv", so
# exporting under that same name would overwrite the input with this reduced
# frame (only the four kept columns, rows with missing values dropped).
df.to_csv("ncd_gender_analysis_subset.csv", index=False)

In [None]:
# ✅ 7. CONCLUSION
# Closing message; adjacent string literals are joined into a single line of output.
print(
    "Analysis complete. "
    "Gender disparities in NCD mortality vary by country "
    "and cluster analysis reveals patterns."
)
