<a href="https://colab.research.google.com/github/Vargheseshre/DSNouveau/blob/main/WHO_All_Cause_Mortality_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import plotly.express as px

#| Library             | Key Functionality                        | R Equivalent      |
#| ------------------- | ---------------------------------------- | ----------------- |
#| `pandas`            | Data wrangling and manipulation          | `dplyr`, `readr`  |
#| `numpy`             | Numerical computation                    | `base::numeric`   |
#| `matplotlib.pyplot` | Basic plotting                           | `ggplot2`, `plot` |
#| `seaborn`           | Statistical visualization                | `ggplot2`         |
#| `statsmodels.api`   | Statistical modeling (regression, tests) | `lm()`, `glm()`   |
#| `plotly.express`    | Interactive plots                        | `plotly` in R     |

# Load the CSV
# NOTE(review): this is an absolute Windows-style path; it will not resolve
# when the notebook is opened on Colab unless the file is uploaded/mounted and
# the path is adjusted accordingly.
df = pd.read_csv("C:/Morticd10_part6.csv") # path removed for privacy reason

# Explore the data
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called directly; wrapping it in print() added a stray "None" line.
df.info()
print(df.describe())
print(df.head())

# --- Yearly death totals for the rows whose Country code is 1400 ------------
is_target_country = df['Country'] == 1400
df_filtered = df.loc[is_target_country].copy()

# Row total = sum over every column whose name begins with "Deaths"
# (missing values are skipped).
# NOTE(review): if this file follows the WHO Mortality Database layout, one of
# these columns (Deaths1) may already be the all-ages total, and the file may
# also carry an "all causes" aggregate row per group; either would make this
# sum double-count. Confirm against the data dictionary before trusting the
# absolute numbers.
death_columns = [name for name in df_filtered.columns if name.startswith("Deaths")]
row_totals = df_filtered[death_columns].sum(axis=1, skipna=True)
df_filtered['total_deaths'] = row_totals

# Human-readable sex label; any Sex code other than 1 or 2 maps to NaN.
sex_labels = {1: 'Male', 2: 'Female'}
df_filtered['sex'] = df_filtered['Sex'].map(sex_labels)

# Collapse to one row per Year.
deaths_by_year = df_filtered.groupby('Year')['total_deaths'].sum()
yearly_trend = deaths_by_year.reset_index()

# Line chart of the yearly totals.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=yearly_trend, x='Year', y='total_deaths', marker='o', ax=ax)
ax.set_title("All-Cause Mortality Trend (Country Code 1400)")
ax.set_xlabel("Year")
ax.set_ylabel("Total Deaths")
ax.grid(True)
fig.tight_layout()
plt.show()

# All Cause Mortality
# Same per-row total as the single-country step, now added to the full frame
# (this mutates df in place by adding a 'total_deaths' column).
all_row_totals = df[death_columns].sum(axis=1, skipna=True)
df['total_deaths'] = all_row_totals

# One row per Year, summed over every country present in the file.
# NOTE(review): "Global" here means "all countries contained in this CSV";
# confirm the file's coverage before reading the chart as a world total.
deaths_by_year_all = df.groupby('Year')['total_deaths'].sum()
global_trend = deaths_by_year_all.reset_index()

fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=global_trend, x='Year', y='total_deaths', marker='o', ax=ax)
ax.set_title("Global All-Cause Mortality Trend")
ax.set_xlabel("Year")
ax.set_ylabel("Total Deaths")
ax.grid(True)
fig.tight_layout()
plt.show()

# Countries with full data for 3 years
# NOTE(review): the threshold is hard-coded; it only means "complete" while the
# file spans exactly this many distinct years - re-check when the data changes.
required_years = 3

# Number of distinct years reported by each country.
years_per_country = df.groupby('Country')['Year'].nunique().reset_index()
has_required_years = years_per_country['Year'] == required_years
complete_countries = years_per_country.loc[has_required_years, 'Country']

# Keep only rows from those countries and compute the per-row total on the
# subset itself.
in_complete_set = df['Country'].isin(complete_countries)
df_complete = df.loc[in_complete_set].copy()
df_complete['total_deaths'] = df_complete[death_columns].sum(axis=1, skipna=True)

# One row per Year across the retained countries.
complete_by_year = df_complete.groupby('Year')['total_deaths'].sum()
trend_complete = complete_by_year.reset_index()

fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=trend_complete, x='Year', y='total_deaths', marker='o', ax=ax)
ax.set_title("All-Cause Mortality Trend (Only Countries with Complete Data)")
ax.set_xlabel("Year")
ax.set_ylabel("Total Deaths")
ax.grid(True)
fig.tight_layout()
plt.show()

#Model Building
# Response: the yearly death totals from the complete-data trend.
y = trend_complete['total_deaths']
# Predictor: Year, with a constant column added so the fit has an intercept.
X = sm.add_constant(trend_complete['Year'])

# Ordinary least squares of total deaths on year.
# NOTE(review): trend_complete holds one row per distinct Year, so the fit
# rests on very few observations; treat the inference statistics in the
# summary with caution.
linear_spec = sm.OLS(y, X)
model = linear_spec.fit()

# Print summary
print(model.summary())


FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/Shreya.Varghese/Downloads/morticd10_part6/Morticd10_part6.csv'