In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Load the life-expectancy CSV (path is relative to the working directory) into `df`.
df=pd.read_csv('Life Expectancy Data.csv')

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Distinct values among the per-country row counts (i.e. which record counts occur).
df['Country'].value_counts().unique()

In [None]:
# Number of rows for each value of 'Status'.
df['Status'].value_counts()

In [None]:
# Count of missing values in each column.
df.isnull().sum()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Row count per 'Status' category.
sns.countplot(data=df, x='Status')

In [None]:
# Life expectancy aggregated per 'Status' category.
sns.barplot(data=df, x='Status', y='Life expectancy ')

In [None]:
# Infant deaths aggregated per 'Status' category.
sns.barplot(data=df, x='Status', y='infant deaths')

In [None]:
# Infant deaths as a function of BMI on a large square canvas.
plt.figure(figsize=(15, 15))
sns.lineplot(data=df, x=' BMI ', y='infant deaths')
plt.title('BMI vs Infant Deaths')
plt.xlabel('BMI')
plt.ylabel('infantDied')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_relation(df, x_col, y_col, kind='line', figsize=(10, 6)):
    """Plot `y_col` against `x_col` from `df` as a scatter, line, or regression plot.

    Args:
        df: Table handed to seaborn through its ``data=`` argument.
        x_col: Column label for the x-axis. A named vector (any object with a
            non-None ``.name`` attribute) is also accepted; its name is then
            used in the title instead of its contents.
        y_col: Column label (or named vector) for the y-axis.
        kind: 'scatter', 'line' (default), or 'reg'.
        figsize: Figure size forwarded to ``plt.figure``.

    Raises:
        ValueError: If `kind` is not one of the three supported values.
    """
    # Reject a bad `kind` before creating a figure so no empty figure is left behind.
    if kind not in ('scatter', 'line', 'reg'):
        raise ValueError("kind must be 'scatter', 'line', or 'reg'")

    plt.figure(figsize=figsize)

    if kind == 'scatter':
        # No `ci` keyword here: a scatter plot has no interval to draw, and the
        # stray keyword would be forwarded on to matplotlib and rejected.
        sns.scatterplot(data=df, x=x_col, y=y_col)
    elif kind == 'line':
        sns.lineplot(data=df, x=x_col, y=y_col)
    else:
        sns.regplot(data=df, x=x_col, y=y_col, line_kws={"color": "red"})

    # Build the title from labels so a vector argument does not dump its values.
    x_name = getattr(x_col, 'name', None)
    y_name = getattr(y_col, 'name', None)
    x_label = x_col if x_name is None else x_name
    y_label = y_col if y_name is None else y_name

    plt.title(f'{x_label} vs {y_label}')
    plt.grid(True)
    plt.show()


In [None]:
 # Print the column labels; several carry surrounding spaces (e.g. ' BMI ', 'Life expectancy ').
print(df.columns)


In [None]:
# Line plot (the default `kind`) of life expectancy against BMI.
plot_relation(df,x_col=' BMI ',y_col='Life expectancy ')

In [None]:
# Range and summary statistics of the BMI column.
print("Max BMI:", df[' BMI '].max())
print("Min BMI:", df[' BMI '].min())
print(df[' BMI '].describe())


In [None]:
# Pass column labels, not Series: plot_relation interpolates x_col/y_col into
# the figure title, so a Series argument would print its whole contents there.
plot_relation(df, x_col='Alcohol', y_col='Life expectancy ')

In [None]:
# Correlation of each numeric column with life expectancy, largest value first.
df.corr(numeric_only=True)['Life expectancy '].sort_values(ascending=False)


In [None]:
# Summary statistics again (same call as earlier in the notebook).
df.describe()

In [None]:
# Display the frame as it stands before the imputation step.
df

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Snapshot of the int64/float64 columns; imputers are fitted on this snapshot.
num_df = df.select_dtypes(include=['int64', 'float64'])

# Numeric columns that contain at least one missing value, in frame order.
cols_with_gaps = [name for name in num_df.columns if num_df[name].isnull().sum() > 0]

for col in cols_with_gaps:
    # |skew| above 0.5 is treated as skewed -> fill with the median, else the mean.
    is_skewed = abs(num_df[col].skew()) > 0.5
    imputer = SimpleImputer(strategy='median' if is_skewed else 'mean')
    df[[col]] = imputer.fit_transform(num_df[[col]])

In [None]:
# Display the frame after the imputation step.
df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_feature_vs_life(df, feature, target='Life expectancy '):
    """Draw a regression plot of `target` against `feature`.

    Args:
        df: Table indexed by column label.
        feature: Label of the column placed on the x-axis.
        target: Label of the column placed on the y-axis.

    Points are drawn semi-transparent and the fitted line in red; the title and
    both axis labels are taken from the two column labels.
    """
    plt.figure(figsize=(10, 6))
    point_style = {'alpha': 0.5}
    fit_style = {'color': 'red'}
    sns.regplot(
        x=df[feature],
        y=df[target],
        scatter_kws=point_style,
        line_kws=fit_style,
    )
    heading = f'{feature} vs {target}'
    plt.title(heading, fontsize=14)
    plt.xlabel(feature)
    plt.ylabel(target)
    plt.grid(True)
    plt.show()


In [None]:
# Regression plots of life expectancy against three selected features.
for feature in ('Hepatitis B', 'Alcohol', 'GDP'):
    plot_feature_vs_life(df, feature)


In [None]:
import statsmodels.api as sm
# Q-Q plot of the 'Alcohol' column; line='s' asks statsmodels for its standardized reference line.
sm.qqplot(df['Alcohol'], line='s')

In [None]:
# Life expectancy over the years without an error band. `errorbar=None` is the
# current spelling; `ci=None` has been deprecated since seaborn 0.12.
sns.lineplot(data=df, x='Year', y='Life expectancy ', errorbar=None)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='Population', y='GDP', ax=ax)

# Show population ticks as plain integers with thousands separators.
ax.ticklabel_format(style='plain', axis='x')
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda value, _pos: f'{int(value):,}'))

ax.set_xlabel("Population")
ax.set_ylabel("GDP")
ax.set_title("GDP vs Population")

fig.tight_layout()
plt.show()


In [None]:
# First ten population values, followed by the column's summary statistics.
print(df['Population'].head(10))
print(df['Population'].describe())


In [None]:
# GDP as a function of population.
sns.lineplot(data=df, x='Population', y='GDP')

In [None]:
def plot_distribution(df, column):
    """Draw a 30-bin histogram with a KDE overlay for one column of `df`.

    Args:
        df: Table indexed by column label.
        column: Label of the column to plot; also used in the title and as the
            x-axis label.

    Uses the notebook-level `sns` and `plt` imports rather than re-importing
    both libraries on every call.
    """
    plt.figure(figsize=(8, 5))
    sns.histplot(df[column], kde=True, bins=30, color='skyblue')
    plt.title(f'Distribution Plot for {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()


In [None]:
# Frame without the identifier-like columns; only its column list is used below.
temp_df = df.drop(columns=['Country', 'Year', 'Status'])

# One distribution plot per remaining column (values are read from df).
for col in temp_df.columns:
    plot_distribution(df, col)

In [None]:
def plot_box(df, column):
    """Draw a horizontal orange boxplot for one column of `df`.

    Args:
        df: Table indexed by column label.
        column: Label of the column to plot; also used in the title and as the
            x-axis label.
    """
    plt.figure(figsize=(8, 5))
    values = df[column]
    sns.boxplot(x=values, color='orange')
    heading = f'Boxplot for {column}'
    plt.title(heading)
    plt.xlabel(column)
    plt.show()

In [None]:
# One boxplot per column of temp_df (values are read from df).
for col in temp_df.columns:
    plot_box(df,col)

In [None]:
def plot_scatter(df, x_col, y_col):
    """Scatter `y_col` against `x_col`, colouring each point by its y value.

    Args:
        df: Table indexed by column label.
        x_col: Label of the column on the x-axis.
        y_col: Label of the column on the y-axis; the same column also drives
            the point hue (viridis palette).
    """
    plt.figure(figsize=(8, 5))
    xs = df[x_col]
    ys = df[y_col]
    sns.scatterplot(x=xs, y=ys, hue=ys, palette='viridis')
    heading = f'Scatter Plot: {x_col} vs {y_col}'
    plt.title(heading)
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.show()

In [None]:
# Keep only the int64/float64 columns.
numeric_df = df.select_dtypes(include=['int64', 'float64'])
from itertools import combinations
# One scatter figure per unordered pair of numeric columns; the number of
# figures grows quadratically with the column count.
for x_col, y_col in combinations(numeric_df.columns, 2):
    plot_scatter(numeric_df, x_col, y_col)

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
sns.heatmap(numeric_df.corr())

# Before scaling the data

In [None]:
# Distinct values of the 'Country' column.
df['Country'].unique()

In [None]:
# Drop 'Country', 'Status' and 'Year' from df in place. Re-running this cell
# raises KeyError because the columns are already gone.
df.drop(['Country','Status','Year'],axis=1,inplace=True)

In [None]:
# Display the frame after the three columns were dropped.
df

In [None]:
# Target is life expectancy; every other remaining column is a feature.
Y = df['Life expectancy ']
X = df.drop(columns=['Life expectancy '])

In [None]:
# Hold out 20% of the rows for testing; random_state=42 fixes the shuffle.
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42) 

In [None]:
# Baseline regressors. Note the orientation: keys are the estimator instances,
# values are their display names.
models = {
    LinearRegression(): "LinearRegression",
    DecisionTreeRegressor(max_depth=4): "DecisionTreeRegressor",
    RandomForestRegressor(max_depth=4): "RandomForestRegressor"
}

In [None]:
# Fit each baseline on the unscaled training split and print R2 / MAE / MSE for
# the training and test splits. Iterating the dict yields the estimator keys,
# so `{model}` in the messages is the estimator's repr.
for model in models: 
    model.fit(X_train,Y_train)
    Y_pred_train=model.predict(X_train)
    Y_pred_test=model.predict(X_test)

    print(f"the r2 score of the model:{model} of the training   is",r2_score(Y_train,Y_pred_train))
    print(f"the mae of the model {model} is of the training is",mean_absolute_error(Y_train,Y_pred_train))
    print(f"the mse of the model {model} is of the training is ",mean_squared_error(Y_train,Y_pred_train))    
    print('------------------')
    print(f"the r2 score of the model:{model} of the test   is",r2_score(Y_test,Y_pred_test))
    print(f"the mae of the model {model} is of the test is",mean_absolute_error(Y_test,Y_pred_test))
    print(f"the mse of the model {model} is of the test is ",mean_squared_error(Y_test,Y_pred_test))   

    print('')
    print('--------------------')

# So, after checking each model's R2 score and MAE, we are going to use the Voting Regressor

In [None]:
# Voting ensemble over fresh instances of the three baseline regressors
# (same max_depth=4 settings as the baselines).
vc = VotingRegressor(estimators=[
    ('LinearRegression', LinearRegression()),
    ('DecisionTreeRegressor', DecisionTreeRegressor(max_depth=4)),
    ('RandomForestRegressor', RandomForestRegressor(max_depth=4))
])

In [None]:
# Fit the ensemble on the unscaled training split.
vc.fit(X_train,Y_train)

In [None]:
# Ensemble predictions for both splits.
Y_pred_train_vc=vc.predict(X_train)
Y_pred_test_vc=vc.predict(X_test)

In [None]:
# R2 / MAE / MSE of the first voting ensemble (`vc`): training split first, then test split.
print(f"the r2 score of the model voting regsoor on the training   is",r2_score(Y_train,Y_pred_train_vc))
print(f"the mae of the model voting regsoor on the training   is",mean_absolute_error(Y_train,Y_pred_train_vc))
print(f"the mse of the model voting regsoor on the training   is",mean_squared_error(Y_train,Y_pred_train_vc))    
print('------------------')
print(f"the r2 score of themodel voting regsoor on the testing  is",r2_score(Y_test,Y_pred_test_vc))
print(f"the mae of the model voting regsoor on the testing  is",mean_absolute_error(Y_test,Y_pred_test_vc))
print(f"the mse of the model voting regsoor on the testing  is",mean_squared_error(Y_test,Y_pred_test_vc))  

In [None]:
# 5-fold cross-validated R² of the first voting ensemble on the full X, Y.
scores1 = cross_val_score(vc, X, Y, cv=5, scoring='r2')
# Report the mean of this cell's own scores; `scores2` is only created by a
# later cell and does not exist yet when the notebook runs top to bottom.
print(f"Cross-validated R² for {scores1}: {scores1.mean():.4f}")

In [None]:
from sklearn.model_selection import GridSearchCV


In [None]:
# Hyperparameter grids keyed by model name. Models without an entry here
# (LinearRegression) are fitted without a search by the tuning loop.
param_grids = {
    "DecisionTreeRegressor": {
        "max_depth": [2, 4, 6, 8, 10],
        "min_samples_split": [2, 5, 10]
    },
    "RandomForestRegressor": {
        "n_estimators": [50, 100],
        "max_depth": [2, 4, 6, 8, 10],
        "min_samples_split": [2, 5, 10]
    }
}


In [None]:
# Rebinds `models`, this time keyed by name with a default-parameter estimator as the value.
models = {
    "LinearRegression": LinearRegression(),
    "DecisionTreeRegressor": DecisionTreeRegressor(),
    "RandomForestRegressor": RandomForestRegressor()
}


In [None]:
# Fitted estimator per model name, filled in by the loop below.
best_models = {}
for name, model in models.items():
    print(f"\n🔍 Tuning and training: {name}")
    
    if name in param_grids:
        # 5-fold grid search scored by R² on the training split only.
        grid = GridSearchCV(model, param_grids[name], cv=5, scoring='r2', n_jobs=-1)
        grid.fit(X_train, Y_train)
        best_model = grid.best_estimator_
        print("Best Params:", grid.best_params_)
    else:
        # No grid defined for this model: fit it as configured.
        model.fit(X_train, Y_train)
        best_model = model
    
    best_models[name] = best_model

    # Predictions
    Y_pred_train = best_model.predict(X_train)
    Y_pred_test = best_model.predict(X_test)

    # Metrics
    print(f"Train R²: {r2_score(Y_train, Y_pred_train):.4f}")
    print(f"Train MAE: {mean_absolute_error(Y_train, Y_pred_train):.4f}")
    print(f"Train MSE: {mean_squared_error(Y_train, Y_pred_train):.4f}")
    print('------------------')
    print(f"Test R²: {r2_score(Y_test, Y_pred_test):.4f}")
    print(f"Test MAE: {mean_absolute_error(Y_test, Y_pred_test):.4f}")
    print(f"Test MSE: {mean_squared_error(Y_test, Y_pred_test):.4f}")
    print('--------------------')

In [None]:
# Second voting ensemble with hard-coded tree/forest hyperparameters.
# NOTE(review): presumably copied from the grid-search output; confirm, since
# `best_models` from the tuning loop is not reused here.
best_tree = DecisionTreeRegressor(max_depth=10, min_samples_split=5)
best_rf = RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_split=2)
lr = LinearRegression()  


vc2 = VotingRegressor(estimators=[
    ('LinearRegression', lr),
    ('DecisionTree', best_tree),
    ('RandomForest', best_rf)
])
vc2.fit(X_train, Y_train)

# Predictions of the second ensemble on the unscaled splits.
Y_pred_train_vc2 = vc2.predict(X_train)
Y_pred_test_vc2 = vc2.predict(X_test)

In [None]:
# R2 / MAE / MSE of the second voting ensemble (`vc2`): training split first, then test split.
print(f"the r2 score of the model voting regsoor on the training   is",r2_score(Y_train,Y_pred_train_vc2))
print(f"the mae of the model voting regsoor on the training   is",mean_absolute_error(Y_train,Y_pred_train_vc2))
print(f"the mse of the model voting regsoor on the training   is",mean_squared_error(Y_train,Y_pred_train_vc2))    
print('------------------')
print(f"the r2 score of themodel voting regsoor on the testing  is",r2_score(Y_test,Y_pred_test_vc2))
print(f"the mae of the model voting regsoor on the testing  is",mean_absolute_error(Y_test,Y_pred_test_vc2))
print(f"the mse of the model voting regsoor on the testing  is",mean_squared_error(Y_test,Y_pred_test_vc2))  

In [None]:
# Rebinds `models` to default-parameter estimators keyed by name.
models = {
    "LinearRegression": LinearRegression(),
    "DecisionTreeRegressor": DecisionTreeRegressor(),
    "RandomForestRegressor": RandomForestRegressor()
}

# Mean 5-fold cross-validated R² on the full X, Y for each model.
for name, model in models.items():
    scores = cross_val_score(model, X, Y, cv=5, scoring='r2')
    print(f"Cross-validated R² for {name}: {scores.mean():.4f}")


In [None]:
# Rebinds `models` to estimators with hard-coded hyperparameters.
# NOTE(review): the tree uses min_samples_split=10 here but 5 in the voting
# ensembles; confirm which value is intended.
models = {
    "LinearRegression": LinearRegression(),
    "DecisionTreeRegressor": DecisionTreeRegressor(max_depth=10, min_samples_split=10),
    "RandomForestRegressor": RandomForestRegressor(max_depth=10, min_samples_split=2, n_estimators=100)
}

# Mean 5-fold cross-validated R² on the full X, Y for each model.
for name, model in models.items():
    scores = cross_val_score(model, X, Y, cv=5, scoring='r2')
    print(f"Cross-validated R² for {name}: {scores.mean():.4f}")

In [None]:
# 5-fold cross-validated R² of the second voting ensemble on the full X, Y.
scores2 = cross_val_score(vc2, X, Y, cv=5, scoring='r2')
# The first placeholder prints the per-fold score array (not a model name), then the mean.
print(f"Cross-validated R² for {scores2}: {scores2.mean():.4f}")

# Now we are going to scale the data

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler used for the scaled-feature experiments below.
sc=StandardScaler()

In [None]:
# Fit the scaler on the training split only, then apply the same transform to the test split.
X_train_new=sc.fit_transform(X_train)
X_test_new=sc.transform(X_test)

In [None]:
# Wrap the scaled values in DataFrames that carry the original feature names.
# NOTE(review): no index is passed, so these frames presumably get a fresh
# default index rather than the row labels of X_train / X_test; confirm
# nothing downstream aligns on index.
X_train_new=pd.DataFrame(X_train_new,columns=X.columns)
X_test_new=pd.DataFrame(X_test_new,columns=X.columns)

In [None]:
# Same baselines as the unscaled run (keys are estimator instances, values are names).
models = {
    LinearRegression(): "LinearRegression",
    DecisionTreeRegressor(max_depth=4): "DecisionTreeRegressor",
    RandomForestRegressor(max_depth=4): "RandomForestRegressor"
}
# Fit on the scaled training features and print R2 / MAE / MSE for both splits.
for model in models: 
    model.fit(X_train_new,Y_train)
    Y_pred_train=model.predict(X_train_new)
    Y_pred_test=model.predict(X_test_new)

    print(f"the r2 score of the model:{model} of the training   is",r2_score(Y_train,Y_pred_train))
    print(f"the mae of the model {model} is of the training is",mean_absolute_error(Y_train,Y_pred_train))
    print(f"the mse of the model {model} is of the training is ",mean_squared_error(Y_train,Y_pred_train))    
    print('------------------')
    print(f"the r2 score of the model:{model} of the test   is",r2_score(Y_test,Y_pred_test))
    print(f"the mae of the model {model} is of the test is",mean_absolute_error(Y_test,Y_pred_test))
    print(f"the mse of the model {model} is of the test is ",mean_squared_error(Y_test,Y_pred_test))   

    print('')
    print('--------------------')

In [None]:
# Third voting ensemble: same hard-coded hyperparameters as vc2, trained on the
# scaled features. Rebinds best_tree, best_rf and lr to fresh estimators.
best_tree = DecisionTreeRegressor(max_depth=10, min_samples_split=5)
best_rf = RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_split=2)
lr = LinearRegression()  


vc3 = VotingRegressor(estimators=[
    ('LinearRegression', lr),
    ('DecisionTree', best_tree),
    ('RandomForest', best_rf)
])
vc3.fit(X_train_new, Y_train)

# Predictions of the third ensemble on the scaled splits.
Y_pred_train_vc3 = vc3.predict(X_train_new)
Y_pred_test_vc3 = vc3.predict(X_test_new)

In [None]:
# R2 / MAE / MSE of the scaled-data voting ensemble (`vc3`). The test metrics
# must use vc3's own test predictions; scoring Y_pred_test_vc2 here would report
# the unscaled ensemble's results under the scaled model's heading.
print(f"the r2 score of the model voting regsoor on the training   is",r2_score(Y_train,Y_pred_train_vc3))
print(f"the mae of the model voting regsoor on the training   is",mean_absolute_error(Y_train,Y_pred_train_vc3))
print(f"the mse of the model voting regsoor on the training   is",mean_squared_error(Y_train,Y_pred_train_vc3))
print('------------------')
print(f"the r2 score of themodel voting regsoor on the testing  is",r2_score(Y_test,Y_pred_test_vc3))
print(f"the mae of the model voting regsoor on the testing  is",mean_absolute_error(Y_test,Y_pred_test_vc3))
print(f"the mse of the model voting regsoor on the testing  is",mean_squared_error(Y_test,Y_pred_test_vc3))

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Baseline estimators mapped to their display names (estimator instance -> name).
models = {
    LinearRegression(): "LinearRegression",
    DecisionTreeRegressor(max_depth=4): "DecisionTreeRegressor",
    RandomForestRegressor(max_depth=4): "RandomForestRegressor"
}

# Fitted estimators keyed by display name, for reuse in later cells.
fitted_models = {}

# Fit each estimator on the scaled training features, keep it, and print its metrics.
for model in models:
    model.fit(X_train_new, Y_train)
    fitted_models[models[model]] = model  # Store by name

    Y_pred_train = model.predict(X_train_new)
    Y_pred_test = model.predict(X_test_new)

    print(f"--- {models[model]} ---")
    print(f"Train R2: {r2_score(Y_train, Y_pred_train):.4f}")
    print(f"Train MAE: {mean_absolute_error(Y_train, Y_pred_train):.4f}")
    print(f"Train MSE: {mean_squared_error(Y_train, Y_pred_train):.4f}")
    print(f"Test R2: {r2_score(Y_test, Y_pred_test):.4f}")
    print(f"Test MAE: {mean_absolute_error(Y_test, Y_pred_test):.4f}")
    print(f"Test MSE: {mean_squared_error(Y_test, Y_pred_test):.4f}")
    print("----------------------\n")


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Feature labels of the scaled training frame, used to index the importances.
feature_names = X_train_new.columns

# Random forest regressor: importances sorted high to low, drawn as a bar chart.
# (`rfc` holds the regressor stored under 'RandomForestRegressor'.)
rfc = fitted_models['RandomForestRegressor']
rfc_importances = pd.Series(rfc.feature_importances_, index=feature_names)
rfc_importances.sort_values(ascending=False).plot(kind='bar', figsize=(12,6), title="Random Forest Feature Importances")
plt.ylabel("Importance Score")
plt.show()

# Decision tree regressor: same chart, in orange.
# (`dtc` holds the regressor stored under 'DecisionTreeRegressor'.)
dtc = fitted_models['DecisionTreeRegressor']
dtc_importances = pd.Series(dtc.feature_importances_, index=feature_names)
dtc_importances.sort_values(ascending=False).plot(kind='bar', figsize=(12,6), title="Decision Tree Feature Importances", color='orange')
plt.ylabel("Importance Score")
plt.show()


## ✅ Final Thoughts and What I’ve Learned

After working on the Life Expectancy dataset, I’ve gained a lot of insights into how different socio-economic and health factors contribute to how long people live in a country. This project wasn’t just about building models — it helped me truly understand the data and its underlying patterns.

---

### 🔍 What I Understood from the Data

- The dataset had quite a few **missing values**, and I learned how important it is to handle them wisely. I used **mean** or **median imputation** based on the skewness of each feature.
- Scaling features using **StandardScaler** helped improve the performance of models like Linear Regression.
- I noticed strong correlations and patterns between life expectancy and other variables — especially **HIV/AIDS**, **Adult Mortality**, and **Income Composition of Resources**.

---

### 📊 Key Insights I Discovered

- **HIV/AIDS rate** has one of the most negative impacts on life expectancy. Reducing its prevalence could dramatically improve a country’s life span.
- **Income Composition of Resources** — basically a measure of wealth distribution and access — had a positive effect on life expectancy.
- **Schooling** and **BMI** were also positively related to a longer life.
- **Adult Mortality** is a very intuitive but powerful feature — higher mortality means lower life expectancy.

---

### 🤖 My Model Building Journey

I tried three different models:
- Linear Regression
- Decision Tree Regressor
- Random Forest Regressor

Then I combined them using a **Voting Regressor** to improve accuracy by blending their strengths. After testing both **scaled and unscaled data**, I got the best results with scaling.

**Final Model (Voting Regressor - Scaled Data):**
- **R² Score on Training Set**: ~0.89
- **R² Score on Test Set**: ~0.87

These results show that the model learned well and didn’t just memorize the training data.

---

### 🧠 What I Learned Overall

- Ensemble models like **Random Forest** are quite powerful and reliable.
- Handling missing data, scaling features, and understanding feature importance can drastically impact model performance.
- **Data visualization** and **feature-importance plots** helped me understand what the model was thinking and why it made certain predictions (SHAP plots are a next step, listed below).

---

### 💡 What I’d Like to Improve or Do Next

- Explore more advanced models like **XGBoost** or **LightGBM**.
- Fine-tune hyperparameters using **RandomizedSearchCV** or **GridSearchCV**.
- Try **SHAP** (SHapley Additive exPlanations) for deeper interpretability of feature contributions.
- Add external factors (like country-level policy indicators) to enrich the dataset.

---

This project taught me a lot more than just how to build a machine learning model — it helped me **connect data science to real-world outcomes**, especially in health and development. 🚀
