In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the life-expectancy dataset from the CSV file in the working directory
DATA_FILE = 'Life Expectancy Data.csv'
LED = pd.read_csv(DATA_FILE)

# Preview the first five rows
LED.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [5]:
# Convert the Country and Status text columns into integer codes (label encoding)

from sklearn.preprocessing import LabelEncoder

# Keep one fitted encoder per column. Re-fitting a single encoder on the second
# column would overwrite the mapping learned for the first one, making it
# impossible to translate the Country codes back to names later with
# encoders['Country'].inverse_transform(...).
# NOTE(review): integer codes impose an arbitrary order on Country when they are
# fed to a linear model — confirm this is intended (one-hot encoding is the
# usual alternative).
encoders = {}
for column in ('Country', 'Status'):
    encoders[column] = LabelEncoder()
    LED[column] = encoders[column].fit_transform(LED[column])

# Backward-compatible name: as before, `label_encoder` ends up fitted on 'Status'
label_encoder = encoders['Status']

LED.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,0,2015,1,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,0,2014,1,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,0,2013,1,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,0,2012,1,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,0,2011,1,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [6]:
# Drop every row that has at least one null value.
# Rebinding (instead of inplace=True) keeps the step explicit; the surviving
# rows keep their original index labels because the index is not reset.
LED = LED.dropna()

# Confirm the removal: every column should now report 0 nulls.
# (Only the last expression of a cell is displayed, so this is the one check shown.)
LED.isna().sum()

Unnamed: 0,0
Country,0
Year,0
Status,0
Life expectancy,0
Adult Mortality,0
infant deaths,0
Alcohol,0
percentage expenditure,0
Hepatitis B,0
Measles,0


In [7]:
# Separate the target (stored in y) from the predictor columns (stored in X).
# The column name in this dataset ends with a space: 'Life expectancy '.
target_column = 'Life expectancy '
y = LED[target_column]
X = LED.drop(columns=[target_column])

In [8]:
# Report the dimensions of the feature matrix X and the target vector y
for caption, data in (("Shape of X:", X), ("Shape of y:", y)):
    print(caption, data.shape)

Shape of X: (1649, 21)
Shape of y: (1649,)


In [9]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Multiple Linear Regression
# =======================================
# Fit on the training split (fit() returns the estimator itself)
MLR = LinearRegression().fit(X_train, y_train)

# Predict the held-out test rows and score those predictions
y_pred_multiple = MLR.predict(X_test)
r2_multiple = r2_score(y_test, y_pred_multiple)
mse_multiple = mean_squared_error(y_test, y_pred_multiple)

print(
    "\nMultiple Linear Regression",
    f"  Mean Squared Error: {mse_multiple:.4f}",
    f"  R² Score: {r2_multiple:.4f}",
    sep="\n",
)



Multiple Linear Regression
  Mean Squared Error: 13.0270
  R² Score: 0.8166


In [12]:
# Show the fitted model's intercept, rounded to two decimals
intercept_rounded = round(MLR.intercept_, 2)
print(intercept_rounded)

281.26


In [13]:
# Predict the target for the first five rows of X (selected by position)
first_five_rows = X.iloc[:5]
MLR.predict(first_five_rows)

array([62.52739286, 62.71055361, 62.72526582, 62.64505024, 62.1226279 ])

In [14]:
# Put each held-out actual value next to its prediction and print the first five pairs
predictions = y_test.to_frame(name='Actual').assign(Predicted=y_pred_multiple)
print(predictions.head())

      Actual  Predicted
1210    67.5  71.834622
2273    73.8  72.663319
1005    79.1  80.821234
2926    54.9  54.328367
518     48.6  52.097605


In [15]:
# Serialization
# Model persistence (saving and loading trained models)
import pickle

In [17]:
# save: the 'with' statement closes the file automatically when the block ends;
# mode 'wb' = write binary
with open('model.pkl', 'wb') as model_file:
    pickle.dump(MLR, model_file)

In [18]:
# load: mode 'rb' = read binary
# NOTE: unpickling can run arbitrary code — only load pickle files you trust
with open('model.pkl', 'rb') as model_file:
    clf2 = pickle.load(model_file)

In [19]:
# Predict the first five rows of X with clf2, the model restored from model.pkl
clf2.predict(X.iloc[:5])

array([62.52739286, 62.71055361, 62.72526582, 62.64505024, 62.1226279 ])

# Interpretation: Multiple Linear Regression Analysis

**Model Overview:** A Multiple Linear Regression model was developed to predict life expectancy (recorded per country and year) from multiple predictor variables. The dataset was pre-processed by converting the categorical variables (Country and Status) into numerical format to ensure compatibility with the regression model, and rows containing missing values were dropped.

Model Parameters

a) **Intercept (𝛽0)** = 281.26, The Intercept represents the predicted life expectancy when all independent variables are zero, which is hypothetical in nature.

b) **R-Squared value** = 0.8166, The R-Squared Value indicates that the model explains 81.66% of the variability in Life Expectancy. This suggests a strong relationship between the predictor variables and the target variable.

c) **Mean Squared Error** = 13.0270, MSE represents the average squared difference between the actual and predicted life expectancy values on the test set. A lower MSE generally indicates a better-fitting model.