In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Absolute location of the raw CSV (presumably a Colab runtime path; adjust
# it when running elsewhere).
file = r"/content/Life Expectancy Data.csv"
# Parse the file with pandas' default options into the working DataFrame.
data = pd.read_csv(filepath_or_buffer=file)
# Preview the first five rows as a quick check of the columns that were read.
data.head(n=5)

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Define the target and features
# NOTE: the trailing space in the target name is part of the lookup key used
# below (presumably it mirrors the raw CSV header) - do not strip it.
target = 'Life expectancy '

# The loaded frame carries an 'Unnamed: 0' column (see the data.head() preview,
# where it simply counts 0, 1, 2, ...). It appears to be a row index that was
# saved into the CSV rather than a measurement, so it must not be used as a
# predictor: it would let the row ordering leak into the model.
index_artifact_cols = [col for col in data.columns if str(col).startswith('Unnamed:')]
features = [
    col for col in data.columns
    if col != target and col not in index_artifact_cols
]

# Drop rows where the target is missing, then separate features and target.
# X and y are built once, from the cleaned frame only.
data_cleaned = data.dropna(subset=[target])
X = data_cleaned[features]
y = data_cleaned[target]

# Identify numerical and categorical columns.
# 'number' selects every numeric dtype (not only float64/int64), and every
# remaining column is treated as categorical, so no feature can silently fall
# outside both lists and vanish from the model input.
numerical_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(exclude='number').columns.tolist()

# Split the dataset into training and testing sets (80/20, fixed seed so the
# split is reproducible across runs).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Handle numerical features: mean imputation followed by standardisation.
# Both transformers are fitted on the training split only and then applied
# unchanged to the test split.
numerical_imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()

# - Training split: fit + transform (impute first, then scale)
X_train_num = pd.DataFrame(
    numerical_imputer.fit_transform(X_train[numerical_cols]),
    index=X_train.index,
    columns=numerical_cols,
)
X_train_num = pd.DataFrame(
    scaler.fit_transform(X_train_num),
    index=X_train.index,
    columns=numerical_cols,
)

# - Test split: transform only, reusing the statistics learned above
X_test_num = pd.DataFrame(
    numerical_imputer.transform(X_test[numerical_cols]),
    index=X_test.index,
    columns=numerical_cols,
)
X_test_num = pd.DataFrame(
    scaler.transform(X_test_num),
    index=X_test.index,
    columns=numerical_cols,
)

# Handle categorical features: most-frequent imputation, then one-hot encoding.
# As with the numerical columns, everything is fitted on the training split
# and only applied to the test split.

# - Imputation (missing entries take the most frequent training value)
categorical_imputer = SimpleImputer(strategy='most_frequent')
X_train_cat = pd.DataFrame(
    categorical_imputer.fit_transform(X_train[categorical_cols]),
    index=X_train.index,
    columns=categorical_cols,
)
X_test_cat = pd.DataFrame(
    categorical_imputer.transform(X_test[categorical_cols]),
    index=X_test.index,
    columns=categorical_cols,
)

# - One-Hot Encoding (dense output; handle_unknown='ignore' so categories not
#   seen during fitting do not raise at transform time)
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
train_cat_matrix = encoder.fit_transform(X_train_cat)
test_cat_matrix = encoder.transform(X_test_cat)
# Column labels come from the fitted encoder, so they are read after fitting.
encoded_col_names = encoder.get_feature_names_out(categorical_cols)

X_train_cat_encoded = pd.DataFrame(
    train_cat_matrix,
    index=X_train.index,
    columns=encoded_col_names,
)
X_test_cat_encoded = pd.DataFrame(
    test_cat_matrix,
    index=X_test.index,
    columns=encoded_col_names,
)

# Assemble the final design matrices: scaled numerical columns side by side
# with the one-hot encoded categorical columns (both share the split's index).
X_train_processed = pd.concat([X_train_num, X_train_cat_encoded], axis='columns')
X_test_processed = pd.concat([X_test_num, X_test_cat_encoded], axis='columns')

# Fit an ordinary least-squares model on the training split
# (fit() returns the estimator itself, so construction and fitting are chained).
linear_model = LinearRegression().fit(X_train_processed, y_train)

# Predict on the held-out split
y_pred_lr = linear_model.predict(X_test_processed)

# Score the predictions: MSE, its square root (RMSE) and R squared
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)

# Report the three metrics, one per line, rounded to two decimals
print(
    f'Mean Squared Error: {mse_lr:.2f}',
    f'Root Mean Squared Error: {rmse_lr:.2f}',
    f'R Squared: {r2_lr:.2f}',
    sep='\n',
)


Mean Squared Error: 4.32
Root Mean Squared Error: 2.08
R Squared: 0.95


In [16]:
# Tolerance-based accuracy: the share of test predictions whose absolute error
# is within `tolerance` (expressed in the target's own units).
tolerance = 10  # Half-width of the acceptable error band

# Absolute error per test row, then a boolean flag for "close enough"
abs_errors = (y_test - y_pred_lr).abs()
accurate_predictions = abs_errors.le(tolerance)

# Mean of the boolean flags is the hit rate; scale it to a percentage
accuracy = accurate_predictions.mean() * 100

# Print results
print(f"Tolerance-Based Accuracy: {accuracy:.2f}%")


Tolerance-Based Accuracy: 100.00%


In [18]:
from sklearn.neighbors import KNeighborsRegressor
# - KNN Regressor: k = 5 neighbours, trained on the same preprocessed features
#   as the linear model (fit() returns the estimator, so the calls are chained)
knn_model = KNeighborsRegressor(n_neighbors=5).fit(X_train_processed, y_train)

# - Predictions on the held-out split
y_pred_knn = knn_model.predict(X_test_processed)

# - Evaluate the Model: MSE, RMSE (square root of MSE) and R squared
mse_knn = mean_squared_error(y_test, y_pred_knn)
r2_knn = r2_score(y_test, y_pred_knn)
rmse_knn = np.sqrt(mse_knn)

# - Print Results (full precision, one metric per line)
print(
    f'Mean Squared Error For KNN: {mse_knn}',
    f'Root Mean Squared Error For KNN: {rmse_knn}',
    f'R Squared For KNN: {r2_knn}',
    sep='\n',
)

Mean Squared Error For KNN: 9.554748122866897
Root Mean Squared Error For KNN: 3.0910755608472105
R Squared For KNN: 0.893057131747402
