In [None]:
import pandas as pd
import plotly.express as px

In [None]:
# Location of the life-expectancy CSV inside the Kaggle input directory
csv_path = '/kaggle/input/life-expectancy-and-socio-economic-world-bank/life expectancy.csv'
df = pd.read_csv(csv_path)

# Preview the top rows as this cell's output
df.head()

In [None]:
# Keep only the rows that have a value for the column driving the marker size
size_column = "Education Expenditure %"
df_cleaned = df.dropna(subset=[size_column])

# World map with one marker per row: marker size encodes education expenditure,
# marker colour encodes life expectancy.
# NOTE(review): scatter_geo's default locationmode expects ISO-3 codes; this
# assumes "Country Code" holds them — confirm against the dataset.
fig = px.scatter_geo(
    df_cleaned,
    locations="Country Code",
    color="Life Expectancy World Bank",
    size=size_column,
    hover_name="Country Name",
    projection="natural earth",
    # Name only what this figure encodes; health expenditure is plotted in a
    # separate figure, so it does not belong in this title.
    title="Education Expenditure % and Life Expectancy",
)

# Black coastlines over white land. update_geos already sets showcoastlines,
# so no additional update_layout(geo=...) call is needed.
fig.update_geos(showcoastlines=True, coastlinecolor="Black", showland=True, landcolor="white")

# Render the figure
fig.show()

In [None]:
# Keep only the rows that have a value for the column driving the marker size
size_column = "Health Expenditure %"
df_cleaned = df.dropna(subset=[size_column])

# World map with one marker per row: marker size encodes health expenditure,
# marker colour encodes life expectancy.
fig = px.scatter_geo(
    df_cleaned,
    locations="Country Code",
    color="Life Expectancy World Bank",
    size=size_column,
    hover_name="Country Name",
    projection="natural earth",
    # Name only what this figure encodes; education expenditure is plotted in a
    # separate figure, so it does not belong in this title.
    title="Health Expenditure % and Life Expectancy",
)

# Black coastlines over white land. update_geos already sets showcoastlines,
# so no additional update_layout(geo=...) call is needed.
fig.update_geos(showcoastlines=True, coastlinecolor="Black", showland=True, landcolor="white")

# Render the figure
fig.show()

In [None]:
# Compute both correlations from the full frame rather than from `df_cleaned`.
# At this point `df_cleaned` only holds rows with a health-expenditure value,
# which would silently restrict the education correlation to that subset.
# Series.corr excludes pairs with a missing value on either side, so no
# explicit dropna is required here.
life_expectancy = df['Life Expectancy World Bank']
correlation_health = df['Health Expenditure %'].corr(life_expectancy)
correlation_education = df['Education Expenditure %'].corr(life_expectancy)

print(f"Correlation between Health Expenditure and Life Expectancy: {correlation_health:.2f}")
print(f"Correlation between Education Expenditure and Life Expectancy: {correlation_education:.2f}")

In [None]:
# Keep only the rows that have an unemployment value (the marker-size column)
size_column = "Unemployment"
df_cleaned = df.dropna(subset=[size_column])

# World map: marker size encodes unemployment, marker colour encodes the
# prevalence of undernourishment
fig = px.scatter_geo(
    df_cleaned,
    locations="Country Code",
    color="Prevelance of Undernourishment",
    size=size_column,
    hover_name="Country Name",
    projection="natural earth",
    title="% share of the labor force that is without work but available for and seeking employment",
)

# Black coastlines over white land (one call sets every geo option used here)
geo_style = dict(showcoastlines=True, coastlinecolor="Black", showland=True, landcolor="white")
fig.update_geos(**geo_style)

# Render the figure
fig.show()

In [None]:
# Keep only the rows that have a corruption value (the marker-size column)
size_column = "Corruption"
df_cleaned = df.dropna(subset=[size_column])

# World map: marker size encodes the corruption value, marker colour encodes
# life expectancy
fig = px.scatter_geo(
    df_cleaned,
    locations="Country Code",
    color="Life Expectancy World Bank",
    size=size_column,
    hover_name="Country Name",
    projection="natural earth",
    title="Corruption(CPIA rating)",
)

# Black coastlines over white land (one call sets every geo option used here)
geo_style = dict(showcoastlines=True, coastlinecolor="Black", showland=True, landcolor="white")
fig.update_geos(**geo_style)

# Render the figure
fig.show()

In [None]:
# Correlation between the corruption value and life expectancy, computed on
# the current `df_cleaned` frame
corruption = df_cleaned['Corruption']
life_expectancy = df_cleaned['Life Expectancy World Bank']
correlation_corruption = corruption.corr(life_expectancy)

print(f"Correlation between Corruption and Life Expectancy: {correlation_corruption:.2f}")

In [None]:
# Keep only the rows that have a sanitation value (the marker-size column)
size_column = "Sanitation"
df_cleaned = df.dropna(subset=[size_column])

# World map: marker size encodes sanitation, marker colour encodes CO2
fig = px.scatter_geo(
    df_cleaned,
    locations="Country Code",
    color="CO2",
    size=size_column,
    hover_name="Country Name",
    projection="natural earth",
    title="Sanitation",
)

# Black coastlines over white land (one call sets every geo option used here)
geo_style = dict(showcoastlines=True, coastlinecolor="Black", showland=True, landcolor="white")
fig.update_geos(**geo_style)

# Render the figure
fig.show()

In [None]:
# Correlation between sanitation and CO2, computed on the current `df_cleaned`
# frame (filtered to rows that have a sanitation value by the preceding cell)
correlation_sanitation = df_cleaned['Sanitation'].corr(df_cleaned['CO2'])

# The label names the two columns actually correlated above
print(f"Correlation between Sanitation and CO2: {correlation_sanitation:.2f}")

In [None]:
# Keep only the rows that have an injuries value (the marker-size column)
size_column = "Injuries"
df_cleaned = df.dropna(subset=[size_column])

# World map: marker size encodes injuries, marker colour encodes health
# expenditure
fig = px.scatter_geo(
    df_cleaned,
    locations="Country Code",
    color="Health Expenditure %",
    size=size_column,
    hover_name="Country Name",
    projection="natural earth",
    title="Injuries",
)

# Black coastlines over white land (one call sets every geo option used here)
geo_style = dict(showcoastlines=True, coastlinecolor="Black", showland=True, landcolor="white")
fig.update_geos(**geo_style)

# Render the figure
fig.show()

In [None]:
# Correlation between injuries and health expenditure, computed on the current
# `df_cleaned` frame (filtered to rows that have an injuries value by the
# preceding cell)
correlation_injuries = df_cleaned['Injuries'].corr(df_cleaned['Health Expenditure %'])

# The label names the two columns actually correlated above
print(f"Correlation between Injuries and Health Expenditure: {correlation_injuries:.2f}")

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

In [None]:
# Re-read the raw CSV so the modelling cells start from an unfiltered frame,
# then discard every row that has no life-expectancy value
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/life-expectancy-and-socio-economic-world-bank/life expectancy.csv'
).dropna(subset=['Life Expectancy World Bank'])

In [None]:
# Columns used as model inputs and the column to be predicted
feature_columns = [
    'Health Expenditure %',
    'Education Expenditure %',
    'Prevelance of Undernourishment',
    'CO2',
    'Unemployment',
]
target_column = 'Life Expectancy World Bank'

# Feature matrix X and target y, both taken from the same rows of df
X = df[feature_columns]
y = df[target_column]

In [None]:
# Rows without a life-expectancy value were dropped when df was prepared, so
# there is nothing to impute in the target; filling labels with the mean would
# only fabricate training targets. Just convert y to the (n_samples, 1) float
# column layout that the later cells index into.
y = np.asarray(y, dtype=float).reshape(-1, 1)

# Guard the invariant the step above relies on
assert not np.isnan(y).any(), "target still contains missing values"

In [None]:
# Split first (80% train, 20% test) so the test rows stay unseen by every
# preprocessing step that learns statistics from the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fill missing feature values with per-column means learned from the training
# rows only, then apply those same means to the test rows. Fitting the imputer
# on all rows before splitting would leak test-set information into training.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

In [None]:
# Build a linear regression model and fit it on the training split
# (fit returns the estimator itself, so construction and fitting can be chained)
model = LinearRegression().fit(X_train, y_train)

# Show the fitted estimator as this cell's output
model

In [None]:
# Run the fitted model on the held-out test features
y_pred = model.predict(X=X_test)

In [None]:
# Score the test-set predictions with three regression metrics, in the order
# MAE, MSE, R^2
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

In [None]:
# Report each metric rounded to two decimals, one per line
for label, value in (
    ("Mean Absolute Error", mae),
    ("Mean Squared Error", mse),
    ("R-squared", r2),
):
    print(f"{label}: {value:.2f}")

In [None]:
# Coefficients and intercept of the fitted model
coefficients = model.coef_
intercept = model.intercept_

# Label each coefficient with the feature it belongs to. These must be the
# columns selected for X, in the same order; slicing df.columns would instead
# pair the coefficients with whichever columns happen to come first in the CSV.
feature_names = [
    'Health Expenditure %',
    'Education Expenditure %',
    'Prevelance of Undernourishment',
    'CO2',
    'Unemployment',
]

# ravel() handles both the (1, n_features) layout produced by a 2-D target and
# the flat layout produced by a 1-D target
coef_values = np.ravel(coefficients)
assert len(coef_values) == len(feature_names), "feature names out of sync with the model"

print("Coefficients:")
for feature, coef in zip(feature_names, coef_values):
    print(f"{feature}: {coef:.2f}")
print(f"Intercept: {np.ravel(intercept)[0]:.2f}")