In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Path to the raw CSV; "/content" looks like a Colab working directory —
# adjust when running elsewhere.
DATA_PATH = "/content/Life Expectancy Data.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
df.head()

In [None]:
df.tail()

## **Sanity Check of Data**

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.isnull().sum()  # Missing values

In [None]:
df.isnull().sum()/df.shape[0]*100  # Percentage of missing values

In [None]:
df.duplicated().sum()  # Duplicate values

In [None]:
# Garbage values: print the frequency of every distinct value in each
# object-typed column, followed by a separator line.
for _, text_column in df.select_dtypes(include='object').items():
    print(text_column.value_counts())
    print("***" * 10)

## Exploratory Data Analysis


In [None]:
df.describe().T

In [None]:
df.describe(include="object")

In [None]:
import warnings

# Suppress every warning from this point on (global filter).
warnings.filterwarnings("ignore")

# One histogram per numeric column, each rendered as its own figure.
numeric_columns = df.select_dtypes(include="number").columns
for column in numeric_columns:
    sns.histplot(data=df, x=column)
    plt.show()

In [None]:
import warnings

# Suppress every warning from this point on (global filter).
warnings.filterwarnings("ignore")

# One box plot per numeric column, each rendered as its own figure.
numeric_columns = df.select_dtypes(include="number").columns
for column in numeric_columns:
    sns.boxplot(data=df, x=column)
    plt.show()

In [None]:
df.select_dtypes(include="number").columns

In [None]:
# Scatter every numeric feature against the life-expectancy target.
# The feature list is derived from the frame instead of being hard-coded, and
# the target is matched on its whitespace-stripped name, so this cell works
# both before and after the column names are stripped further down the
# notebook (the hard-coded padded names raised KeyError on a re-run).
target = next(c for c in df.columns if c.strip() == "Life expectancy")
features = df.select_dtypes(include="number").columns.drop(target)
for col in features:
    sns.scatterplot(data=df, x=col, y=target)
    plt.show()

In [None]:
s = df.select_dtypes(include="number").corr()

In [None]:
# Annotated heatmap of the correlation matrix held in `s`.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(s, annot=True, ax=ax)

# Missing Value Treatment

In [None]:
df.isnull().sum()

In [None]:
df.columns = df.columns.str.strip()


In [None]:
# Median imputation for the selected columns.
# The filled Series is assigned back to the frame: calling
# fillna(..., inplace=True) on the df[col] selection is a chained in-place
# update, which is deprecated in pandas 2.x and leaves df unchanged once
# Copy-on-Write is active.
for col in ["BMI", "Polio", "Income composition of resources"]:
    df[col] = df[col].fillna(df[col].median())


In [None]:
df.isnull().sum()

In [None]:
from sklearn.impute import KNNImputer
imputer = KNNImputer()

In [None]:
# Impute the remaining numeric gaps with KNN, fitting on all numeric columns
# together. Fitting on a single column at a time gives the imputer no other
# feature to measure neighbour distance with, so every gap would just receive
# the column mean.
# NOTE(review): distances are computed on unscaled features — consider
# scaling first if large-magnitude columns should not dominate.
numeric_cols = df.select_dtypes(include="number").columns
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

In [None]:
df.isnull().sum()

# **Outlier Treatment**

In [None]:
# IQR Method
def wisker(col):
    """Return the IQR whisker bounds ``(lower_bound, upper_bound)`` of ``col``.

    The bounds are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Missing values (NaN) are ignored when computing the percentiles, so a
    column that still contains gaps yields usable bounds instead of
    ``(nan, nan)``.

    Parameters
    ----------
    col : array-like of numbers (e.g. a pandas Series or a list)

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)``
    """
    q1, q3 = np.nanpercentile(col, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)
    return lower_bound, upper_bound

In [None]:
wisker(df['GDP'])

In [None]:
# Cap values that fall outside the IQR whiskers.
# Both bounds returned by wisker() are applied; previously the lower bound was
# computed but never used, so only the upper side was capped.
for col in ['GDP', 'Total expenditure', 'thinness  1-19 years', 'thinness 5-9 years']:
    lower_bound, upper_bound = wisker(df[col])
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

In [None]:
# Re-draw the box plots of the capped columns.
# The column is passed by keyword (data=..., x=...) like the earlier box-plot
# cell: a positionally passed Series was read as `x` by older seaborn releases
# and as `data` by newer ones, so the plot depended on the installed version.
for col in ['GDP', 'Total expenditure', 'thinness  1-19 years', 'thinness 5-9 years']:
    sns.boxplot(data=df, x=col)
    plt.show()


# Duplicate or Garbage Values


In [None]:
# drop_duplicates() returns a new frame; assign it back so the duplicate rows
# are actually removed from df (the bare call only displayed the result).
df = df.drop_duplicates()
df.shape

# Encoding of Data


In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each encoded column.
categorical_cols = ["Country", "Status"]
dummy = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [None]:
dummy

In [None]:
df.columns