# Prediction of health insurance amount

In [1]:
#Import libraries
import pandas as pd
import pandas_profiling as pp
import numpy as np
import seaborn as sns
import matplotlib.pyplot as mp

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the three disease-indicator datasets from the sibling Dataset folder.
# Forward slashes keep these relative paths valid on Windows, Linux and macOS;
# the previous backslash-separated paths only resolved on Windows.
heart = pd.read_csv("../Dataset/heart.csv")
diab = pd.read_csv("../Dataset/diabetes.csv")
cancer = pd.read_csv("../Dataset/can.csv")

The three most expensive diseases in the world, according to the National Association for Biomedical Research:
1. Heart disease
2. Diabetes 
3. Cancer 

## Analysis of Disease Indicators
We will analyse the dataset for each disease (heart disease, diabetes and cancer) to find the risk factors of that disease. We will then take the intersection of the three sets of risk factors to select the final features.

### Analysis of Heart Disease Indicators dataset

In [None]:
# Preview the first 10 rows of the heart-disease indicators dataset.
heart.head(10)

In [None]:
# Summarise the heart dataset: column names, dtypes and non-null counts.
heart.info()

In [None]:
# Work on a copy so that `heart` itself stays as loaded.
df = heart.copy()

In [None]:
# Pearson correlation of every indicator with the heart-disease target.
importances = df.drop("HeartDiseaseorAttack", axis=1).apply(lambda x: x.corr(df.HeartDiseaseorAttack))

# Positions that sort the coefficients in ascending order.
# Sorting the raw values and selecting with .iloc keeps the lookup explicitly
# positional; `importances[indices]` used integer keys on a string-labelled
# Series, which relies on a deprecated positional fallback.
indices = np.argsort(importances.to_numpy())
print(importances.iloc[indices])

In [44]:
def find_correlations(X, threshold=0.2):
    """Print whether each ordered pair of distinct columns of ``X`` is correlated.

    Two columns are reported as correlated when the absolute value of their
    Pearson correlation coefficient is at least ``threshold``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float, default 0.2
        Minimum absolute correlation for a pair to be reported as correlated.

    Returns
    -------
    None
        One line per ordered pair is written to standard output, so every
        unordered pair appears twice (A vs. B and B vs. A).

    Notes
    -----
    Pairs whose coefficient is undefined (NaN) are skipped without output.
    """
    # Compute the whole matrix once instead of one Series.corr call per pair.
    strength = X.corr().abs()
    names = X.columns
    for i in range(len(names)):
        for j in range(len(names)):
            if i == j:
                continue
            value = strength.iloc[i, j]
            if pd.isna(value):
                # Undefined correlation: neither verdict applies.
                continue
            if value >= threshold:
                print(names[i], " is correlated  with ", names[j])
            else:
                print(names[i], " is not correlated  with ", names[j])
                

In [None]:
# Find independent features: compare the candidate heart-disease
# predictors with each other, pair by pair.
heart_candidates = ['Age', 'HighBP', 'Stroke', 'HighChol', 'Smoker', 'Sex', 'BMI', 'CholCheck']
X = df[heart_candidates]
find_correlations(X)

Selected features: Age, Stroke, Smoker, Sex and BMI.

In [None]:
# Correlation heatmap of the heart-disease target and the shortlisted indicators.
# NOTE(review): CholCheck is plotted here but is not in the "Selected features"
# list in the preceding markdown; confirm which one is intended.
heart_heatmap_columns = [
    "HeartDiseaseorAttack",
    "CholCheck",
    "BMI",
    "Smoker",
    "Stroke",
    "Sex",
    "Age",
]
data = heart[heart_heatmap_columns]
corr = data.corr()

# The colour scale spans 0..1 only, and no colour bar is drawn.
ax1 = sns.heatmap(
    corr,
    vmin=0,
    vmax=1,
    cmap='Blues',
    square=True,
    linewidths=2,
    cbar=0,
)
mp.show()

In [None]:
# Build the profiling report for the heart-disease working copy.
report1 = pp.ProfileReport(df)

# Save the report in HTML format.
# Forward slashes replace the old ".\Report\Heart.html" literal, whose "\R" and
# "\H" were invalid escape sequences in a non-raw string and whose separators
# were Windows-only.
report1.to_file("./Report/Heart.html")

### Analysis of Diabetes Indicators dataset

In [None]:
# Preview the first 10 rows of the diabetes indicators dataset.
diab.head(10)

In [None]:
# Summarise the diabetes dataset: column names, dtypes and non-null counts.
diab.info()

In [None]:
# Work on a copy so that `diab` itself stays as loaded.
df1 = diab.copy()

In [None]:
# Pearson correlation of every indicator with the diabetes target.
# The target is read from df1, the same frame the features come from.
importances = df1.drop("diabetes", axis=1).apply(lambda x: x.corr(df1.diabetes))

# Positions that sort the coefficients in ascending order.
# Sorting the raw values and selecting with .iloc keeps the lookup explicitly
# positional; `importances[indices]` used integer keys on a string-labelled
# Series, which relies on a deprecated positional fallback.
indices = np.argsort(importances.to_numpy())
print(importances.iloc[indices])

In [None]:
# Compare the candidate diabetes predictors with each other, pair by pair.
diabetes_candidates = [
    "HighBP",
    "BMI",
    "HighChol",
    "Age",
    "Stroke",
    "CholCheck",
    "Smoker",
    "Sex",
]
df2 = df1[diabetes_candidates]
find_correlations(df2)

Selected features: BMI, Age, Stroke, CholCheck, Smoker and Sex.

In [None]:
# Correlation matrix of the diabetes target and the selected indicators.
diabetes_heatmap_columns = [
    "diabetes",
    "BMI",
    "Age",
    "Stroke",
    "CholCheck",
    "Smoker",
    "Sex",
]
data = df1[diabetes_heatmap_columns]
corr = data.corr()

In [None]:
# Draw the correlation matrix as a heatmap.
# The colour scale spans 0..1 only, and no colour bar is drawn.
ax1 = sns.heatmap(
    corr,
    vmin=0,
    vmax=1,
    cmap='Blues',
    square=True,
    linewidths=2,
    cbar=0,
)
mp.show()

In [None]:
# Build the profiling report for the diabetes working copy.
report1 = pp.ProfileReport(df1)

# Save the report in HTML format.
# Forward slashes replace the old ".\Report\Diabetes.html" literal, whose "\R"
# and "\D" were invalid escape sequences in a non-raw string and whose
# separators were Windows-only.
report1.to_file("./Report/Diabetes.html")

### Analysis of Cancer Indicators dataset

In [None]:
# Preview the first 10 rows of the cancer indicators dataset.
cancer.head(10)

In [None]:
# Summarise the cancer dataset: column names, dtypes and non-null counts.
cancer.info()

In [None]:
# Percentage of missing values per column of the cancer dataset.
row_count = len(cancer)
null_counts = cancer.isnull().sum()
percent_missing = null_counts * 100 / row_count

# Tabulate the result next to the column names for display.
missing_value_df = pd.DataFrame(
    {
        'column_name': cancer.columns,
        'percent_missing': percent_missing,
    }
)
missing_value_df

In [None]:
# Keep only the rows without any missing value; `cancer` itself is not modified.
cancer1 = cancer.dropna()

In [None]:
# Re-check dtypes and non-null counts after dropping incomplete rows.
cancer1.info()

In [None]:
# Encode the "Level" labels as integers: Low=0, Medium=1, High=2.
# The column is built in one pass and attached with assign(), which returns a
# new frame. Writing into the result of dropna() with .loc and column
# assignment triggers SettingWithCopyWarning.
level_codes = {"Low": 0, "Medium": 1, "High": 2}
cancer1 = cancer1.assign(
    Level=cancer1["Level"]
    # Values that are not one of the three labels pass through unchanged, so
    # re-running the cell on already-encoded data still works.
    .map(lambda label: level_codes.get(label, label))
    .astype(str)
    .astype(int)
)

In [47]:
# Pearson correlation of every indicator with the cancer target column "can".
importances = cancer1.drop("can", axis=1).apply(lambda x: x.corr(cancer1.can))

# Positions that sort the coefficients in ascending order.
# Sorting the raw values and selecting with .iloc keeps the lookup explicitly
# positional; `importances[indices]` used integer keys on a string-labelled
# Series, which relies on a deprecated positional fallback.
indices = np.argsort(importances.to_numpy())
print(importances.iloc[indices])

Age        -0.164629
EDUCA      -0.031161
Smoker     -0.003633
Gender      0.002178
BMI         0.013979
Region      0.015616
MARITAL     0.066030
Children    0.071847
Cancer      0.173176
dtype: float64


In [49]:
# Compare the candidate cancer predictors with each other, pair by pair.
cancer_candidates = ["Children", "MARITAL", "Region", "BMI", "Gender"]
sv = cancer1[cancer_candidates]
find_correlations(sv)

Children  is not correlated  with  MARITAL
Children  is not correlated  with  Region
Children  is not correlated  with  BMI
Children  is not correlated  with  Gender
MARITAL  is not correlated  with  Children
MARITAL  is not correlated  with  Region
MARITAL  is not correlated  with  BMI
MARITAL  is not correlated  with  Gender
Region  is not correlated  with  Children
Region  is not correlated  with  MARITAL
Region  is not correlated  with  BMI
Region  is not correlated  with  Gender
BMI  is not correlated  with  Children
BMI  is not correlated  with  MARITAL
BMI  is not correlated  with  Region
BMI  is not correlated  with  Gender
Gender  is not correlated  with  Children
Gender  is not correlated  with  MARITAL
Gender  is not correlated  with  Region
Gender  is not correlated  with  BMI
