### Importing important Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

### Importing the Dataset

In [None]:
# Load the medical insurance premium data into a DataFrame.
# The path is relative, so the CSV must exist at this location for the cell to run.
df=pd.read_csv('../input/medical-insurance-premium-prediction/Medicalpremium.csv')

### Looking at the top 5 records

In [None]:
# Preview the first rows of the frame (DataFrame.head defaults to five rows).
df.head()

### Looking at the Shape of the Dataset

In [None]:
# Report the dataset dimensions: row count first, then column count.
print('Number of rows in the dataset', len(df))
print('Number of columns in the dataset', len(df.columns))

### Information of the Dataset.

In [None]:
# Print column names, non-null counts and dtypes as loaded from the CSV.
df.info()

Observation:

* I observe that there are 11 columns and 986 records
    
* The data type of every variable is integer, but **Diabetes**, **BloodPressureProblems**, **AnyTransplants**, **AnyChronicDiseases**, **KnownAllergies**, **HistoryOfCancerInFamily** and **NumberOfMajorSurgeries** are actually categorical variables, so they are treated as object variables

In [None]:
# Re-cast the flag/count columns to object dtype so that the rest of the
# notebook treats them as categorical rather than numeric.
object_columns = [
    'Diabetes',
    'BloodPressureProblems',
    'AnyTransplants',
    'AnyChronicDiseases',
    'KnownAllergies',
    'HistoryOfCancerInFamily',
    'NumberOfMajorSurgeries',
]
for column in object_columns:
    # Assign column by column so each one really ends up with object dtype.
    df[column] = df[column].astype('object')
        

In [None]:
# Re-inspect the schema to confirm the dtype conversion took effect.
df.info()

In [None]:
# We Observe that the data type has been updated.

### Checking for Null Values 

In [None]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

In [None]:
# We observe that there are no Null Values in the Dataset.

### Checking for Duplicate values

In [None]:
# Boolean mask marking rows that repeat an earlier row exactly.
dups=df.duplicated()
# Summing the mask counts the True entries, i.e. the number of duplicate rows.
print('Number of duplicate records',dups.sum())

In [None]:
# Separate the column names into object-typed (cat) and all other (num) columns,
# preserving the original column order within each list.
cat = [column for column in df.columns if df[column].dtype == 'object']
num = [column for column in df.columns if df[column].dtype != 'object']

In [None]:
# Display the numeric and object column-name lists as a tuple.
num,cat

#### Summary of Numeric Data

In [None]:
# Summary statistics for the numeric columns, transposed so each row is one column.
df[num].describe().T

#### Summary of Object Data

In [None]:
# Summary of the object columns, transposed so each row is one column.
df[cat].describe().T

### Looking at the Unique values.

In [None]:
# For every object column print its number of distinct values followed by
# the frequency of each value, with a separator line between columns.
for column in cat:
    values = df[column]
    print(column, values.nunique())
    print(values.value_counts())
    print('************************************************')

### Checking for Outliers

In [None]:
# For each numeric column draw a histogram (with KDE) next to a box plot so
# that skew and outliers can be inspected side by side.
for i in num:
    fig, axes = plt.subplots(nrows=1,ncols=2,figsize=(8,4))
    sns.histplot(df[i],kde=True,ax=axes[0],color='red')
    # Pass the series by keyword: the first positional argument of boxplot
    # means `x` in older seaborn releases and `data` in newer ones, so a
    # positional call changes orientation depending on the installed version.
    sns.boxplot(x=df[i],ax=axes[1],color='green')

### Univariate Analysis

In [None]:
# 2x2 grid of histograms, one panel per continuous variable.
f, ax = plt.subplots(2, 2, figsize=(15, 15))

# (column, number of bins, bar colour, panel title), in row-major panel order.
hist_specs = [
    ('Age', 20, 'green', 'Distribution of age among users'),
    ('PremiumPrice', 25, 'pink', 'Distribution of PremiumPrice of users'),
    ('Height', 25, 'red', 'Distribution of Height of users'),
    ('Weight', 25, 'violet', 'Distribution of Weight of users'),
]

for panel, (column, n_bins, colour, title) in zip(ax.ravel(), hist_specs):
    df[column].plot.hist(bins=n_bins, edgecolor='black', color=colour, ax=panel)
    panel.set_title(title)


In [None]:
# Print the max, min and rounded mean of Age, PremiumPrice and Height.
# Each label names the statistic that is actually printed next to it
# (max -> oldest/highest/tallest, min -> youngest/lowest/shortest).
print('Distribution of Age')
print('Oldest User was of:',round(df['Age'].max()),'Years')
print('Youngest User was of:',round(df['Age'].min()),'Years')
print('Average User age:',round(df['Age'].mean()),'Years')
print('**************************************************')

print('Distribution of Premium Price')
print('Highest Premium of a User was of:',round(df['PremiumPrice'].max()))
print('Lowest Premium of a User was of:',round(df['PremiumPrice'].min()))
print('Average User Premium Price:',round(df['PremiumPrice'].mean()))
print('**************************************************')

print('Distribution of Height')
print('Tallest User was of:',round(df['Height'].max()))
print('Shortest User was of:',round(df['Height'].min()))
print('Average Height of User:',round(df['Height'].mean()))
print('**************************************************')



In [None]:
# 2x3 grid of pie charts showing the share of each value per categorical column.
f, ax = plt.subplots(2, 3, figsize=(10, 10))

# The first five panels are two-valued flags; the second slice is pulled out slightly.
flag_columns = [
    'Diabetes',
    'AnyTransplants',
    'AnyChronicDiseases',
    'KnownAllergies',
    'HistoryOfCancerInFamily',
]
for panel, column in zip(ax.ravel(), flag_columns):
    df[column].value_counts().plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=panel)

# Last panel: the explode list has four entries, so this column is expected
# to have exactly four distinct values; no slice is offset.
df['NumberOfMajorSurgeries'].value_counts().plot.pie(
    explode=[0, 0, 0, 0], autopct='%1.1f%%', ax=ax[1][2]
)

### Multivariate Analysis

#### Relation between Age and Premium Price.

In [None]:
# Line plot of premium against age, followed by a scatter plot with a fitted
# regression line (lmplot draws on its own figure).
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the two-positional-argument form raises a TypeError there.
sns.lineplot(data=df, x='Age', y='PremiumPrice')
sns.lmplot(x='Age', y ='PremiumPrice',data=df)

In [None]:
# Recap the age range and the rounded mean age of the users.
age = df['Age']
print('Oldest User was of:', round(age.max()), 'Years')
print('Youngest User was of:', round(age.min()), 'Years')
print('Average User age:', round(age.mean()), 'Years')

#### Relation between Weight and Premium Price.

In [None]:
# Line plot of premium against weight, followed by a scatter plot with a
# fitted regression line (lmplot draws on its own figure).
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the two-positional-argument form raises a TypeError there.
sns.lineplot(data=df, x='Weight', y='PremiumPrice')
sns.lmplot(x='Weight', y ='PremiumPrice',data=df)

#### Relation between Height and Premium Price.

In [None]:
# Line plot of premium against height, followed by a scatter plot with a
# fitted regression line (lmplot draws on its own figure).
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the two-positional-argument form raises a TypeError there.
sns.lineplot(data=df, x='Height', y='PremiumPrice')
sns.lmplot(x='Height', y ='PremiumPrice',data=df)

#### Relation between Number Of Major Surgeries and Premium Price

In [None]:
# Left: bar plot of PremiumPrice per NumberOfMajorSurgeries value; right: box plot of the same split.
f, ax = plt.subplots(1, 2, figsize=(15, 5))
for draw, panel in zip((sns.barplot, sns.boxplot), ax):
    draw(data=df, x='NumberOfMajorSurgeries', y='PremiumPrice', ax=panel)

#### Relation between History Of Cancer In Family and Premium Price.

In [None]:
# Left: bar plot of PremiumPrice per HistoryOfCancerInFamily value; right: box plot of the same split.
f, ax = plt.subplots(1, 2, figsize=(15, 5))
for draw, panel in zip((sns.barplot, sns.boxplot), ax):
    draw(data=df, x='HistoryOfCancerInFamily', y='PremiumPrice', ax=panel)

#### Relation between Known Allergies and Premium Price.

In [None]:
# Left: bar plot of PremiumPrice per KnownAllergies value; right: box plot of the same split.
f, ax = plt.subplots(1, 2, figsize=(15, 5))
for draw, panel in zip((sns.barplot, sns.boxplot), ax):
    draw(data=df, x='KnownAllergies', y='PremiumPrice', ax=panel)

#### Relation between Any Chronic Diseases and Premium Price.

In [None]:
# Left: bar plot of PremiumPrice per AnyChronicDiseases value; right: box plot of the same split.
f, ax = plt.subplots(1, 2, figsize=(15, 5))
for draw, panel in zip((sns.barplot, sns.boxplot), ax):
    draw(data=df, x='AnyChronicDiseases', y='PremiumPrice', ax=panel)

#### Relation between Any Transplants and Premium Price.

In [None]:
# Left: bar plot of PremiumPrice per AnyTransplants value; right: box plot of the same split.
f, ax = plt.subplots(1, 2, figsize=(15, 5))
for draw, panel in zip((sns.barplot, sns.boxplot), ax):
    draw(data=df, x='AnyTransplants', y='PremiumPrice', ax=panel)

#### Relation between Diabetes and Premium Price.

In [None]:
# Left: bar plot of PremiumPrice per Diabetes value; right: box plot of the same split.
f, ax = plt.subplots(1, 2, figsize=(15, 5))
for draw, panel in zip((sns.barplot, sns.boxplot), ax):
    draw(data=df, x='Diabetes', y='PremiumPrice', ax=panel)

#### Relation between Blood Pressure Problems and Premium Price.

In [None]:
# Left: bar plot of PremiumPrice per BloodPressureProblems value; right: box plot of the same split.
f, ax = plt.subplots(1, 2, figsize=(15, 5))
for draw, panel in zip((sns.barplot, sns.boxplot), ax):
    draw(data=df, x='BloodPressureProblems', y='PremiumPrice', ax=panel)

#### Analyze the Premium Price by age according to the History Of Cancer In Family.

In [None]:
# Premium against age with a separate regression line per HistoryOfCancerInFamily value.
sns.lmplot(
    data=df,
    x='Age',
    y='PremiumPrice',
    hue='HistoryOfCancerInFamily',
    palette='Set1',
)

#### Analyze the Premium Price by age according to the Diabetes.

In [None]:
# Premium against age with a separate regression line per Diabetes value.
sns.lmplot(
    data=df,
    x='Age',
    y='PremiumPrice',
    hue='Diabetes',
    palette='Set2',
)

#### Analyze the Premium Price by age according to the Any Transplants.

In [None]:
# Premium against age with a separate regression line per AnyTransplants value.
sns.lmplot(
    data=df,
    x='Age',
    y='PremiumPrice',
    hue='AnyTransplants',
    palette='Set1',
)

#### Analyze the Premium Price by age according to Any Chronic Diseases.

In [None]:
# Premium against age with a separate regression line per AnyChronicDiseases value.
sns.lmplot(
    data=df,
    x='Age',
    y='PremiumPrice',
    hue='AnyChronicDiseases',
    palette='Set2',
)

#### Analyze the Premium Price by age according to the Any Known Allergies.

In [None]:
# Premium against age with a separate regression line per KnownAllergies value.
sns.lmplot(
    data=df,
    x='Age',
    y='PremiumPrice',
    hue='KnownAllergies',
    palette='Set1',
)

In [None]:
#### Analyze the Premium Price by age according to the Number Of Major Surgeries.

In [None]:
# Premium against age with a separate regression line per NumberOfMajorSurgeries value
# (default colour palette).
sns.lmplot(
    data=df,
    x='Age',
    y='PremiumPrice',
    hue='NumberOfMajorSurgeries',
)

#### Analyze the Premium Price by age according to the Blood Pressure Problems.

In [None]:
# Premium against age with a separate regression line per BloodPressureProblems value.
sns.lmplot(
    data=df,
    x='Age',
    y='PremiumPrice',
    hue='BloodPressureProblems',
    palette='Set1',
)

### Pair Plot and Correlation Heatmap.

In [None]:
# Pairwise relationships between variables; corner=True draws only the lower triangle.
sns.pairplot(data=df, corner=True)

In [None]:
# Annotated correlation heatmap of the numeric columns.
# The numeric columns are selected explicitly (via the `num` list built earlier)
# because DataFrame.corr's handling of non-numeric columns differs between
# pandas versions: object columns were silently dropped in 1.x but are no
# longer dropped by default in 2.x.
sns.heatmap(df[num].corr(),annot=True,cmap='cool')

In [None]:
# List the column names for reference when summarising the findings.
df.columns

### Conclusion

Based on the above observations, the factors that play an important role in determining the premium of the user are:
    
    1.  Age
    2.  AnyTransplants
    3.  AnyChronicDiseases
    4.  Weight
    5.  HistoryOfCancerInFamily  
    6.  NumberOfMajorSurgeries
    7.  Height
   
We also Observe that below factors have insignificant or no Impact on the Premium Price:

     1. KnownAllergies
     2. BloodPressureProblems
     3. Diabetes
     

### We further try to identify the important features using machine learning; in this case we use a Random Forest.

### Dependent and Independent Features

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns='PremiumPrice')
# Target vector: the premium to be predicted.
y = df['PremiumPrice']

### Normalization

Standardization rescales each input variable separately to zero mean and unit variance (`StandardScaler` does not map values to the range 0-1).

In [None]:
# Standardise the three continuous predictors one column at a time, re-fitting
# the same scaler object for each column (it ends up fitted on Weight).
# NOTE(review): the scaler is fitted on all rows before the train/test split;
# confirm this is acceptable if a scale-sensitive estimator is ever used.
scalar = StandardScaler()
for column in ('Age', 'Height', 'Weight'):
    X[column] = scalar.fit_transform(X[[column]])

### Train Test Split

In [None]:
# Hold out 25% of the rows for testing (75:25 split); the fixed seed keeps
# the partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=43,
)

# Modeling Using Random Forest Regressor

In [None]:

# A random forest is a meta estimator that fits a number of decision tree
# regressors on various sub-samples of the dataset and averages their
# predictions to improve the predictive accuracy and control over-fitting.
#
# random_state is pinned so that the fitted forest, its scores and its feature
# importances are reproducible across runs (same seed as the train/test split).
Lr=RandomForestRegressor(n_estimators= 50,max_depth=5,random_state=43)
Lr_model=Lr.fit(X_train,y_train)

In [None]:
# For a regressor, .score() returns the coefficient of determination (R^2),
# not a classification accuracy, so the labels say R^2.
print('R^2 score of the model on training Dataset ',Lr_model.score(X_train,y_train))
print ('R^2 score of the model on Test Dataset ',Lr_model.score(X_test,y_test))

### Finding Important Features

In [None]:
# Horizontal bar chart of the importance score the fitted forest assigns to
# each feature, in the same order as the columns of X.
feature_imp = Lr_model.feature_importances_
importance_ax = sns.barplot(x=feature_imp, y=X.columns)
importance_ax.set_xlabel('Importance Score')
importance_ax.set_ylabel('Features')
importance_ax.set_title("Important Features")
plt.show()

### We Perform Random Forest and further identify Important factors that determine Premium Prices.

As we can see in the above plot we observe that.

* Age is the most important factor, followed by AnyTransplants, Weight and others.
* KnownAllergies, Diabetes and BloodPressureProblems are the least important factors in determining the Premium Price.
* We could get better and more convincing results if we had more data for our analysis.
