In [47]:
import pandas as pd
import numpy as np
import statistics as stat
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

In [2]:
insurance_data = pd.read_csv("D:/Python Materials/insurance.csv")

In [3]:
insurance_data.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,Premium
0,19.0,female,27.9,0,yes,southwest,16884.924
1,18.0,male,33.77,1,no,southeast,1725.5523
2,28.0,male,33.0,3,no,southeast,4449.462
3,33.0,male,22.705,0,no,northwest,21984.47061
4,32.0,male,28.88,0,no,northwest,3866.8552


In [4]:
insurance_data.shape

(1338, 7)

In [5]:
insurance_data.isnull().sum() # This is a number of null values by column names 

age         12
gender       0
bmi         10
children     0
smoker       0
region       2
Premium      5
dtype: int64

In [6]:
insurance_data[insurance_data.isnull().any(axis=1)] # This are the rows with one of more null values

Unnamed: 0,age,gender,bmi,children,smoker,region,Premium
7,37.0,female,,3,occasionally,northwest,7281.5056
8,37.0,male,,2,occasionally,northeast,6406.4107
9,60.0,female,,0,occasionally,northwest,28923.13692
13,,female,39.82,0,occasionally,southeast,11090.7178
14,,male,42.13,0,occasionally,southeast,39611.7577
15,,male,24.6,1,occasionally,southwest,1837.237
16,,female,30.78,1,occasionally,northeast,10797.3362
17,,male,23.845,0,occasionally,northeast,2395.17155
18,,male,40.3,0,occasionally,southwest,10602.385
34,28.0,male,,1,occasionally,southwest,51194.55914


In [7]:
mean_age = np.mean(insurance_data['age'])
mean_bmi = np.mean(insurance_data['bmi'])
mode_gender = stat.mode(insurance_data['gender'])
mean_premium = np.mean(insurance_data['Premium'])

mean_age, mean_bmi, mode_gender, mean_premium

(39.22775263951735, 30.678795180722876, 'male', 13277.743308221308)

In [8]:
insurance_data = insurance_data.fillna(value={'age': mean_age,
                       'bmi': mean_bmi,
                       'gender': mode_gender,
                        'Premium': mean_premium}) # Filling the missing values with mean and mode

In [9]:
insurance_data

Unnamed: 0,age,gender,bmi,children,smoker,region,Premium
0,19.0,female,27.900,0,yes,southwest,16884.92400
1,18.0,male,33.770,1,no,southeast,1725.55230
2,28.0,male,33.000,3,no,southeast,4449.46200
3,33.0,male,22.705,0,no,northwest,21984.47061
4,32.0,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50.0,male,30.970,3,no,northwest,10600.54830
1334,18.0,female,31.920,0,no,northeast,2205.98080
1335,18.0,female,36.850,0,no,southeast,1629.83350
1336,21.0,female,25.800,0,no,southwest,2007.94500


In [10]:
insurance_data['Premium'].dtype

dtype('float64')

In [11]:
def check_outlier(insurance_data_param):
    outliers = {}
    col_list = insurance_data_param.columns.tolist()
    for col_name in col_list:
        if insurance_data_param[col_name].dtype != 'O': # Rejecting Object datatype
            Q1 = np.percentile(insurance_data_param[col_name],25)
            Q3 = np.percentile(insurance_data_param[col_name],75)
            IQR = Q3 - Q1
            lower_bound = Q1 - (1.5 * IQR)
            upper_bound = Q3 + (1.5 * IQR)
            outliers[col_name] = [lower_bound, upper_bound]
    return outliers

In [12]:
# This will set the lower and upper bound of columns, 
# Data points going beyond this bounds are outliers and we can replace that value with loower and upper bound 
check_outlier(insurance_data)  

{'age': [-9.0, 87.0],
 'bmi': [13.803125000000003, 47.168124999999996],
 'children': [-3.0, 5.0],
 'Premium': [-12997.197986250001, 34333.227843750006]}

In [13]:
insurance_data.corr() # This will show the correlation between all numeric columns - pearson corelation

Unnamed: 0,age,bmi,children,Premium
age,1.0,0.108295,0.045115,0.301975
bmi,0.108295,1.0,0.010533,0.196604
children,0.045115,0.010533,1.0,0.066826
Premium,0.301975,0.196604,0.066826,1.0


In [23]:
insurance_data['children'].dtype

dtype('int64')

In [26]:
insurance_data['children'] = insurance_data['children'].astype('O') # COnverting children column to Object datatype

In [27]:
insurance_data['children'].dtype

dtype('O')

In [49]:
insurance_data = pd.get_dummies(insurance_data)

In [50]:
feature = insurance_data.drop(['Premium'],axis=1)
target = insurance_data['Premium']

In [51]:
feature_train, feature_test, target_train, target_test = train_test_split(feature,target,test_size=0.3,random_state=42)

In [52]:
feature_train, feature_test, target_train, target_test

(       age     bmi  gender_female  gender_male  children_0  children_1  \
 332   61.0  31.160              1            0           1           0   
 355   46.0  27.600              0            1           1           0   
 138   54.0  31.900              1            0           0           0   
 381   55.0  30.685              0            1           1           0   
 292   25.0  45.540              0            1           0           0   
 ...    ...     ...            ...          ...         ...         ...   
 1095  18.0  31.350              1            0           0           0   
 1130  39.0  23.870              1            0           0           0   
 1294  58.0  25.175              0            1           1           0   
 860   37.0  47.600              1            0           0           0   
 1126  55.0  29.900              0            1           1           0   
 
       children_2  children_3  children_4  children_5  smoker_no  \
 332            0           0 

In [57]:
lm_model = sm.OLS(target_train, feature_train).fit()

In [58]:
lm_model

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2117756a988>

In [60]:
prediction = lm_model.predict(feature_test)

In [62]:
prediction.head()

764     10155.071210
887      6148.540544
890     36061.564863
1293     8642.194651
259      7963.972894
dtype: float64

In [63]:
residual = prediction - target_test

In [65]:
residual.head()

764      1060.002960
887       876.364744
890      6730.581713
1293     -659.698899
259    -25786.318906
dtype: float64

In [66]:
from sklearn.metrics import mean_squared_error

In [67]:
mse = mean_squared_error(prediction, target_test)

In [68]:
mse

68947487.90457097

In [69]:
import matplotlib.pyplot as plt
import seaborn as sns

In [70]:
sns

<module 'seaborn' from 'C:\\Users\\tarun.dabhi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\seaborn\\__init__.py'>