In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file below the Kaggle input directory, then print one path per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import category_encoders as ce   #for importing the encoder


print('import ran')

In [4]:
# Read the insurance dataset from the Kaggle input directory and show it as the cell output.
insurance_csv_path = '../input/insurance/insurance.csv'
df = pd.read_csv(insurance_csv_path)
df

In [5]:
### show the column index, then how often each value occurs in every column
col_names = df.columns
print(col_names,'\n#########################')
for column_name in col_names:
    print("Column : ",column_name)
    value_frequencies = df[column_name].value_counts()
    display(value_frequencies)
    print("#######################")

In [6]:
### report how many distinct values each column of the dataset holds
columns = df.columns
for column_name in columns:
    print("Column : ",column_name)
    distinct_values = df[column_name].unique()
    display(len(distinct_values))
    print("#######################")

In [7]:
### count the missing entries in each column (isna is the same check as isnull)
df.isna().sum()

In [8]:
### identifying categorical variables
# Columns stored with the generic object dtype are treated as categorical.
cat_vars = [var for var in df.columns if df[var].dtypes == 'O']
print(cat_vars)
for i in cat_vars:
  print(i)
  print(df[i].unique())
  print('---------------------------------------------------')

### encoding the categorical variables
# The two binary columns become explicit integer codes, written back into df.
# The lookup passes values it does not know through unchanged, so running this
# cell again (when the columns already hold 0/1) leaves them as they are.
# astype('int64') pins the dtype instead of relying on the silent object->int
# downcast of Series.replace, which pandas has deprecated; it raises if a value
# outside the mapping is present.
SEX_CODES = {'male': 1, 'female': 0}
SMOKER_CODES = {'yes': 1, 'no': 0}
df['sex'] = df['sex'].map(lambda value: SEX_CODES.get(value, value)).astype('int64')
df['smoker'] = df['smoker'].map(lambda value: SMOKER_CODES.get(value, value)).astype('int64')

# Create the one-hot encoder for the region column only; use_cat_names=True
# asks the encoder to name the new columns after the category values.
OH_encoder = ce.OneHotEncoder(
    cols=['region'],
    handle_unknown='return_nan',
    return_df=True,
    use_cat_names=True,
)

# encode dataset
data_encoded = OH_encoder.fit_transform(df)
data_encoded

# <h1>Now we will visualize the data to gain insights</h1>

In [11]:
# Annotated heatmap of the pairwise correlations between all columns of the encoded dataset
import matplotlib.pyplot as plt
import seaborn as sns

corr = data_encoded.corr()
heatmap_palette = sns.diverging_palette(220, 20, as_cmap=True)
plt.subplots(figsize=(10,10))
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    annot=True,
    cmap=heatmap_palette,
)

In [12]:
### line chart and bar charts of individual columns against charges

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec


### age against charges as a line graph, with one x tick per distinct age
xlabs = data_encoded['age'].unique()
sns.relplot(data=data_encoded, x='age', y='charges', kind='line', height=4.5, aspect=5)
plt.xlabel('age', fontsize=14)
plt.ylabel('charges', fontsize=14)
plt.title('how much increase and decrease is noted with the increment of age', fontsize=16)
plt.xticks(xlabs)

### one bar plot of charges for each of the listed columns
bar_x = ['sex','children','smoker']
for category in bar_x:
    sns.catplot(data=data_encoded, x=category, y='charges', kind='bar')
    plt.xlabel(category)
    plt.ylabel('Charges')


### bar plot of charges per region, then the same plot split by smoker
region_bar_kwargs = dict(data=df, x='region', y='charges', kind='bar')
sns.catplot(**region_bar_kwargs)
sns.catplot(hue='smoker', **region_bar_kwargs)
plt.xlabel('regions')
plt.ylabel('Charges')


plt.show()

In [13]:
### Reading the correlation heat map: charges are highly positively correlated with
### smoker, and bmi and age are positively correlated with charges as well.
### Here: bmi against charges, coloured by smoker.

bmi_scatter_kwargs = dict(x='bmi', y='charges', hue='smoker', height=5, aspect=1.5)
sns.relplot(data=data_encoded, **bmi_scatter_kwargs)
plt.ylabel('Charges')
plt.xlabel('BMI')
plt.title('BMI vs Charges Based on Smoking Habit', fontsize=16)


In [14]:
# Regression plot of charges on bmi: one column of panels per sex, coloured by smoker.
sns.lmplot(
    data=data_encoded,
    x='bmi',
    y='charges',
    hue='smoker',
    col='sex',
)
# plt.title('BMI vs Charges Based on Smoking Habit of different genders', fontsize=16)

In [15]:
# Same regression plot drawn from df, with one row of panels per region and one column per sex.
sns.lmplot(
    data=df,
    x='bmi',
    y='charges',
    hue='smoker',
    col='sex',
    row='region',
)
# plt.title('BMI vs Charges Based on Smoking Habit of different genders residing in different regions', fontsize=16)

<h1>Now that we have completed the preliminary investigation, we will try out different regression models to predict charges</h1>

<h1>We will normalize the data so that the values of the bmi and age columns lie in the range 0 to 1</h1>

In [299]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler


# Scale on a copy so data_encoded keeps its original values.
scaled_columns = ['bmi','age']
df_normalized = data_encoded.copy()

scaler = MinMaxScaler()
# scaler = StandardScaler()   # alternative scaler, left disabled
# NOTE(review): the scaler is fitted on every row here, before any train/test
# split is made in the later cells.
df_normalized[scaled_columns] = scaler.fit_transform(df_normalized[scaled_columns])
# df_normalized[['bmi']] = scaler.fit_transform(df_normalized[['bmi']])
df_normalized.head(3)

<h2>Linear Regression Model</h2>
<h3>Train/test split and selection of predictors and target</h3>

In [300]:
### simple linear regression: charges explained by the smoker column alone

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Both the single feature and the target are reshaped into one-column 2-D arrays.
lin_x = df_normalized['smoker'].to_numpy().reshape(-1, 1)
lin_y = df_normalized['charges'].to_numpy().reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(
    lin_x,
    lin_y,
    test_size=0.25,
    random_state=42,
)

regr = LinearRegression().fit(X_train, y_train)

print('intercept and coefficient')
for fitted_parameter in (regr.intercept_, regr.coef_):
    print(fitted_parameter)

y_pred = regr.predict(X_test)

### model evaluation on the held-out rows
print('model reports')
print(regr.score(X_test, y_test))

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Mean absolute error: {mae:.2f}')
print(f'Mean squared error: {mse:.2f}')
print(f'Root mean squared error: {rmse:.2f}')


In [306]:
### train/test split on four predictors, reused by the model cells that read X_train/X_test
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

feature_columns = ['age','sex','bmi','smoker']
predictors = df_normalized[feature_columns]
target = df_normalized['charges']

# shuffle=False keeps the rows in their original order when splitting.
X_train, X_test, y_train, y_test = train_test_split(
    predictors,
    target,
    test_size=0.2,
    random_state=42,
    shuffle=False,
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# One plot call per subset: ".b" draws blue point markers, ".g" green point markers.
for features, labels, marker_style, subset_name in (
    (X_train, y_train, ".b", "train"),
    (X_test, y_test, ".g", "test"),
):
    plt.plot(features, labels, marker_style, label=subset_name, markersize=12)
plt.legend()
plt.show()

In [302]:
### multivariate linear regression on the current train/test split

regressor = LinearRegression().fit(X_train, y_train)

print(regressor.intercept_, regressor.coef_)

y_pred = regressor.predict(X_test)
# Actual and predicted charges side by side (built but not displayed).
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

### evaluation
train_score = regressor.score(X_train, y_train)
test_score = regressor.score(X_test, y_test)
print('test score:',test_score)
print('train_score:',train_score)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Mean absolute error: {mae:.2f}')
print(f'Mean squared error: {mse:.2f}')
print(f'Root mean squared error: {rmse:.2f}')

In [303]:
#### Ridge regression (alpha=1.0) on the current train/test split
from sklearn.linear_model import Ridge

ridge_model = Ridge(alpha=1.0).fit(X_train, y_train)
y_pred = ridge_model.predict(X_test)

# Score on both subsets, then report test before train.
train_score = ridge_model.score(X_train, y_train)
test_score = ridge_model.score(X_test, y_test)
print('test score:',test_score)
print('train_score:',train_score)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Mean absolute error: {mae:.2f}')
print(f'Mean squared error: {mse:.2f}')
print(f'Root mean squared error: {rmse:.2f}')


In [304]:
### lasso regression (alpha=1.0) on the current train/test split
from sklearn.linear_model import Lasso

lasso_model = Lasso(alpha=1.0).fit(X_train, y_train)
y_pred = lasso_model.predict(X_test)

# Score on both subsets, then report test before train.
train_score = lasso_model.score(X_train, y_train)
test_score = lasso_model.score(X_test, y_test)
print('test score:',test_score)
print('train_score:',train_score)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Mean absolute error: {mae:.2f}')
print(f'Mean squared error: {mse:.2f}')
print(f'Root mean squared error: {rmse:.2f}')


In [313]:
### KNN Model
from sklearn.neighbors import KNeighborsRegressor

# Accumulators filled by neighbor_algo: neighbour counts and their test/train scores.
n_lst, test_scores_lst, train_scores_lst = [], [], []

def neighbor_algo(n, min_test_score=0.81):
    """Fit a k-nearest-neighbours regressor and record it if it scores well enough.

    The model is trained on the notebook-level ``X_train``/``y_train`` and
    scored on ``X_test``/``y_test``. When the test score reaches
    ``min_test_score``, the neighbour count and both scores are appended to the
    notebook-level lists ``n_lst``, ``test_scores_lst`` and
    ``train_scores_lst`` and a report is printed. Otherwise nothing is
    recorded or printed.

    Parameters
    ----------
    n : int
        Number of neighbours passed to ``KNeighborsRegressor``.
    min_test_score : float, default 0.81
        Minimum test score a model needs in order to be recorded and reported.

    Returns
    -------
    float
        The test-set score of the fitted model, whether or not it was recorded.
    """
    neigh_model = KNeighborsRegressor(n_neighbors=n)
    neigh_model.fit(X_train, y_train)

    test_score = neigh_model.score(X_test, y_test)
    if test_score < min_test_score:
        # Below the bar: skip the prediction and the report.
        return test_score

    train_score = neigh_model.score(X_train, y_train)

    # Predictions are only needed for the error metrics of a recorded model.
    y_pred = neigh_model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    n_lst.append(n)
    test_scores_lst.append(test_score)
    train_scores_lst.append(train_score)
    print('the n selected is:',n)
    print('test score:',test_score)
    print('train_score:',train_score)
    print(f'Mean absolute error: {mae:.2f}')
    print(f'Mean squared error: {mse:.2f}')
    print(f'Root mean squared error: {rmse:.2f}')
    print('----------------------------------------------------------------------------------------------------')
    return test_score
    
# Try every neighbour count from 1 to 20; neighbor_algo fills the result lists.
for neighbor_count in range(1,21):
    neighbor_algo(neighbor_count)

# One row per recorded neighbour count, then a grouped bar chart of both scores.
knn_records = list(zip(n_lst, test_scores_lst, train_scores_lst))
knn_df = pd.DataFrame(knn_records, columns=['Neighbors', 'Test Score', 'Train Score'])
knn_df.plot(
    x='Neighbors',
    y=['Test Score', 'Train Score'],
    kind='bar',
    figsize=(10,7),
    ylim=(0.8,0.9),
)

In [307]:
### polynomial (degree 2) regression
from sklearn.preprocessing import PolynomialFeatures

# Expand the predictors into degree-2 polynomial features, then split the
# expanded matrix with the same unshuffled 80/20 settings as the other models.
poly_reg=PolynomialFeatures(degree=2)
X_poly=poly_reg.fit_transform(predictors)

pX_train, pX_test, py_train, py_test = train_test_split(X_poly,target, test_size=0.20, random_state=42, shuffle=False)
print(pX_train.shape, pX_test.shape, py_train.shape, py_test.shape)
plt.plot(pX_train,py_train,".b",label="train",markersize=12) # ".b" means plot individual markers (".") in blue ("b")
plt.plot(pX_test,py_test,".g",label="test",markersize=12) # ".g" means plot individual markers (".") in green ("g")
# plt.legend()
plt.show()

poly_reg_model = LinearRegression()
poly_reg_model.fit(pX_train, py_train)

poly_reg_y_predicted = poly_reg_model.predict(pX_test)

df_preds = pd.DataFrame({'Actual': py_test.squeeze(), 'Predicted': poly_reg_y_predicted.squeeze()})
print(df_preds)

# Scores come from the fitted model (not from the prediction array) and use the
# polynomial feature matrices, so this cell reports the same two scores as the
# other model cells.
test_score = poly_reg_model.score(pX_test, py_test)
print('test score:',test_score)
train_score = poly_reg_model.score(pX_train, py_train)
print('train_score:',train_score)

mae = mean_absolute_error(py_test, poly_reg_y_predicted)
mse = mean_squared_error(py_test, poly_reg_y_predicted)
rmse = np.sqrt(mse)

print(f'Mean absolute error: {mae:.2f}')
print(f'Mean squared error: {mse:.2f}')
print(f'Root mean squared error: {rmse:.2f}')
print('----------------------------------------------------------------------------------------------------')

In [309]:
### decision tree
from sklearn.tree import DecisionTreeRegressor

# Rebuild the predictors, target and unshuffled 80/20 split here, so this cell
# does not depend on which earlier cell last assigned X_train/X_test/y_train/y_test.
predictors = df_normalized[['age','sex','bmi','smoker']]
print(predictors.head(3))

target = df_normalized['charges']
X_train, X_test, y_train, y_test = train_test_split(predictors,target, test_size=0.20, random_state=42, shuffle=False)

DT_model = DecisionTreeRegressor(random_state=42)
DT_model.fit(X_train, y_train)
predictions = DT_model.predict(X_test)

test_score = DT_model.score(X_test, y_test)
print('test score:',test_score)
train_score = DT_model.score(X_train, y_train)
print('train_score:',train_score)

# The error metrics must use this tree's own predictions. The global `y_pred`
# still holds the output of an earlier model cell, so using it would report
# that model's errors instead.
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)

print(f'Mean absolute error: {mae:.2f}')
print(f'Mean squared error: {mse:.2f}')
print(f'Root mean squared error: {rmse:.2f}')


In [310]:
### draw the fitted decision tree
from sklearn.tree import plot_tree

plt.figure(figsize=(10,8), dpi=150)
# feature_names is given as a plain list of column names rather than a pandas
# Index, so the call does not depend on the library accepting an Index there.
# The trailing semicolon suppresses the returned value in the cell output.
plot_tree(DT_model, feature_names=list(predictors.columns));