In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv(r'claims_data.csv')
df.head()

Unnamed: 0,age,sex,bmi,steps,children,smoker,region,insurance_claim,claim_amount
0,19,female,27.9,3009,0,yes,southwest,yes,16884.924
1,18,male,33.77,3008,1,no,southeast,yes,1725.5523
2,28,male,33.0,3009,3,no,southeast,no,0.0
3,33,male,22.705,10009,0,no,northwest,no,0.0
4,32,male,28.88,8010,0,no,northwest,yes,3866.8552


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 9 columns):
age                1338 non-null int64
sex                1338 non-null object
bmi                1338 non-null float64
steps              1338 non-null int64
children           1338 non-null int64
smoker             1338 non-null object
region             1338 non-null object
insurance_claim    1338 non-null object
claim_amount       1338 non-null float64
dtypes: float64(2), int64(3), object(4)
memory usage: 94.2+ KB


In [4]:
#creating targets
df['Claimed'] = np.where(df['insurance_claim'] == 'yes', 1, 0) 

In [5]:
df = df.drop(['insurance_claim', 'claim_amount'], axis = 1)

In [6]:
df.head()

Unnamed: 0,age,sex,bmi,steps,children,smoker,region,Claimed
0,19,female,27.9,3009,0,yes,southwest,1
1,18,male,33.77,3008,1,no,southeast,1
2,28,male,33.0,3009,3,no,southeast,0
3,33,male,22.705,10009,0,no,northwest,0
4,32,male,28.88,8010,0,no,northwest,1


In [7]:
df = pd.get_dummies(df)

In [8]:
df.head()

Unnamed: 0,age,bmi,steps,children,Claimed,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,3009,0,1,1,0,0,1,0,0,0,1
1,18,33.77,3008,1,1,0,1,1,0,0,0,1,0
2,28,33.0,3009,3,0,0,1,1,0,0,0,1,0
3,33,22.705,10009,0,0,0,1,1,0,0,1,0,0
4,32,28.88,8010,0,1,0,1,1,0,0,1,0,0


In [9]:
df.columns

Index(['age', 'bmi', 'steps', 'children', 'Claimed', 'sex_female', 'sex_male',
       'smoker_no', 'smoker_yes', 'region_northeast', 'region_northwest',
       'region_southeast', 'region_southwest'],
      dtype='object')

In [10]:
df = df[['age', 'bmi','children', 'steps','sex_female', 'sex_male', 'smoker_no', 'smoker_yes', 'region_northeast','region_northwest', 'region_southeast', 'region_southwest', 'Claimed']]
df.head()

Unnamed: 0,age,bmi,children,steps,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest,Claimed
0,19,27.9,0,3009,1,0,0,1,0,0,0,1,1
1,18,33.77,1,3008,0,1,1,0,0,0,1,0,1
2,28,33.0,3,3009,0,1,1,0,0,0,1,0,0
3,33,22.705,0,10009,0,1,1,0,0,1,0,0,0
4,32,28.88,0,8010,0,1,1,0,0,1,0,0,1


In [11]:
df.columns

Index(['age', 'bmi', 'children', 'steps', 'sex_female', 'sex_male',
       'smoker_no', 'smoker_yes', 'region_northeast', 'region_northwest',
       'region_southeast', 'region_southwest', 'Claimed'],
      dtype='object')

In [12]:
y = df['Claimed']
X = df[['age', 'bmi', 'children', 'steps', 'sex_female', 'sex_male',
       'smoker_no', 'smoker_yes', 'region_northeast', 'region_northwest',
       'region_southeast', 'region_southwest']]

In [13]:
y.sum()/df.shape[0]

0.5852017937219731

# Standardizing the Data

In [14]:
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

class CustomScaler(BaseEstimator,TransformerMixin): 
    
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        self.scaler = StandardScaler(copy,with_mean,with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.array(np.mean(X[self.columns]))
        self.var_ = np.array(np.var(X[self.columns]))
        return self

    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
    

In [15]:
columns_to_scale = ['age','children', 'bmi', 'steps']

In [16]:
Claim_scaler = CustomScaler(columns_to_scale)
Claim_scaler

CustomScaler(columns=['age', 'children', 'bmi', 'steps'], copy=None,
       with_mean=None, with_std=None)

In [17]:
Claim_scaler.fit(X)

  return self.partial_fit(X, y)


CustomScaler(columns=['age', 'children', 'bmi', 'steps'], copy=None,
       with_mean=None, with_std=None)

In [18]:
X = Claim_scaler.transform(X)



In [19]:
X.head()

Unnamed: 0,age,bmi,children,steps,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,-1.438764,-0.45332,-0.908614,-0.945733,1,0,0,1,0,0,0,1
1,-1.509965,0.509621,-0.078767,-0.94614,0,1,1,0,0,0,1,0
2,-0.797954,0.383307,1.580926,-0.945733,0,1,1,0,0,0,1,0
3,-0.441948,-1.305531,-0.908614,1.908235,0,1,1,0,0,1,0,0
4,-0.513149,-0.292556,-0.908614,1.093223,0,1,1,0,0,1,0,0


# SPLIT THE DATA

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100)

# Train the model

In [22]:
from sklearn.linear_model import LogisticRegression

In [23]:
model = LogisticRegression()

In [24]:
model.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [25]:
model.score(X_train, y_train)

0.874766355140187

In [26]:
model.intercept_

array([0.84295639])

In [27]:
model.coef_

array([[ 0.43603612,  1.5243879 , -1.62295079,  0.12645866,  0.41103758,
         0.43191881, -1.54082704,  2.38378343,  0.49076107,  0.07460384,
         0.07575606,  0.20183543]])

In [28]:
X.columns

Index(['age', 'bmi', 'children', 'steps', 'sex_female', 'sex_male',
       'smoker_no', 'smoker_yes', 'region_northeast', 'region_northwest',
       'region_southeast', 'region_southwest'],
      dtype='object')

In [29]:
features = ['age', 'bmi', 'children', 'steps', 'sex_female', 'sex_male',
       'smoker_no', 'smoker_yes', 'region_northeast', 'region_northwest',
       'region_southeast', 'region_southwest']
summary_table = pd.DataFrame(columns = ['features'], data = features)
summary_table['Coefficient'] = np.transpose(model.coef_)
summary_table['Odds Ratio'] = np.exp(summary_table['Coefficient'])
summary_table.sort_values('Odds Ratio', ascending = False)

Unnamed: 0,features,Coefficient,Odds Ratio
7,smoker_yes,2.383783,10.84586
1,bmi,1.524388,4.592332
8,region_northeast,0.490761,1.633559
0,age,0.436036,1.546565
5,sex_male,0.431919,1.54021
4,sex_female,0.411038,1.508382
11,region_southwest,0.201835,1.223647
3,steps,0.126459,1.134803
10,region_southeast,0.075756,1.078699
9,region_northwest,0.074604,1.077457


# TEST THE MODEL

In [30]:
model.score(X_test, y_test)

0.8992537313432836

In [31]:
Claim = pd.DataFrame(model.predict(X_test))

In [32]:
predictions = pd.DataFrame(model.predict_proba(X_test)[:,1])

In [33]:
unscaled_X = df[['age', 'bmi', 'children', 'steps', 'sex_female', 'sex_male',
       'smoker_no', 'smoker_yes', 'region_northeast', 'region_northwest',
       'region_southeast', 'region_southwest']]

In [34]:
predicted_outputs = pd.merge(unscaled_X, predictions, left_index = True, right_index = True)

In [35]:
predicted_outputs = pd.merge(predicted_outputs, Claim, left_index = True, right_index = True)
predicted_outputs.head()

Unnamed: 0,age,bmi,children,steps,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest,0_x,0_y
0,19,27.9,0,3009,1,0,0,1,0,0,0,1,0.8484,1
1,18,33.77,1,3008,0,1,1,0,0,0,1,0,0.090733,0
2,28,33.0,3,3009,0,1,1,0,0,0,1,0,0.688797,1
3,33,22.705,0,10009,0,1,1,0,0,1,0,0,0.689625,1
4,32,28.88,0,8010,0,1,1,0,0,1,0,0,0.971388,1


In [36]:
predicted_outputs.rename(columns = {'0_x':'Predictions', '0_y':'Claim'},inplace = True)

In [37]:
predicted_outputs.to_excel('Predictions.xlsx')

In [42]:
predicted_outputs.to_csv('predicted.csv')

In [41]:
predicted_outputs.head()

Unnamed: 0,age,bmi,children,steps,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest,Predictions,Claim
0,19,27.9,0,3009,1,0,0,1,0,0,0,1,0.8484,1
1,18,33.77,1,3008,0,1,1,0,0,0,1,0,0.090733,0
2,28,33.0,3,3009,0,1,1,0,0,0,1,0,0.688797,1
3,33,22.705,0,10009,0,1,1,0,0,1,0,0,0.689625,1
4,32,28.88,0,8010,0,1,1,0,0,1,0,0,0.971388,1


# SAVE THE MODEL

In [43]:
len(X.columns)

12

In [38]:
import pickle 

In [39]:
with open('model', 'wb') as file:
    pickle.dump(model, file)

In [40]:
with open('Claim_scaler', 'wb') as file:
    pickle.dump(Claim_scaler, file)