In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


In [17]:
df = pd.read_csv('insurance.csv', delimiter=',')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [18]:
# Convert categorical data to numerical data

sexs = []
smokers = []

for sex, smoker in zip(df['sex'], df['smoker']):
    sexs.append(1 if sex == 'male' else 0)
    smokers.append(1 if smoker == 'yes' else 0)

df['sex'] = sexs
df['smoker'] = smokers

df = df.drop(['region', 'children'], axis=1)

df.head()

Unnamed: 0,age,sex,bmi,smoker,charges
0,19,0,27.9,1,16884.924
1,18,1,33.77,0,1725.5523
2,28,1,33.0,0,4449.462
3,33,1,22.705,0,21984.47061
4,32,1,28.88,0,3866.8552


In [24]:
inputs = df.drop(['smoker'], axis=1)
outputs = df['smoker']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(inputs, outputs, test_size=0.333, random_state=42)

In [25]:
model = LogisticRegression(penalty=None).fit(X_train, y_train)

print('Training score: {}'.format(model.score(X_train, y_train)))
print('Testing score: {}'.format(model.score(X_test, y_test)))

Training score: 0.945067264573991
Testing score: 0.9282511210762332


In [40]:
# Print coefficients
import math

print("Weights of ")
for i, col in enumerate(inputs.columns):
    print(f"{col}: {float(model.coef_[0][i])}")

Weights of 
age: -0.07745046881613492
sex: -0.0012833319104187954
bmi: -0.1555075045085065
charges: 0.0003444564721044799


In [46]:
# Testing out with custom data

# 1. A 19 year old male with a BMI of 27.9 and charges of $16884.92

print('Predicted smoker: {}'.format(model.predict([[80, 30, 0, 32100.92]])))

Predicted smoker: [1]




In [52]:
# Predict Charge

from sklearn.linear_model import LinearRegression

X = df.drop(['charges'], axis=1)
y = df['charges']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.333, random_state=42)

model = LinearRegression().fit(X_train, y_train)

print('Training score: {}'.format(model.score(X_train, y_train)))
print('Testing score: {}'.format(model.score(X_test, y_test)))

Training score: 0.7408936836331587
Testing score: 0.7597566523705576


In [53]:
print("Weights of ")

for i, col in enumerate(X.columns):
    print(f"{col}: {float(model.coef_[i])}")


Weights of 
age: 264.1599568875083
sex: 164.12574666385342
bmi: 330.2419793306889
smoker: 23661.170808102466


In [54]:
person = [[19, 27.9, 0, 1]]

print('Predicted charge: {}'.format(model.predict(person)))

Predicted charge: [21127.98525882]


