In [1]:
import pandas as pd
from sklearn import linear_model
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the insurance dataset from disk and preview its first rows
df = pd.read_csv(filepath_or_buffer="data/05-insurance.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# One-hot encode the `smoker` column into `smoker_no` / `smoker_yes` indicators.
# `dtype=int` pins the indicators to 0/1 integers: pandas >= 2.0 would otherwise
# emit booleans (True/False), which no longer matches the 0/1 preview below.
df_with_dummy = pd.get_dummies(df, columns=["smoker"], dtype=int)
df_with_dummy.head()

Unnamed: 0,age,sex,bmi,children,region,charges,smoker_no,smoker_yes
0,19,female,27.9,0,southwest,16884.924,0,1
1,18,male,33.77,1,southeast,1725.5523,1,0
2,28,male,33.0,3,southeast,4449.462,1,0
3,33,male,22.705,0,northwest,21984.47061,1,0
4,32,male,28.88,0,northwest,3866.8552,1,0


In [4]:
# Data preparation

# Features: age, BMI and the two one-hot smoker indicators.
# NOTE(review): smoker_no and smoker_yes appear to be exact complements, so one
# of them is redundant for the model -- confirm before dropping either.
x_class = df_with_dummy[['age', 'bmi', 'smoker_no', 'smoker_yes']]

# Target: True when a row's charges are at or below the median charge.
# The comparison is vectorised over the whole column; the result is kept as a
# plain list so later cells can keep comparing it element-wise with predictions.
med = df["charges"].median()
y_class = (df["charges"] <= med).tolist()

In [5]:
# Instantiate a logistic-regression classifier, then fit it on the prepared data
regr = linear_model.LogisticRegression()
regr.fit(X=x_class, y=y_class)

LogisticRegression()

In [6]:
# Predict a label for every row of the feature set the model was fitted on
predictions = regr.predict(x_class)

# Element-wise check of each prediction against its true label
correctPredictions = predictions == y_class

# Fraction of correct predictions, i.e. the accuracy (not the error).
# It is measured on the same rows used for fitting, so it is a training-set score.
correctPredictions.mean()

0.905829596412556

In [7]:
# Encode the `smoker` labels as integers with LabelEncoder
# (the preview below shows yes -> 1 and no -> 0).
# No intermediate `category` cast is needed: the column is replaced wholesale
# by the encoder output, and fit_transform learns and applies the mapping in one step.
label = LabelEncoder()
df['smoker'] = label.fit_transform(df['smoker'])
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,1,southwest,16884.924
1,18,male,33.77,1,0,southeast,1725.5523
2,28,male,33.0,3,0,southeast,4449.462
3,33,male,22.705,0,0,northwest,21984.47061
4,32,male,28.88,0,0,northwest,3866.8552


In [8]:
# Data preparation: use age, BMI and the single `smoker` column as features
x_class = df.loc[:, ['age', 'bmi', 'smoker']]

# Build a fresh logistic-regression classifier and fit it on this feature set
regr = linear_model.LogisticRegression()
regr.fit(X=x_class, y=y_class)

LogisticRegression()

In [9]:
# Predict a label for every row of the feature set the model was fitted on
predictions = regr.predict(x_class)

# Element-wise check of each prediction against its true label
correctPredictions = predictions == y_class

# Fraction of correct predictions, i.e. the accuracy (not the error).
# It is measured on the same rows used for fitting, so it is a training-set score.
correctPredictions.mean()

0.905829596412556