# Loading Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline

# Importing data

In [2]:
# Load the insurance dataset and preview the first rows.
# A forward slash is used as the path separator because it works on every OS;
# a backslash inside a normal string literal is an escape character (here '\i',
# an invalid escape that newer Python versions warn about) and only resolves
# to a sub-directory on Windows.
df = pd.read_csv('DataFiles/insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# Count the missing values in each column.
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [4]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep='first').sum()

1

In [5]:
# Remove exact duplicate rows (the original index labels are kept),
# then summarise the remaining columns and dtypes.
df = df.drop_duplicates()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1337 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1337 non-null   int64  
 1   sex       1337 non-null   object 
 2   bmi       1337 non-null   float64
 3   children  1337 non-null   int64  
 4   smoker    1337 non-null   object 
 5   region    1337 non-null   object 
 6   charges   1337 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 83.6+ KB


# PreProcessing

## Creating our splits

In [6]:
# Target vector: the `charges` column the model will predict.
y = df['charges']
# Feature matrix: every remaining column.
X = df.drop(columns=['charges'])

X.shape

(1337, 6)

In [7]:
# Hold out a test set. No size is given, so scikit-learn's default split
# proportions apply; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)
X_train

Unnamed: 0,age,sex,bmi,children,smoker,region
763,27,male,26.03,0,no,northeast
1079,63,male,33.66,3,no,southeast
178,46,female,28.90,2,no,southwest
287,63,female,26.22,0,no,northwest
1290,38,female,19.95,2,no,northeast
...,...,...,...,...,...,...
1096,51,female,34.96,2,yes,northeast
1131,27,male,45.90,2,no,southwest
1295,20,male,22.00,1,no,southwest
861,38,female,28.00,3,no,southwest


## Creating the pipelines

### Ordinal
There are no ordinal columns in this dataset, so no ordinal encoding step is needed.

### Nominal

In [8]:
# Column Selector: picks every column whose dtype is `object`
cat_selector = make_column_selector(dtype_include='object')

# PreProcessor: dense one-hot encoding; categories not seen during fit are
# encoded as all zeros instead of raising an error.
# The dense-output flag is named `sparse_output` from scikit-learn 1.2 onwards,
# and the older `sparse` spelling was removed in 1.4. Try the current name first
# and fall back to the old one so the cell runs on either side of the rename.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Tuple: (transformer, columns) pair, the form make_column_transformer expects
cat_tuple = (ohe, cat_selector)

### Numeric

In [9]:
# PreProcessor: standardise features (zero mean, unit variance by default)
scaler = StandardScaler()

# Column Selector: every column with a numeric dtype
num_selector = make_column_selector(dtype_include='number')

# Tuple: (transformer, columns) pair, the form make_column_transformer expects
num_tuple = (scaler, num_selector)

### Transformer

In [10]:
# Apply the numeric and categorical preprocessing side by side; any column
# matched by neither selector is passed through unchanged.
col_transformer = make_column_transformer(
    num_tuple,
    cat_tuple,
    remainder='passthrough',
)
col_transformer

### Model

In [11]:
# Model instance: ordinary least-squares linear regression
lr = LinearRegression()

# Pipeline: preprocessing first, then the regressor
pipeline = make_pipeline(col_transformer, lr)
pipeline

## Fitting the Model Pipeline to the Data

In [12]:
# Fit the preprocessing steps and the regressor on the training split only.
pipeline.fit(X=X_train, y=y_train)

In [13]:
# Predict on both splits so the training and testing fit can be compared.
y_hat_train, y_hat_test = (pipeline.predict(split) for split in (X_train, X_test))

## Measuring the model using r2

In [14]:
# Coefficient of determination (R^2) for each split.
train_r2 = r2_score(y_true=y_train, y_pred=y_hat_train)
test_r2 = r2_score(y_true=y_test, y_pred=y_hat_test)

# Report both scores, one per line.
print(
    f'Model Training R2: {train_r2}',
    f'Model Testing R2: {test_r2}',
    sep='\n',
)

Model Training R2: 0.7297491098061568
Model Testing R2: 0.7959403124253074


### Interpretation
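The model explains about 73% of the variance in the target on the training data (R² ≈ 0.73), using the given features.

Oddly enough, the fit on the testing data is better, at about 80% (R² ≈ 0.80). A test score above the training score is unusual; with a test set of only a few hundred rows it most likely reflects this particular random split rather than a genuinely better fit on unseen data, so it is worth confirming with cross-validation or a different `random_state`.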