# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Loading the data into a dataframe

In [2]:
# Forward slashes are accepted on Windows, macOS and Linux alike. The previous
# backslash form only worked because '\i' is not a recognised escape sequence
# (newer Python versions warn about that) and it could not locate the file on
# non-Windows systems.
path = 'DataFiles/insurance.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# Summarise column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


# PreProcessing

### Defining our X and y
Using this dataset, we are going to try to predict the charges from each patient's age, sex, BMI, number of children, smoking habit, and region. This means our y is charges (the target vector) and our X is everything else (the feature matrix). Let's set that up now.

In [4]:
# Target vector: the insurance charges we want to predict.
y = df['charges']
# Feature matrix: every column except the target.
X = df.drop(columns='charges')

### Is this a classification or regression?
* This is a regression. We are trying to predict a continuous value.

### Identifying the features
My numerical features are:
* age
* bmi
* children

We don't have any ordinal features.

My categorical features are:
* sex
* smoker
* region

I will need to OneHotEncode the categorical variables. First I will split the data, then I'll begin transforming everything

In [5]:
# Hold out a test set using scikit-learn's default split proportions; the
# fixed seed keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [6]:
# Report the shape of each split to confirm the rows were divided as expected.
for label, split in [
    ('Training data', X_train),
    ('Test data', X_test),
    ('Training target', y_train),
    ('Test target', y_test),
]:
    print(f'{label} set size: {split.shape}')

Training data set size: (1003, 6)
Test data set size: (335, 6)
Training target set size: (1003,)
Test target set size: (335,)


Split looks good. Next step is to encode my categorical features

### One Hot Encoding

In [7]:
# Dtype-based column selectors: 'object' (string) columns are treated as
# categorical, numeric columns are picked up separately for scaling.
cat_selector = make_column_selector(dtype_include='object')
num_selector = make_column_selector(dtype_include='number')

# Keep only the categorical columns of each split.
train_cat_data = X_train.loc[:, cat_selector(X_train)]
test_cat_data = X_test.loc[:, cat_selector(X_test)]
train_cat_data

Unnamed: 0,sex,smoker,region
693,male,no,northwest
1297,female,no,southeast
634,male,no,southwest
1022,male,yes,southeast
178,female,no,southwest
...,...,...,...
1095,female,no,northeast
1130,female,no,southeast
1294,male,no,northeast
860,female,yes,southwest


In [8]:
# Fit the encoder on the training categories only, so nothing about the test
# set leaks into preprocessing. With handle_unknown='ignore', a category that
# was not seen during fit is encoded as all zeros instead of raising.
# NOTE: scikit-learn 1.2 renamed `sparse` to `sparse_output`; switch the
# keyword if this line raises on a newer installation.
one_hot = OneHotEncoder(sparse=False, handle_unknown='ignore')
one_hot.fit(train_cat_data)

train_onehot = one_hot.transform(train_cat_data)
# Encode the *test* rows here. Transforming train_cat_data a second time would
# fill the test features with training rows and give them the wrong row count.
test_onehot = one_hot.transform(test_cat_data)
train_onehot

array([[0., 1., 1., ..., 1., 0., 0.],
       [1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 0., 1.],
       ...,
       [0., 1., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 1., 1., ..., 0., 0., 1.]])

* So far so good. We now have some Numpy Arrays for our encoded categorical features.
* Next step is to grab the column names from the encoder and convert the arrays into DataFrames. We will then use those DataFrames to concatenate with the numerical data.

In [9]:
# Store the columns in a variable using the following line of code:
# Column labels generated by the fitted encoder (e.g. "sex_female").
onehot_columns = one_hot.get_feature_names_out(train_cat_data.columns)

# Wrap each encoded array in a DataFrame labelled with those columns.
# reset_index(drop=True) guarantees a plain 0..n-1 index; a frame built from
# an array already has one, so this is a precaution rather than a necessity.
train_onehot = pd.DataFrame(train_onehot, columns=onehot_columns).reset_index(drop=True)
test_onehot = pd.DataFrame(test_onehot, columns=onehot_columns).reset_index(drop=True)

train_onehot

Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
998,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
999,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1000,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1001,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


### Scaling
Now that I have encoded the categorical data, I need to scale the numerical data. I will just standardize every variable for now

In [10]:
# Take the numeric column names from the training split and apply the same
# list to both splits, so train and test always carry identical columns.
numeric_cols = num_selector(X_train)
train_nums = X_train[numeric_cols]
test_nums = X_test[numeric_cols]

train_nums.describe()

Unnamed: 0,age,bmi,children
count,1003.0,1003.0,1003.0
mean,39.255234,30.51178,1.104686
std,14.039105,6.013107,1.204619
min,18.0,15.96,0.0
25%,27.0,26.21,0.0
50%,39.0,30.2,1.0
75%,51.0,34.43,2.0
max,64.0,53.13,5.0


* We will standardize these features to make the mean 0 and the std dev 1

In [11]:
# Creating the object and fitting the training data:
scaler = StandardScaler()
scaler.fit(train_nums)

# Transforming the train and test:
train_nums_scaled = scaler.transform(train_nums)
test_nums_scaled = scaler.transform(test_nums)

train_nums_scaled

array([[-1.08716652, -1.14087456, -0.91749963],
       [-0.80210593, -0.66584152,  0.7436053 ],
       [ 0.83699246,  1.52879447, -0.08694717],
       ...,
       [ 1.33584849, -0.8879673 , -0.91749963],
       [-0.1607196 ,  2.84324666,  0.7436053 ],
       [ 1.12205304, -0.10179179, -0.91749963]])

* Now to convert the arrays to DataFrames

In [12]:
# Rebuild labelled DataFrames from the scaled arrays, reusing the original
# numeric column names and ensuring a plain 0..n-1 index on each.
train_nums_scaled = pd.DataFrame(
    train_nums_scaled, columns=train_nums.columns
).reset_index(drop=True)
test_nums_scaled = pd.DataFrame(
    test_nums_scaled, columns=test_nums.columns
).reset_index(drop=True)

train_nums_scaled.head()

Unnamed: 0,age,bmi,children
0,-1.087167,-1.140875,-0.9175
1,-0.802106,-0.665842,0.743605
2,0.836992,1.528794,-0.086947
3,0.551932,0.926476,-0.086947
4,0.480667,-0.268178,0.743605


In [13]:
# Sanity check: after standardisation each column should show mean 0 and std 1.
train_nums_scaled.describe().round(2)

Unnamed: 0,age,bmi,children
count,1003.0,1003.0,1003.0
mean,0.0,0.0,0.0
std,1.0,1.0,1.0
min,-1.51,-2.42,-0.92
25%,-0.87,-0.72,-0.92
50%,-0.02,-0.05,-0.09
75%,0.84,0.65,0.74
max,1.76,3.76,3.24


Success. Now to complete the assignment by concatenating everything.

### Concatenating the Dataframes
Both the scaled numerical features and the one-hot encoded features were rebuilt as DataFrames from NumPy arrays, so each already has a fresh 0-to-n-1 index. I don't have to reset anything before concatenating.

In [14]:
# Concatenating
X_train_processed = pd.concat([train_nums_scaled,train_onehot], axis = 1)
X_test_processed = pd.concat([test_nums_scaled,test_onehot], axis = 1)

X_train_processed

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,-1.087167,-1.140875,-0.917500,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,-0.802106,-0.665842,0.743605,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.836992,1.528794,-0.086947,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.551932,0.926476,-0.086947,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.480667,-0.268178,0.743605,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
998,-1.514757,0.139468,2.404710,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
999,-0.018189,-1.105101,3.235263,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1000,1.335848,-0.887967,-0.917500,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1001,-0.160720,2.843247,0.743605,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
