<a href="https://colab.research.google.com/github/Rahafhosari/DataScience2024-2025/blob/master/model_pipeline_core.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Name : Rahaf Hosari

## Mount

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Imports

In [2]:
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn import set_config
set_config(transform_output='pandas')

## Load Data

In [5]:
path = '/content/drive/MyDrive/AXSOSACADEMY/02-IntroML/Week06/Model Pipeline/insurance.csv'
df = pd.read_csv(path)

In [6]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## EDA

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


### Null Values

In [10]:
df.isna().sum()

Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
charges,0


No Null values

### Duplicates

In [11]:
df.duplicated().sum()

1

There's one duplicated row

In [23]:
df = df.drop_duplicates()

In [26]:
df.shape

(1337, 7)

In [25]:
df.duplicated().sum()

0

## Split Train Test

 Define X and y, with "charges" as the target vector (y).

In [30]:
y = df['charges']
X = df.drop(columns='charges')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Column Transformer
Create a column transformer that will:

   - Impute missing values (if needed)

   - One-hot encode any nominal features

   - Scale any numeric features (no ordinal features in this dataset)

In [None]:
# Since there are no null values there's no need for Imputer

In [31]:
# OHE
ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe_encoder

In [32]:
#Scale Numeric Features
scaler = StandardScaler()
scaler

### Column Transformer

In [34]:
#Columns
numerical_cols = X_train.select_dtypes('number').columns
numerical_cols

Index(['age', 'bmi', 'children'], dtype='object')

In [35]:
nominal_cols = X_train.select_dtypes('object').columns
nominal_cols

Index(['sex', 'smoker', 'region'], dtype='object')

In [36]:
#Nominal Tuple
nominal_tuple = ('nominal', ohe_encoder, nominal_cols)
nominal_tuple

('nominal',
 OneHotEncoder(handle_unknown='ignore', sparse_output=False),
 Index(['sex', 'smoker', 'region'], dtype='object'))

In [37]:
#Numerical Tuple
numerical_tuple = ('numerical', scaler, numerical_cols)
numerical_tuple

('numerical',
 StandardScaler(),
 Index(['age', 'bmi', 'children'], dtype='object'))

In [38]:
col_transformer = ColumnTransformer([nominal_tuple,numerical_tuple],
                                    verbose_feature_names_out=False)
col_transformer

## Linear Regression Model (Model Pipeline)
. Instantiate a linear regression model.

. Create a model pipeline with the preprocessor first and the linear regression model last.

In [39]:
# Instantiate a linear regression model
linreg = LinearRegression()
# Combine the preprocessing ColumnTransformer and the linear regression model in a Pipeline
linreg_pipe = make_pipeline(col_transformer, linreg)
linreg_pipe

## Fit Linear Regression Model

In [40]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1002 entries, 763 to 1127
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1002 non-null   int64  
 1   sex       1002 non-null   object 
 2   bmi       1002 non-null   float64
 3   children  1002 non-null   int64  
 4   smoker    1002 non-null   object 
 5   region    1002 non-null   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 54.8+ KB


In [41]:
linreg_pipe.fit(X_train, y_train)

## Evaluate Linear Regression Model

In [44]:
def regression_metrics(y_true, y_pred, label='', verbose = True, output_dict=False):
  # Get metrics
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  # rmse = mean_squared_error(y_true, y_pred, squared=False)
  rmse = mean_squared_error(y_true, y_pred)
  r_squared = r2_score(y_true, y_pred)
  if verbose == True:
    # Print Result with Label and Header
    header = "-"*60
    print(header, f"Regression Metrics: {label}", header, sep='\n')
    print(f"- MAE = {mae:,.3f}")
    print(f"- MSE = {mse:,.3f}")
    print(f"- RMSE = {rmse:,.3f}")
    print(f"- R^2 = {r_squared:,.3f}")
  if output_dict == True:
      metrics = {'Label':label, 'MAE':mae,
                 'MSE':mse, 'RMSE':rmse, 'R^2':r_squared}
      return metrics

def evaluate_regression(reg, X_train, y_train, X_test, y_test, verbose = True,
                        output_frame=False):
  # Get predictions for training data
  y_train_pred = reg.predict(X_train)

  # Call the helper function to obtain regression metrics for training data
  results_train = regression_metrics(y_train, y_train_pred, verbose = verbose,
                                     output_dict=output_frame,
                                     label='Training Data')
  print()
  # Get predictions for test data
  y_test_pred = reg.predict(X_test)
  # Call the helper function to obtain regression metrics for test data
  results_test = regression_metrics(y_test, y_test_pred, verbose = verbose,
                                  output_dict=output_frame,
                                    label='Test Data' )

  # Store results in a dataframe if ouput_frame is True
  if output_frame:
    results_df = pd.DataFrame([results_train,results_test])
    # Set the label as the index
    results_df = results_df.set_index('Label')
    # Set index.name to none to get a cleaner looking result
    results_df.index.name=None
    # Return the dataframe
    return results_df.round(3)

In [45]:
evaluate_regression(linreg_pipe, X_train, y_train, X_test, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 4,207.729
- MSE = 37,182,190.657
- RMSE = 37,182,190.657
- R^2 = 0.730

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 4,069.039
- MSE = 35,283,922.771
- RMSE = 35,283,922.771
- R^2 = 0.796
