## Business Problem Understanding

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress all Python warnings raised by the cells that follow.
# NOTE(review): this also hides pandas / scikit-learn deprecation warnings;
# consider narrowing the filter to specific categories.
warnings.simplefilter("ignore")

In [2]:
# Load the insurance dataset from the Excel workbook (path is relative to the working directory).
df=pd.read_excel("insurance.xlsx")
# Preview the first rows to check column names and value formats.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


## Data Understanding

In [3]:
# (rows, columns) of the loaded frame.
df.shape

(1338, 7)

In [4]:
# Column dtypes, non-null counts and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
# Bucket every column of `df` by dtype:
#   - float columns   -> continuous
#   - integer columns -> discrete_count
#   - everything else -> discrete_categorical
discrete_categorical = []
discrete_count = []
continuous = []

d_types = dict(df.dtypes)

# The loop variable is deliberately not called `type`: that would shadow the
# built-in `type` for every later cell in the kernel.
# The dtype *kind* is tested instead of the exact strings "object"/"float64",
# so float32 or category columns are not mis-filed as counts.
for col_name, col_dtype in d_types.items():
    if pd.api.types.is_float_dtype(col_dtype):
        continuous.append(col_name)
    elif pd.api.types.is_integer_dtype(col_dtype):
        discrete_count.append(col_name)
    else:
        discrete_categorical.append(col_name)

print(f"Categorical : {discrete_categorical}")
print(f"Count : {discrete_count}")
print(f"Continuous : {continuous}")

Categorical : ['sex', 'smoker', 'region']
Count : ['age', 'children']
Continuous : ['bmi', 'expenses']


In [6]:
# Row count for every observed combination of the categorical columns.
df[discrete_categorical].value_counts()

sex     smoker  region   
female  no      southwest    141
                southeast    139
                northwest    135
male    no      southeast    134
female  no      northeast    132
male    no      northwest    132
                southwest    126
                northeast    125
        yes     southeast     55
                northeast     38
                southwest     37
female  yes     southeast     36
                northeast     29
                northwest     29
male    yes     northwest     29
female  yes     southwest     21
Name: count, dtype: int64

In [7]:
# Number of distinct levels in each categorical column.
df[discrete_categorical].nunique()

sex       2
smoker    2
region    4
dtype: int64

In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,expenses
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.665471,1.094918,13270.422414
std,14.04996,6.098382,1.205493,12110.01124
min,18.0,16.0,0.0,1121.87
25%,27.0,26.3,0.0,4740.2875
50%,39.0,30.4,1.0,9382.03
75%,51.0,34.7,2.0,16639.915
max,64.0,53.1,5.0,63770.43


In [9]:
# Pairwise correlation between the continuous columns (pandas default method).
df[continuous].corr()

Unnamed: 0,bmi,expenses
bmi,1.0,0.198576
expenses,0.198576,1.0


## Data Preprocessing

In [10]:
# Missing-value count per column.
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

In [11]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

1

In [12]:
# Show the repeated row(s) before removing them.
df[df.duplicated()]

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
581,19,male,30.6,0,no,northwest,1639.56


In [13]:
# Remove exact duplicate rows, modifying `df` in place (the first occurrence is kept;
# the original index labels are preserved, not renumbered).
df.drop_duplicates(inplace=True)

In [14]:
# Re-check: no duplicate rows should remain after the drop.
df.duplicated().sum()


0

In [15]:
# Remove the `region` column from `df` in place; it is not used as a model feature below.
# NOTE(review): re-running this cell on the same kernel raises KeyError because the column is already gone.
df.drop(columns=['region'], inplace=True)

In [16]:
# Encode the two binary categorical columns as 0/1 integers.
#
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: an in-place method on `df[col]` is
# chained assignment, which pandas warns about and which stops updating `df`
# under Copy-on-Write.
# `.astype("int64")` pins the dtype explicitly and fails loudly if an
# unexpected label is left unmapped.
df['sex'] = df['sex'].replace({'female': 0, 'male': 1}).astype("int64")
df['smoker'] = df['smoker'].replace({'no': 0, 'yes': 1}).astype("int64")

In [17]:
# Separate the target (`expenses`) from the predictors (every remaining column).
y = df['expenses']
X = df.drop(columns=['expenses'])

#### Train-test split

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=9,
)

## Modelling & Evaluation

In [19]:
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# Baseline: ElasticNet with scikit-learn's default hyper-parameters.
model = ElasticNet()
model.fit(X_train, y_train)

# Predictions on both splits, kept for the R² comparison below.
ypred_train = model.predict(X_train)
ypred_test = model.predict(X_test)

# R² on train / test, plus the mean 5-fold cross-validation score on the training split.
train_r2 = r2_score(y_train, ypred_train)
test_r2 = r2_score(y_test, ypred_test)
cv_mean = cross_val_score(model, X_train, y_train, cv=5).mean()

print(f"Train r2 score is {train_r2}")
print(f"Test r2 score is {test_r2}")
print(f"CV score is {cv_mean}")

Train r2 score is 0.4040370376357658
Test r2 score is 0.36842833194908875
CV score is 0.39832534576079165


In [20]:
from sklearn.model_selection import GridSearchCV

# Candidate penalty strengths (`alpha`) and L1/L2 mixes (`l1_ratio`) to try.
param_grid = {
    "alpha": [0.1, 0.2, 1, 2, 3, 5, 10],
    "l1_ratio": [0.1, 0.5, 0.75, 0.9, 0.95, 1],
}
estimator = ElasticNet()

# Exhaustive 5-fold search, ranking candidates by (negated) mean squared error.
# Note that `model` is rebound here from the fitted baseline to the search object.
model = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
model.fit(X_train, y_train)
model.best_params_

{'alpha': 10, 'l1_ratio': 1}

In [23]:
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# Refit with hard-coded hyper-parameters (alpha=10, l1_ratio=1).
# With l1_ratio=1 the ElasticNet penalty is a pure L1 penalty.
model = ElasticNet(alpha=10, l1_ratio=1)
model.fit(X_train, y_train)

ypred_train = model.predict(X_train)
ypred_test = model.predict(X_test)

# Fitted linear model: one coefficient per column of X_train, in column order.
print(f"Intercept {model.intercept_}")
print(f"Coefficient {model.coef_}")

# R² on train / test, plus the mean 5-fold cross-validation score on the training split.
train_r2 = r2_score(y_train, ypred_train)
test_r2 = r2_score(y_test, ypred_test)
cv_mean = cross_val_score(model, X_train, y_train, cv=5).mean()

print(f"Train r2 score is {train_r2}")
print(f"Test r2 score is {test_r2}")
print(f"CV score is {cv_mean}")

Intercept -12145.944081374624
Coefficient [  264.53004065   -76.75779933   318.16338722   405.3431035
 23924.51240129]
Train r2 score is 0.7593760721703684
Test r2 score is 0.7006642291490784
CV score is 0.7535351023878463


## Prediction

In [29]:
# Raw attributes of a single record to score, keyed by the dataset's original column names.
input_data = dict(
    age=31,
    sex="female",
    bmi=25.74,
    children=0,
    smoker="no",
    region='northeast',
)

In [30]:
# Build a one-row frame from the raw input and apply the same preprocessing
# used for the training data: drop `region`, then encode `sex` and `smoker` as 0/1.
df_test = pd.DataFrame(input_data, index=[0]).drop(columns=['region'])

# Assign the encoded column back instead of calling an in-place method on
# `df_test[col]`: that is chained assignment, which pandas warns about and
# which stops updating the frame under Copy-on-Write.
# `.astype("int64")` fails loudly if an unknown label slips through.
df_test['sex'] = df_test['sex'].replace({"female": 0, "male": 1}).astype("int64")
df_test['smoker'] = df_test['smoker'].replace({"no": 0, "yes": 1}).astype("int64")

In [31]:
# Predicted expenses for the single prepared row (returned as a one-element array).
model.predict(df_test)

array([4244.01276589])

## Conclusion

### Model Performance
- **Train R² Score**: **0.7594** – The ElasticNet model explains approximately **75.94%** of the variance in the training data.
- **Test R² Score**: **0.7007** – On unseen data, the model captures around **70.07%** of the variance, indicating good generalizability.
- **Cross-Validation (CV) Score**: **0.7535** – The model maintains consistent performance across different subsets, reducing overfitting concerns.

### Key Takeaways
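- The grid search selected `alpha=10` with `l1_ratio=1`, so the tuned ElasticNet reduces to a pure **L1 (Lasso)** penalty; the **L2 (Ridge)** component is inactive in the final model, and none of the five coefficients was shrunk to zero, so no feature was dropped.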
- The close alignment between train, test, and cross-validation scores suggests a well-regularized model with minimal overfitting.
- The model's ability to generalize well makes it suitable for predicting future data points with reasonable confidence.


Overall, the tuned ElasticNet (effectively a Lasso model, since `l1_ratio=1`) proved to be an effective regularized linear model for this dataset, achieving balanced performance across the train, test, and cross-validation scores.
