## Exploratory Data Analysis

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import StandardScaler

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas/seaborn deprecation and chained-assignment
# warnings — confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw data set from the working directory.
DATA_PATH = 'insurance.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Print dtypes and non-null counts, then display the number of missing values per column.
df.info()
df.isnull().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Numeric columns whose distributions are plotted in this section.
cols = ['age', 'bmi', 'children', 'charges']

# One figure per column: 20-bin histogram with a KDE curve overlaid.
for column_name in cols:
    plt.figure(figsize=(6, 4))
    sns.histplot(data=df, x=column_name, kde=True, bins=20)

In [None]:
# Bar chart of how many rows fall into each 'sex' value.
sns.countplot(data=df, x='sex')

In [None]:
# Bar chart of how many rows fall into each 'smoker' value.
sns.countplot(data=df, x='smoker')

In [None]:
# One box plot per numeric column (uses the `cols` list defined for the histograms)
# to show spread and potential outliers.
for column_name in cols:
    plt.figure(figsize=(6, 4))
    sns.boxplot(data=df, x=column_name)

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
numeric_corr = df.corr(numeric_only=True)
sns.heatmap(numeric_corr, annot=True)

## Data Cleaning and Preprocessing

In [None]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched.
df_cleaned = df.copy(deep=True)

In [None]:
# (rows, columns) of the working copy before duplicates are removed.
df_cleaned.shape

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df_cleaned = df_cleaned.drop_duplicates()

In [None]:
# (rows, columns) of the working copy at this point, for comparison with the earlier shape.
df_cleaned.shape

In [None]:
# Number of missing values in each column of the working copy.
df_cleaned.isnull().sum()

In [None]:
# Inspect the dtype of each column.
df_cleaned.dtypes

In [None]:
# Frequency of each distinct value in the 'sex' column.
df_cleaned['sex'].value_counts()

In [None]:
# Binary-encode 'sex': male -> 0, female -> 1 (any other value becomes NaN).
sex_codes = {'male': 0, 'female': 1}
df_cleaned['sex'] = df_cleaned['sex'].map(sex_codes)

In [None]:
# Binary-encode 'smoker': yes -> 1, no -> 0 (any other value becomes NaN).
smoker_codes = {'yes': 1, 'no': 0}
df_cleaned['smoker'] = df_cleaned['smoker'].map(smoker_codes)

In [None]:
# Give the encoded columns names that state what a value of 1 means.
indicator_names = {'sex': 'is_female', 'smoker': 'is_smoker'}
df_cleaned = df_cleaned.rename(columns=indicator_names)

In [None]:
# Frequency of each distinct value in the 'region' column.
df_cleaned['region'].value_counts()

In [None]:
# One-hot encode 'region'. drop_first=True omits the first category level so the
# remaining indicators are not perfectly collinear.
region_columns = ['region']
df_cleaned = pd.get_dummies(data=df_cleaned, columns=region_columns, drop_first=True)

In [None]:
# Cast only the boolean indicator columns (as produced by pd.get_dummies) to 0/1
# integers. A frame-wide astype(int) would also truncate any float column
# (presumably 'bmi' and 'charges' here), silently discarding the fractional part
# before the BMI binning and correlation steps that follow.
bool_columns = df_cleaned.select_dtypes(include='bool').columns
df_cleaned = df_cleaned.astype({name: int for name in bool_columns})

In [None]:
# Display the working frame after encoding.
df_cleaned

## Feature Engineering and Extraction

In [None]:
# Distribution of BMI in the cleaned frame, with a KDE curve overlaid.
sns.histplot(data=df_cleaned, x='bmi', kde=True)

In [None]:
# Bucket BMI using the standard cut-offs: <18.5 underweight, 18.5 to <25 normal,
# 25 to <30 overweight, >=30 obese. right=False makes every bin closed on the
# left, so the boundary values 18.5, 25 and 30 fall into the higher class and
# values such as 24.95 or 29.95 are no longer pushed into the next category.
df_cleaned['bmi_category'] = pd.cut(
    df_cleaned['bmi'],
    bins=[0, 18.5, 25, 30, float('inf')],
    labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
    right=False,
)

In [None]:
# One-hot encode the BMI bucket. drop_first=True omits the first label
# ('Underweight'), leaving indicators for Normal, Overweight and Obese.
bmi_category_columns = ['bmi_category']
df_cleaned = pd.get_dummies(data=df_cleaned, columns=bmi_category_columns, drop_first=True)

In [None]:
# Cast only the boolean indicator columns (as produced by pd.get_dummies) to 0/1
# integers. A frame-wide astype(int) would also truncate any float column
# (presumably 'bmi' and 'charges' here) right before scaling and correlation.
bool_columns = df_cleaned.select_dtypes(include='bool').columns
df_cleaned = df_cleaned.astype({name: int for name in bool_columns})

In [None]:
# Standardise the listed columns (zero mean, unit variance) in place.
# NOTE(review): the scaler is fitted on every row of df_cleaned; if a train/test
# split is made later, fitting should happen on the training part only — confirm.
cols = ['age', 'bmi', 'children']
scaler = StandardScaler()

scaled_values = scaler.fit_transform(df_cleaned[cols])
df_cleaned[cols] = scaled_values

In [None]:
# Preview the first five rows of the working frame.
df_cleaned.head()

In [None]:
from scipy.stats import pearsonr

selected_features = [
    'age', 'is_female', 'bmi', 'children', 'is_smoker', 'charges',
    'region_northwest', 'region_southeast', 'region_southwest',
    'bmi_category_Normal', 'bmi_category_Overweight', 'bmi_category_Obese',
]

# Pearson r of every listed column against 'charges'. The list contains
# 'charges' itself, so that row is the trivial self-correlation of 1.0.
correlation = {}
for feature in selected_features:
    r_value = pearsonr(df_cleaned[feature], df_cleaned['charges'])[0]
    correlation[feature] = r_value

correlation_df = pd.DataFrame(
    {
        'Feature': list(correlation.keys()),
        'Pearson_Column': list(correlation.values()),
    }
)

# Strongest positive correlation first.
correlation_df.sort_values(by='Pearson_Column', ascending=False)

In [None]:
# Binary indicator columns that are tested against the binned target below.
cat_features = [
    'is_female',
    'is_smoker',
    'region_northwest',
    'region_southeast',
    'region_southwest',
    'bmi_category_Normal',
    'bmi_category_Overweight',
    'bmi_category_Obese',
]

In [None]:
from scipy.stats import chi2_contingency

# Significance level for the chi-square test of independence.
alpha = 0.05

# The chi-square test needs a categorical target, so bucket 'charges' into
# quartiles; labels=False stores the integer bin index (0-3).
df_cleaned['charges_bin'] = pd.qcut(df_cleaned['charges'], q=4, labels=False)
chi2_results = {}

for col in cat_features:
    # Observed counts of each feature value within each charges quartile.
    contingency = pd.crosstab(df_cleaned[col], df_cleaned['charges_bin'])
    chi2_stat, p_val, _, _ = chi2_contingency(contingency)
    # NOTE: "Accept Null" is shorthand for "failed to reject the null at alpha";
    # the labels are kept verbatim in case anything filters on them.
    decision = 'Reject Null (Keep Feature)' if p_val < alpha else 'Accept Null (Drop Feature)'
    chi2_results[col] = {
        'chi2_statistic': chi2_stat,
        'p_value': p_val,
        'Decision': decision
    }

# One row per feature. orient='index' lets pandas infer a dtype per column, so
# chi2_statistic and p_value stay float64 instead of the all-object frame that
# transposing a mixed float/string dict-of-dicts would produce.
chi2_df = pd.DataFrame.from_dict(chi2_results, orient='index')
chi2_df = chi2_df.sort_values(by='p_value')
chi2_df

In [None]:
# Columns carried forward: the base predictors, the 'charges' target and the
# two indicator columns retained from the encoded region / BMI-category sets.
final_columns = [
    'age', 'is_female', 'bmi', 'children', 'is_smoker', 'charges',
    'region_southeast', 'bmi_category_Obese',
]
final_df = df_cleaned[final_columns]

final_df.head(10)