In [34]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

In [35]:
# Location of the insurance CSV. NOTE(review): the path looks like a Colab
# Google Drive mount -- confirm the drive is mounted before this cell runs.
DATA_PATH = "/content/drive/MyDrive/insurance.csv"

df = pd.read_csv(DATA_PATH)
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [36]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

(1338, 7)

In [37]:
# Column names as a plain Python list.
list(df.columns)

['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']

In [38]:
# Keep only the first occurrence of each fully identical row.
# The surviving rows keep their original index labels.
is_repeat = df.duplicated()
df = df[~is_repeat]
print("Remaining rows after removing duplicates:", df.shape[0])

Remaining rows after removing duplicates: 1337


In [39]:
# Report the number of missing values per column.
# display() is required here: a bare expression is only rendered by the
# notebook when it is the last statement of the cell, and this one is not.
display(df.isnull().sum())

# If there are any missing values, fill them as follows.
# Numeric columns: fill with the column median.
num_cols = df.select_dtypes(include='number').columns
for col in num_cols:
    median_val = df[col].median()
    df[col] = df[col].fillna(median_val)

# Text (object) columns: fill with the most frequent value.
cat_cols = df.select_dtypes(include='object').columns
for col in cat_cols:
    modes = df[col].mode()
    # mode() returns an empty Series when every value in the column is
    # missing; indexing [0] would then raise, so such a column is left as-is.
    if not modes.empty:
        mode_val = modes[0]
        df[col] = df[col].fillna(mode_val)

In [40]:
# Normalise the text columns: trim surrounding whitespace and lower-case
# every value so that spelling variants collapse to one category.
for text_col in ('sex', 'region', 'smoker'):
    df[text_col] = df[text_col].str.strip().str.lower()

# Expand the single-letter sex codes to the full labels.
sex_labels = {'m': 'male', 'f': 'female'}
df['sex'] = df['sex'].replace(sex_labels)

df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [41]:
# Summary statistics of the numeric columns (charges is still on its raw scale here).
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1337.0,1337.0,1337.0,1337.0
mean,39.222139,30.663452,1.095737,13279.121487
std,14.044333,6.100468,1.205571,12110.359656
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29,0.0,4746.344
50%,39.0,30.4,1.0,9386.1613
75%,51.0,34.7,2.0,16657.71745
max,64.0,53.13,5.0,63770.42801


In [42]:
# Compress the long right tail of `charges` with log(1 + x) so that very
# large values have less influence on the models fitted below.
# NOTE: this overwrites the column in place, so running the cell a second
# time applies the transform twice.
log_charges = np.log1p(df['charges'])
df['charges'] = log_charges

In [43]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each category, which then acts as the implicit baseline.
df = pd.get_dummies(data=df, drop_first=True)
df.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,9.734236,False,True,False,False,True
1,18,33.77,1,7.453882,True,False,False,True,False
2,28,33.0,3,8.400763,True,False,False,True,False
3,33,22.705,0,9.998137,True,False,True,False,False
4,32,28.88,0,8.260455,True,False,True,False,False


In [44]:
# Target: the (log-transformed) charges column; features: everything else.
y = df['charges']
X = df.drop(columns=['charges'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [45]:
# Feature selection strategy: rank the features by their mutual information
# with the target, computed on the training partition only.
# random_state pins the estimator's internal randomness so the scores (and
# therefore the ranking) are the same on every run.
mi_scores = mutual_info_regression(X_train, y_train, random_state=42)

# Create a DataFrame for easy viewing
mi_df = pd.DataFrame({
    'Feature': X_train.columns,
    'MI Score': mi_scores
})

# Sort by score, most informative first
mi_df = mi_df.sort_values(by='MI Score', ascending=False)

# Maximum number of top-ranked features to keep.
# NOTE: the recorded output of this cell lists eight features, i.e. every
# column of X_train, so with a cutoff of 10 nothing is actually filtered out.
# Lower this value to make the selection take effect.
TOP_K_FEATURES = 10

Important_features = mi_df['Feature'].head(TOP_K_FEATURES).tolist()
print("Important_features:",Important_features)

Important_features: ['age', 'smoker_yes', 'children', 'sex_male', 'bmi', 'region_northwest', 'region_southeast', 'region_southwest']


In [46]:
# Baseline: ordinary least squares on every feature.
model = LinearRegression().fit(X_train, y_train)

# Score on the held-out rows. The target was log1p-transformed earlier in
# the notebook, so this R² is measured on the log scale.
y_pred = model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R² Score:", r2)

R² Score: 0.829478626133356


In [47]:
# Feature matrix restricted to the MI-selected columns (kept for reference).
X_new = df[Important_features]

# Reuse the existing train/test partitions and only narrow their columns.
# Re-running train_test_split here would rebind y_train / y_test, which other
# cells pair with the original X_train / X_test; that stays aligned only as
# long as the seed and row count happen to match. Slicing guarantees the same
# rows are used for every model comparison.
X_train_1 = X_train[Important_features]
X_test_1 = X_test[Important_features]

# Linear regression on the selected features only.
model = LinearRegression()
model.fit(X_train_1, y_train)

y_pred = model.predict(X_test_1)
r2 = r2_score(y_test, y_pred)
print("R² Score:", r2)

R² Score: 0.8294786261333561


In [48]:
# Random forest (100 trees, fixed seed) trained on every feature.
model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
print("R² Score:", r2_score(y_true=y_test, y_pred=y_pred))

R² Score: 0.8431171001380018


In [49]:
# Same random-forest configuration, trained on the MI-selected feature columns.
model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train_1, y_train)

# Evaluate on the matching held-out rows.
y_pred = model.predict(X_test_1)
print("R² Score:", r2_score(y_true=y_test, y_pred=y_pred))

R² Score: 0.8431475124642963
