In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

In [3]:
# 1. Load dataset

df = pd.read_csv(r"D:\Python Projects\4-ML-Projects\P2_Insurance_Premium_Prediction\Data\insurance.csv")

df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
df.shape

(1338, 7)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [None]:

# 2. Encode categorical variables
df['sex'] = df['sex'].map({'male': 0, 'female': 1})
df['smoker'] = df['smoker'].map({'no': 0, 'yes': 1})
df['region'] = df['region'].map({'northwest': 0, 'northeast': 1, 'southeast': 2, 'southwest': 3})


In [8]:
df.isnull().sum()


age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [9]:
df.describe(include='all')

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
count,1338.0,1338,1338.0,1338.0,1338,1338,1338.0
unique,,2,,,2,4,
top,,male,,,no,southeast,
freq,,676,,,1064,364,
mean,39.207025,,30.663397,1.094918,,,13270.422265
std,14.04996,,6.098187,1.205493,,,12110.011237
min,18.0,,15.96,0.0,,,1121.8739
25%,27.0,,26.29625,0.0,,,4740.28715
50%,39.0,,30.4,1.0,,,9382.033
75%,51.0,,34.69375,2.0,,,16639.912515


In [10]:

# 2. Encode categorical variables
df['sex'] = df['sex'].map({'male': 0, 'female': 1})
df['smoker'] = df['smoker'].map({'no': 0, 'yes': 1})
df['region'] = df['region'].map({'northwest': 0, 'northeast': 1, 'southeast': 2, 'southwest': 3})


In [11]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.9,0,1,3,16884.924
1,18,0,33.77,1,0,2,1725.5523
2,28,0,33.0,3,0,2,4449.462
3,33,0,22.705,0,0,0,21984.47061
4,32,0,28.88,0,0,0,3866.8552


In [12]:
# 3. Features and target
X = df.drop('charges', axis=1)
y = df['charges']

# 4. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# 5. Define models
models = {
    'Linear Regression': LinearRegression(),
    'Support Vector Regression': SVR(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42, verbosity=0)
}

In [14]:
# 6. Train & evaluate models
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    train_score = r2_score(y_train, y_pred_train)
    test_score = r2_score(y_test, y_pred_test)

    results.append({
        'Model': name,
        'Train R2': round(train_score, 3),
        'Test R2': round(test_score, 3)
    })

In [15]:
# 7. Create results DataFrame
results_df = pd.DataFrame(results)


In [16]:
results_df

Unnamed: 0,Model,Train R2,Test R2
0,Linear Regression,0.741,0.783
1,Support Vector Regression,-0.098,-0.072
2,Random Forest,0.974,0.863
3,Gradient Boosting,0.899,0.88
4,XGBoost,0.994,0.845


In [17]:
# 8. Rank by Test R2
ranked = results_df.sort_values(by='Test R2', ascending=False).reset_index(drop=True)

In [18]:
# 9. Print results and top 3
print("\nModel Performance (sorted by Test R2):")
print(ranked)

print("\n🏆 Top 3 Models:")
print(ranked.head(3))


Model Performance (sorted by Test R2):
                       Model  Train R2  Test R2
0          Gradient Boosting     0.899    0.880
1              Random Forest     0.974    0.863
2                    XGBoost     0.994    0.845
3          Linear Regression     0.741    0.783
4  Support Vector Regression    -0.098   -0.072

🏆 Top 3 Models:
               Model  Train R2  Test R2
0  Gradient Boosting     0.899    0.880
1      Random Forest     0.974    0.863
2            XGBoost     0.994    0.845


In [19]:
new_data = pd.DataFrame([
    [25, 1, 28.5, 0, 0, 2],   # young, female, non-smoker, southeast
    [45, 0, 35.1, 2, 1, 1],   # middle-aged, male, smoker, northeast
    [34, 0, 26.7, 1, 0, 0],   # male, northwest
    [52, 1, 29.3, 3, 1, 3],   # older female smoker, southwest
    [23, 1, 31.2, 0, 0, 2],   # young female, southeast
    [40, 0, 30.0, 2, 0, 1],   # male, northeast
    [60, 1, 27.5, 1, 1, 0],   # older female smoker, northwest
    [30, 0, 33.0, 0, 0, 3],   # male, southwest
    [37, 1, 25.5, 2, 0, 2],   # female, southeast
    [50, 0, 29.8, 4, 1, 1]    # older male, smoker, northeast
], columns=['age', 'sex', 'bmi', 'children', 'smoker', 'region'])

In [21]:
# Predict charges for new data
predictions = models['Gradient Boosting'].predict(new_data)

# Combine input and predicted outputs
new_data_with_predictions = new_data.copy()
new_data_with_predictions['Predicted Charges'] = predictions

# Show results
print(new_data_with_predictions)

   age  sex   bmi  children  smoker  region  Predicted Charges
0   25    1  28.5         0       0       2        4368.196225
1   45    0  35.1         2       1       1       43119.121042
2   34    0  26.7         1       0       0        6635.667224
3   52    1  29.3         3       1       3       26572.034336
4   23    1  31.2         0       0       2        4620.760310
5   40    0  30.0         2       0       1        8039.802941
6   60    1  27.5         1       1       0       27769.760527
7   30    0  33.0         0       0       3        3554.058061
8   37    1  25.5         2       0       2        8142.638405
9   50    0  29.8         4       1       1       27697.176196


In [22]:
# Predict charges for new data
predictions = models['Random Forest'].predict(new_data)

# Combine input and predicted outputs
new_data_with_predictions = new_data.copy()
new_data_with_predictions['Predicted Charges'] = predictions

# Show results
print(new_data_with_predictions)

   age  sex   bmi  children  smoker  region  Predicted Charges
0   25    1  28.5         0       0       2        3105.463017
1   45    0  35.1         2       1       1       42809.091589
2   34    0  26.7         1       0       0        5269.914722
3   52    1  29.3         3       1       3       25973.090595
4   23    1  31.2         0       0       2        5616.925069
5   40    0  30.0         2       0       1        7167.138692
6   60    1  27.5         1       1       0       27867.532050
7   30    0  33.0         0       0       3        4377.103711
8   37    1  25.5         2       0       2        7964.861195
9   50    0  29.8         4       1       1       26634.650206
