In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import os
import requests


# Define paths
drive_path = '/content/drive/My Drive/'
dataset_path = os.path.join(drive_path, 'insurance-2.csv')

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
df = pd.read_csv(dataset_path)

In [8]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [10]:
df.isnull().sum()

Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
charges,0


In [11]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1000.0,1000.0,1000.0,1000.0
mean,39.615,30.86338,1.08,13075.755883
std,14.153908,6.04744,1.198765,11985.924552
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.6,0.0,4719.683425
50%,40.0,30.59,1.0,9283.0213
75%,52.0,35.1125,2.0,15882.795438
max,64.0,50.38,5.0,63770.42801


In [13]:
# Perform one-hot encoding on categorical variables
df_encoded = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)
print("\nDataFrame after One-Hot Encoding:")
print(df_encoded)


DataFrame after One-Hot Encoding:
     age     bmi  children      charges  sex_male  smoker_yes  \
0     19  27.900         0  16884.92400     False        True   
1     18  33.770         1   1725.55230      True       False   
2     28  33.000         3   4449.46200      True       False   
3     33  22.705         0  21984.47061      True       False   
4     32  28.880         0   3866.85520      True       False   
..   ...     ...       ...          ...       ...         ...   
995   39  23.275         3   7986.47525     False       False   
996   39  34.100         3   7418.52200     False       False   
997   63  36.850         0  13887.96850     False       False   
998   33  36.290         3   6551.75010     False       False   
999   36  26.885         0   5267.81815     False       False   

     region_northwest  region_southeast  region_southwest  
0               False             False              True  
1               False              True             False  
2   

In [14]:
df_encoded.corr()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
age,1.0,0.116272,0.031256,0.330647,-0.049141,-0.004548,-0.003197,-0.02524,0.019905
bmi,0.116272,1.0,0.02576,0.18847,0.009645,-0.013054,-0.141397,0.257731,-0.00692
children,0.031256,0.02576,1.0,0.05263,0.021033,-0.003532,-0.012832,-0.028391,0.024252
charges,0.330647,0.18847,0.05263,1.0,0.039561,0.784477,-0.040597,0.055064,-0.049075
sex_male,-0.049141,0.009645,0.021033,0.039561,1.0,0.080716,0.020619,-0.024063,0.003632
smoker_yes,-0.004548,-0.013054,-0.003532,0.784477,0.080716,1.0,-0.037511,0.059107,-0.040025
region_northwest,-0.003197,-0.141397,-0.012832,-0.040597,0.020619,-0.037511,1.0,-0.340092,-0.31137
region_southeast,-0.02524,0.257731,-0.028391,0.055064,-0.024063,0.059107,-0.340092,1.0,-0.352523
region_southwest,0.019905,-0.00692,0.024252,-0.049075,0.003632,-0.040025,-0.31137,-0.352523,1.0


In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np


X = df_encoded.drop('charges', axis=1)
y = df_encoded['charges']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    'Linear Regression': LinearRegression(),
    'Random Forest Regressor': RandomForestRegressor(n_estimators=90, random_state=42)
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    results[name] = {
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'R²': r2
    }
results_df = pd.DataFrame(results).T
print(results_df)

                                 MAE           MSE         RMSE        R²
Linear Regression        4284.996506  3.782582e+07  6150.270305  0.806320
Random Forest Regressor  2468.500534  2.285973e+07  4781.185453  0.882951
