In [177]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [179]:
df = pd.read_csv('insurance.csv')

In [181]:
#One hot Encoding the gender column
#drop_first=True dropping newly created column

#df = pd.get_dummies(df, columns = ['gender'], drop_first=True)


#Label encode the gender column
# le = LabelEncoder()
# df['gender'] = le.fit_transform(df['gender'])  # Encodes Gender as 0 -Female and 1 - Male
# print(df.head())

#Frequency Encoding
gender_counts = df['gender'].value_counts(normalize=True)  #Calculate frequencies
print(gender_counts)
df['gender'] = df['gender'].map(gender_counts) #Replace categories with frequencies
df.head()

gender
male      0.505232
female    0.494768
Name: proportion, dtype: float64


Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,0.494768,27.9,0,yes,southwest,16884.924
1,18,0.505232,33.77,1,no,southeast,1725.5523
2,28,0.505232,33.0,3,no,southeast,4449.462
3,33,0.505232,22.705,0,no,northwest,21984.47061
4,32,0.505232,28.88,0,no,northwest,3866.8552


In [183]:
#Extract relevant columns
X = df[['age', 'bmi', 'gender']].values   #Predictor
y = df['charges'].values


In [185]:
#COmmented to reduce complexity

#Apply Standard Scalar to predictors
#scalar = StandardScaler()
    #scalar = MinMaxScaler()
#print(df[['age', 'bmi']])
#x_scaled = scalar.fit_transform(X)
#print(x_scaled)

In [187]:
#Split dataset in to training and testing set
X_train, X_test, Y_train, Y_test = train_test_split(X,y,test_size=0.2 , random_state=42)

In [189]:
#Fit the linear regression model
model = LinearRegression()
y_pred = model.fit(X_train, Y_train)

In [191]:
#Make predictions o test set
y_pred = model.predict(X_test)

In [193]:

#Metrics and interpretation
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, y_pred)

In [195]:
print("model Coefficients")
print(f" Slope (Coefficient for Carat):  {model.coef_[0]}")
print(f" Intercept : {model.intercept_}")
print(f"\nModel Performance Metrics")
print(f" Mean Squared Error (MSE) {mse}")
print(f"Root  Mean Squared Error (RMSE) {rmse}")
print(f"R^2  Score (Explained Variance) : {r2}")

model Coefficients
 Slope (Coefficient for Carat):  224.29893746636694
 Intercept : -70436.15295180352

Model Performance Metrics
 Mean Squared Error (MSE) 131043916.53369787
Root  Mean Squared Error (RMSE) 11447.441484178806
R^2  Score (Explained Variance) : 0.15590990487003253


In [197]:
#Print predicted vs actual values for the test set
print("\nPredicted vs actual prices (Test set):")
for actual,predicted in zip(Y_test,y_pred):
    print(f" Actual : {actual:.2f} , Predicted : {predicted:.2f}")


Predicted vs actual prices (Test set):
 Actual : 9095.07 , Predicted : 12144.93
 Actual : 5272.18 , Predicted : 11719.94
 Actual : 29330.98 , Predicted : 16969.09
 Actual : 9301.89 , Predicted : 13914.56
 Actual : 33750.29 , Predicted : 9889.68
 Actual : 4536.26 , Predicted : 16865.90
 Actual : 2117.34 , Predicted : 5438.18
 Actual : 14210.54 , Predicted : 21951.83
 Actual : 3732.63 , Predicted : 5738.17
 Actual : 10264.44 , Predicted : 15556.18
 Actual : 18259.22 , Predicted : 9883.36
 Actual : 7256.72 , Predicted : 14386.77
 Actual : 3947.41 , Predicted : 10008.40
 Actual : 46151.12 , Predicted : 19376.58
 Actual : 48673.56 , Predicted : 22046.38
 Actual : 44202.65 , Predicted : 18479.20
 Actual : 9800.89 , Predicted : 18800.18
 Actual : 42969.85 , Predicted : 16361.51
 Actual : 8233.10 , Predicted : 13314.92
 Actual : 21774.32 , Predicted : 12258.81
 Actual : 5080.10 , Predicted : 8597.40
 Actual : 7441.50 , Predicted : 15496.54
 Actual : 1256.30 , Predicted : 9389.70
 Actual : 275

In [199]:
#Cannot plot as we are doing Multiple linear regression

# plt.figure(figsize=(8,6))
# plt.scatter(X_test,Y_test,color='blue',label='Actual Prices')
# plt.plot(X_test,y_pred,color='red',linewidth=2,label="Regression Line")
# plt.xlabel("Carat")
# plt.ylabel("Price")
# plt.title("Linear Regression: Actual Prices vs Carat")
# plt.legend()
# plt.grid()

