In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error

In [None]:
# Load the raw insurance records; the path is resolved relative to the
# current working directory.
df = pd.read_csv(filepath_or_buffer='insurance.csv')

## Encoding Categorical Features (one-hot encoding)

In [None]:
# One-hot encode the categorical columns. drop_first=True keeps k-1 indicator
# columns for each k-level column, so no indicator is redundant with the
# others once the regression intercept is included.
categorical_cols = ['sex', 'smoker', 'region']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [None]:
# Preview the first five rows of the encoded frame to confirm the new
# indicator columns.
df_encoded.head(n=5)

## Training Model (Linear Regression Model)

In [None]:
# Separate the regression target from the feature matrix: every column
# except the target is used as a predictor.
target_col = 'charges'
X = df_encoded.drop(columns=[target_col])
y = df_encoded[target_col]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Quick size check of the split: shape of the training features and shape of
# the held-out targets.
# NOTE(review): this pairs a *train* shape with a *test* shape — confirm that
# is the intended comparison.
(X_train.shape, y_test.shape)

In [None]:
# Fit a linear regression on the training split. fit() returns the estimator
# itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)

# Predict charges for the held-out rows; used by the evaluation cells below.
y_pred = model.predict(X_test)

## Evaluation

In [None]:
# Error metrics on the held-out split; both are in the same units as `charges`.
mae = mean_absolute_error(y_test, y_pred)
# Use the root_mean_squared_error helper that the notebook already imports
# instead of hand-rolling sqrt(MSE).
rmse = root_mean_squared_error(y_test, y_pred)

# Naive baseline: predict the training-set mean for every test row. Raw error
# numbers are hard to interpret alone; this shows how much the model improves
# on a trivial constant predictor.
baseline_pred = np.full(len(y_test), y_train.mean())
baseline_mae = mean_absolute_error(y_test, baseline_pred)
baseline_rmse = root_mean_squared_error(y_test, baseline_pred)

print("MAE:", mae)
print("RMSE:", rmse)
print("Baseline MAE (predict training mean):", baseline_mae)
print("Baseline RMSE (predict training mean):", baseline_rmse)

In [None]:
# Actual vs. predicted scatter, drawn with the explicit Figure/Axes API so the
# cell does not rely on pyplot's implicit "current figure" state.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(y_test, y_pred, alpha=0.5)
ax.set_title("Actual vs Predicted Claim Amounts")
ax.set_xlabel("Actual Claim Amount")
ax.set_ylabel("Predicted Claim Amount")

# Reference line y = x: a point on it is a perfect prediction. The endpoints
# span both the actual and the predicted values so the diagonal covers every
# plotted point, including predictions outside the range of y_test.
lo = min(y_test.min(), y_pred.min())
hi = max(y_test.max(), y_pred.max())
ax.plot([lo, hi], [lo, hi], color='red')

plt.show()

## Conclusion

The linear regression model learned a positive relationship between the input features and insurance charges. The actual-vs-predicted scatter plot shows an upward trend and reasonable alignment with the red diagonal reference line, meaning the predictions are a significant improvement over the earlier constant-average outputs (roughly the same mean charge predicted for every record). Some spread remains at higher charge values, but the notebook demonstrates a working regression pipeline that captures the overall trend, fulfilling the internship task requirements.