In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('/kaggle/input/insurance/insurance.csv')

In [None]:
df.info() #to check if there's null values

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
# Distribution of the target column: histogram of `charges` with a KDE overlay.
charges_ax = sns.histplot(df['charges'], kde=True)
charges_ax.set_title("Distribution of charges")  # the figure should stand alone when skimmed
plt.show()  # render the figure instead of echoing the Axes repr

In [None]:
# Grid of pairwise plots over df's columns, for a quick look at how they relate.
sns.pairplot(df)
plt.show()

In [None]:
# One-Hot Encoding for Categorical Variables
# drop_first=True drops one dummy level per encoded variable.
Cate_cols = ['sex', 'smoker', 'region']

# This cell rebinds `df`, so on a second run the categorical columns are
# already gone and get_dummies would raise KeyError. Encode only the columns
# that are still present, which makes the cell safe to re-run.
cols_to_encode = [col for col in Cate_cols if col in df.columns]
if cols_to_encode:
    df = pd.get_dummies(df, columns=cols_to_encode, drop_first=True)

In [None]:
#defining features and targets 
X = df.drop('charges',axis=1)

In [None]:
y=df['charges']

In [None]:
X.head()

In [None]:
y.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=101)
X_train, X_test, y_train, y_test = split_parts

In [None]:
#Scaling features
from sklearn.preprocessing import StandardScaler

In [None]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions (the right-hand side is evaluated
# before X_train is rebound, so both calls see the unscaled data).
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# **Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the scaled training features. `fit` returns the
# estimator itself, so ending the cell with `model` shows the same output.
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
y_pred= model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE on the held-out rows: square root of the mean squared error.
mse_linear = mean_squared_error(y_test, y_pred)
rmse_linear = np.sqrt(mse_linear)

In [None]:
rmse_linear

#### Calculate and plot residuals

In [None]:
# Residual = actual charge minus the linear model's prediction.
test_residuals = y_test - y_pred

# Residuals against the actual values, with a dashed reference line at zero.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=test_residuals, color='purple', alpha=0.6, ax=ax)
ax.axhline(y=0, color='red', linestyle='--', linewidth=2)

ax.set_title("Residual Plot")
ax.set_xlabel("Actual Charges")
ax.set_ylabel("Residuals (Actual - Predicted)")
ax.grid(True)
plt.show()

#### Conclusion
*The residual plot shows that our linear regression model is not predicting well for people with very high medical charges. The difference between the actual and predicted values becomes bigger as the charges increase. This means the model is not able to capture some patterns in the data, especially for higher values. So, a more advanced model like polynomial regression or decision trees might work better.*

# **Polynomial Regression**

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

### Checking best degree 

In [None]:
X.head()

In [None]:
# NOTE: X is the raw (unscaled) feature frame here; only X_train / X_test
# were standardised earlier, X itself was not.
# Accumulators for the per-degree train / test RMSE, filled by the loop cell.
train_rmse_error = []
test_rmse_error = []

In [None]:
from sklearn.preprocessing import StandardScaler

# Start from empty lists every time this cell runs, so a re-run never appends
# a second set of scores to the lists that are plotted and tabulated later.
train_rmse_error = []
test_rmse_error = []

# The split does not depend on the degree, so it is done once. The polynomial
# expansion works row by row, so splitting before expanding selects the same
# rows as expanding first. Dedicated names keep this loop from overwriting the
# X_train / X_test / y_train / y_test and `model` of the linear section.
poly_X_train, poly_X_test, poly_y_train, poly_y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

for d in range(1, 10):
    polynomial_converter = PolynomialFeatures(degree=d, include_bias=False)
    poly_train = polynomial_converter.fit_transform(poly_X_train)
    poly_test = polynomial_converter.transform(poly_X_test)

    # X holds the raw features, so powers up to degree 9 span many orders of
    # magnitude. Standardise the expanded features before fitting, using
    # statistics from the training rows only.
    poly_scaler = StandardScaler()
    poly_train = poly_scaler.fit_transform(poly_train)
    poly_test = poly_scaler.transform(poly_test)

    poly_model = LinearRegression()
    poly_model.fit(poly_train, poly_y_train)

    train_predi = poly_model.predict(poly_train)
    test_predi = poly_model.predict(poly_test)

    # One RMSE per degree; position i in each list corresponds to degree i + 1.
    train_rmse = np.sqrt(mean_squared_error(poly_y_train, train_predi))
    test_rmse = np.sqrt(mean_squared_error(poly_y_test, test_predi))

    train_rmse_error.append(train_rmse)
    test_rmse_error.append(test_rmse)

### Visualize

In [None]:
# The first stored score belongs to degree 1, so pass the x positions
# explicitly; plotting the lists alone would place the points at 0, 1, 2, ...
# and shift every degree by one on the axis.
degrees = range(1, len(train_rmse_error) + 1)

plt.plot(degrees, train_rmse_error, label='TRAIN RMSE')
plt.plot(degrees, test_rmse_error, label='TEST RMSE')

plt.ylabel('RMSE')
plt.xlabel('Degree of Poly')
plt.legend()      # show a box with the two labels above
plt.show()

In [None]:
test_rmse_error

In [None]:
train_rmse_error

In [None]:
# Collect the per-degree scores column by column, then build the table.
rmse_columns = {
    'Degree': [*range(1, 10)],
    'Train_RMSE': train_rmse_error,
    'Test_RMSE': test_rmse_error,
}
rmse_df = pd.DataFrame(rmse_columns)

# Show the table as the cell output
rmse_df

* Degree 2 provides the best fit.
* It achieves a low test RMSE without overfitting, unlike higher degrees which reduce training error but significantly increase testing error, indicating overfitting.

#### **Conclusion:** *Polynomial Regression (Degree 2) performed better than Linear Regression (Degree 1) by achieving a lower RMSE on the test set.*

# K-Nearest Neighbors (KNN) 

In [None]:
df.head()

In [None]:
# Defining features and target: every column except `charges` is a feature.
X = df.drop('charges',axis=1)

In [None]:
y=df['charges']

In [None]:
X.head()

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 train/test partition with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
#Scaling features
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise with statistics learned from the training rows only; the raw
# X_train / X_test are left untouched for the pipeline used further down.
scaler = StandardScaler()
scaler.fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

### Choosing the best K
* KNeighborsClassifier → for classification tasks
* KNeighborsRegressor → for regression tasks

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import numpy as np


def _knn_test_rmse(k):
    """Fit a k-neighbour regressor on the scaled training rows and return its test RMSE."""
    fitted = KNeighborsRegressor(n_neighbors=k).fit(scaled_X_train, y_train)
    return np.sqrt(mean_squared_error(y_test, fitted.predict(scaled_X_test)))


# Test RMSE for k = 1 .. 29, stored in that order.
test_rmse_errors = [_knn_test_rmse(k) for k in range(1, 30)]

### Plot Error vs. K

In [None]:
# One x position per stored score, starting at k = 1, so the axis always
# matches however many k values were evaluated.
k_range = range(1, len(test_rmse_errors) + 1)

plt.figure(figsize=(10,6),dpi=200)
plt.plot(k_range, test_rmse_errors, label='Test Error')
plt.legend()
plt.ylabel('RMSE')  # the plotted values are RMSEs, not a classification error rate
plt.xlabel("K Value")
plt.show()

In [None]:
test_rmse_errors

Once you have a good idea of which range of k works best (say, 5–10), you can:
* Automate finding the best k.
* Use cross-validation for more reliable evaluation.
* Combine scaling + modeling in one unit (Pipeline).

### Pipeline + GridSearchCV for KNN Regression

In [None]:
from sklearn.pipeline import Pipeline

# Two named steps: standardise the features, then fit the k-NN regressor.
# The step names are the prefixes used in the grid-search parameter keys.
scaler, knn = StandardScaler(), KNeighborsRegressor()
operations = [('scaler', scaler), ('knn', knn)]
pipe = Pipeline(steps=operations)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate neighbour counts 1..19 for the pipeline's 'knn' step.
k_values = [*range(1, 20)]
param_grid = dict(knn__n_neighbors=k_values)

# 5-fold search scored by negated RMSE, fitted on the unscaled training rows
# (the pipeline does its own scaling).
grid_model = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
grid_model.fit(X_train, y_train)

### Check the Best Parameters Found by GridSearch

In [None]:
print("Best k value:", grid_model.best_params_)

### View the Best Score (lowest RMSE)

In [None]:
print("Best CV RMSE:", -grid_model.best_score_)

### Make Predictions on Test Set

In [None]:
y_pred = grid_model.predict(X_test)

### Evaluate Model Performance on Test Set

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Held-out metrics for the grid-searched k-NN pipeline.
knn_mse = mean_squared_error(y_test, y_pred)
knn_rmse = np.sqrt(knn_mse)
knn_r2 = r2_score(y_test, y_pred)

for metric_label, metric_value in (("Test RMSE:", knn_rmse), ("Test R² Score:", knn_r2)):
    print(metric_label, metric_value)

### Calculate and plot Residuals

In [None]:
# Residual = actual value minus the k-NN pipeline's prediction.
residuals = y_test - y_pred

# Residuals against the actual values, with a dashed reference line at zero.
fig, ax = plt.subplots(figsize=(10, 6), dpi=100)
ax.scatter(y_test, residuals, color='purple', alpha=0.6)
ax.axhline(y=0, color='red', linestyle='--')
ax.set_xlabel("Actual Values")
ax.set_ylabel("Residuals (Actual - Predicted)")
ax.set_title("Residuals vs Actual Values")
ax.grid(True)
plt.show()

#### **Conclusion**: The model predicts low values well but performs poorly on high values: the larger the actual value, the larger the prediction error tends to be.

# **Support Vector Regression (SVR)**

In [None]:
df.head()

In [None]:
# Defining features and target: `charges` is the target, all other columns are features.
X = df.drop('charges',axis=1)
y=df['charges']

In [None]:
X.head()

In [None]:
# Train / test split: 30% of the rows held out, fixed seed for repeatability.
from sklearn.model_selection import train_test_split

svr_split = train_test_split(X, y, test_size=0.3, random_state=101)
X_train, X_test, y_train, y_test = svr_split

### Pipeline SetUp

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline

# Scaling followed by SVR; the step name 'svr' is the prefix used by the
# grid-search parameter keys.
svr_steps = [('scaler', StandardScaler()), ('svr', SVR())]
pipe = Pipeline(steps=svr_steps)

We use a Pipeline to combine scaling and modeling steps.
It helps in:

* Keeping code clean and organized
* Preventing data leakage during scaling
* Ensuring proper preprocessing during cross-validation
* Making GridSearchCV easier to apply

### GridSearchCV with RMSE Scoring
We use `GridSearchCV` to try different combinations of hyperparameters.  
It applies **5-fold cross-validation** and selects the best performing model based on **Root Mean Squared Error (RMSE)**.

In a Pipeline, you must use the format **'stepname__parameter'** when specifying parameters in param_grid for GridSearchCV.
The prefix **svr__** comes from the name you gave to that step in the pipeline ('svr'), followed by __ and the parameter name.

In [None]:
from sklearn.model_selection import GridSearchCV

# Values tried for every kernel.
svr_C_values = [0.001, 0.01, 0.1, 0.5, 1]
svr_epsilon_values = [0, 0.01, 0.1, 0.5, 1, 2]
# gamma is only used by the 'rbf' and 'poly' kernels, degree only by 'poly'.
svr_gamma_values = ['scale', 'auto']
svr_degree_values = [2, 3, 4]

# One sub-grid per kernel. A single dict would cross gamma and degree with
# kernels that ignore them and refit identical models (540 candidates instead
# of the 270 distinct ones searched here).
# NOTE(review): `charges` is passed to SVR unscaled and C is capped at 1, which
# may regularise the fit heavily - consider a wider C range; confirm against
# the cross-validation scores.
param_grid = [
    {
        'svr__kernel': ['linear'],
        'svr__C': svr_C_values,
        'svr__epsilon': svr_epsilon_values,
    },
    {
        'svr__kernel': ['rbf'],
        'svr__C': svr_C_values,
        'svr__gamma': svr_gamma_values,
        'svr__epsilon': svr_epsilon_values,
    },
    {
        'svr__kernel': ['poly'],
        'svr__C': svr_C_values,
        'svr__gamma': svr_gamma_values,
        'svr__degree': svr_degree_values,
        'svr__epsilon': svr_epsilon_values,
    },
]

# 5-fold cross-validated search scored by negated RMSE.
grid = GridSearchCV(pipe, param_grid, scoring='neg_root_mean_squared_error', cv=5)
grid.fit(X_train, y_train)

### What `cv=5` means:
* The training data is split into 5 equal parts (folds).
* The model is trained on 4 parts and tested on the remaining 1 part.
* This process is repeated 5 times, each time with a different test fold.
* The final performance score is the average across all 5 runs.

### Why `cv=5`?
- Performs 5-fold cross-validation for more **reliable model evaluation**.
- Helps reduce overfitting and gives better hyperparameter tuning.
- Not mandatory, but **highly recommended**.

### scoring='neg_root_mean_squared_error'
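Scikit-learn’s GridSearchCV tries to maximize the scoring value to choose the best model.
However, lower RMSE (Root Mean Squared Error) actually means better performance.
The `neg_root_mean_squared_error` scorer therefore reports the RMSE multiplied by -1, so the candidate with the highest (least negative) score is the one with the lowest RMSE.
We are still minimizing RMSE, just expressed in a way that works with scikit-learn's logic.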

### Evaluate Best Model

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Best Estimator: the pipeline the grid search selected, used here to predict
# the held-out rows. X_test is passed as-is; scaling is a step of the pipeline.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

In [None]:
# Held-out error of the selected SVR pipeline: MAE, and RMSE as the square
# root of the mean squared error.
mae_svr = mean_absolute_error(y_test, y_pred)
mse_svr = mean_squared_error(y_test, y_pred)
rmse_svr = np.sqrt(mse_svr)

svr_report = (
    ("Best Parameters:", grid.best_params_),
    ("MAE:", mae_svr),
    ("RMSE:", rmse_svr),
)
for report_label, report_value in svr_report:
    print(report_label, report_value)

#### What we found:
* The linear kernel performed best among the tested ones (linear, rbf, poly).
* The model's predictions deviate by about ₹11.8k on average. Given the mean charge (~₹13.2k), the error is moderate. SVR performs decently but there’s room for improvement.

### Calculate and Plot residuals

In [None]:
# Residual = actual value minus the SVR pipeline's prediction.
residuals = y_test - y_pred

# Same residual view as before: scatter against the actual values with a
# dashed zero line for reference.
fig, ax = plt.subplots(figsize=(10, 6), dpi=100)
ax.scatter(y_test, residuals, color='purple', alpha=0.6)
ax.axhline(y=0, color='red', linestyle='--')
ax.set(
    xlabel="Actual Values",
    ylabel="Residuals (Actual - Predicted)",
    title="Residuals vs Actual Values",
)
ax.grid(True)
plt.show()

#### **Conclusion:**
The residuals are *not randomly scattered around zero* — instead, they form a clear linear pattern.
This indicates:
* The SVR model is consistently underpredicting for high actual values.
* There may be non-linear relationships in the data that the current SVR model (likely with a linear kernel) isn't capturing effectively.

A **good model should show no pattern in residuals** (random scatter), which suggests constant error variance and that the model fits the data well.