In [None]:

# Step 1: Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split
import requests


In [None]:

# Step 2: Download the Dataset
def download(url, filename, timeout=30):
    """Fetch ``url`` over HTTP and write the response body to ``filename``.

    Parameters
    ----------
    url : str
        Address of the resource to download.
    filename : str or path-like
        Destination path. It is opened in binary write mode, so an existing
        file at that path is overwritten.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot block the kernel indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with any status other than 200.
    requests.RequestException
        For connection problems or timeouts raised by ``requests.get``.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Fail loudly here instead of silently skipping the write and leaving
        # the caller to trip over a missing file later on.
        response.raise_for_status()  # raises for 4xx / 5xx responses
        raise requests.HTTPError(
            f"Unexpected HTTP status {response.status_code} for {url}",
            response=response,
        )
    with open(filename, 'wb') as f:
        f.write(response.content)

# Local name the CSV is saved under, and the remote location it is fetched from.
file_name = "insurance.csv"
path = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-Coursera/medical_insurance_dataset.csv'
download(url=path, filename=file_name)


In [None]:

# Step 3: Load and Prepare the Data
# The file is read with header=None (first row treated as data), so the column
# names are attached by hand afterwards.
headers = ["age", "gender", "bmi", "no_of_children", "smoker", "region", "charges"]
df = pd.read_csv(file_name, header=None)
df.columns = headers
# Turn every '?' entry into NaN so pandas recognises it as missing.
df = df.replace(to_replace='?', value=np.nan)
df.info()


In [None]:

# Step 4: Handle Missing Values and Update Data Types
# The filled columns are assigned back onto df explicitly. Calling
# `df["col"].replace(..., inplace=True)` acts on a temporary Series; under
# pandas Copy-on-Write it no longer writes through to df, which would leave the
# NaNs in place and make the integer cast below fail.

# smoker: fill gaps with the most frequent value in the column.
is_smoker = df["smoker"].value_counts().idxmax()
df["smoker"] = df["smoker"].fillna(is_smoker)

# age: fill gaps with the column mean (computed on a float view of the column).
mean_age = df["age"].astype("float").mean(axis=0)
df["age"] = df["age"].fillna(mean_age)

# Both columns are stored as integers from here on.
df[["age", "smoker"]] = df[["age", "smoker"]].astype(int)
df.info()


In [None]:

# Step 5: Round Charges and Display the Data
# Keep charges at two decimal places, then preview the first rows.
df[["charges"]] = df[["charges"]].round(decimals=2)
df.head()


In [None]:

# Step 6: Data Visualization - Regression Plot of Charges vs BMI
# Scatter of charges against bmi with the fitted regression line drawn in red.
ax = sns.regplot(data=df, x='bmi', y='charges', line_kws={"color": "red"})
ax.set_title('Regression Plot of Charges vs BMI')
ax.set_ylim(bottom=0)  # y-axis starts at zero; the upper limit stays automatic
plt.show()


In [None]:

# Step 6 (continued): Data Visualization - Box Plot of Charges by Smoker Status
# One box per smoker value, showing the distribution of charges in each group.
ax = sns.boxplot(data=df, x='smoker', y='charges')
ax.set_title('Box Plot of Charges by Smoker Status')
plt.show()


In [None]:

# Step 7: Correlation Matrix
# Pairwise correlations of the columns of df, drawn as an annotated heatmap
# with two-decimal cell labels.
corr_matrix = df.corr()
ax = sns.heatmap(data=corr_matrix, cmap="coolwarm", annot=True, fmt=".2f")
ax.set_title('Correlation Matrix')
plt.show()


In [None]:

# Step 8: Simple Linear Regression (Charges by Smoker)
# Single-predictor model. The R^2 printed below is computed on the same rows
# the model was fitted on.
Y = df.loc[:, ['charges']]
X = df.loc[:, ['smoker']]
lm = LinearRegression().fit(X, Y)
print("R^2 score for charges prediction using smoker:", lm.score(X, Y))


In [None]:

# Step 9: Multiple Linear Regression (Charges by All Attributes)
# Every column except the target is used as a predictor; the existing `lm`
# estimator is refitted on this wider feature set.
Z = df.loc[
    :,
    ["age", "gender", "bmi", "no_of_children", "smoker", "region"],
]
lm.fit(X=Z, y=Y)
print("R^2 score for charges prediction using all attributes:", lm.score(X=Z, y=Y))


In [None]:

# Step 10: Polynomial Features and Pipeline
# Three chained stages: standardise the inputs, expand them into polynomial
# terms (no bias column), then fit an ordinary least-squares model.
Input = [
    ('scale', StandardScaler()),
    ('polynomial', PolynomialFeatures(include_bias=False)),
    ('model', LinearRegression()),
]
pipe = Pipeline(steps=Input)
Z = Z.astype(float)
# fit() returns the pipeline itself, so the in-sample prediction can be chained.
ypipe = pipe.fit(Z, Y).predict(Z)
print("R^2 score for charges prediction using polynomial features:", r2_score(y_true=Y, y_pred=ypipe))


In [None]:

# Step 11: Train-Test Split
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    Z,
    Y,
    test_size=0.2,
    random_state=1,
)


In [None]:

# Step 12: Ridge Regression
# Fit a ridge model (alpha=0.1) on the training rows and score it on the
# held-out rows.
ridge_model = Ridge(alpha=0.1).fit(x_train, y_train)
yhat = ridge_model.predict(x_test)
print("R^2 score for Ridge regression on test data:", r2_score(y_true=y_test, y_pred=yhat))


In [None]:

# Step 13: Polynomial Transformation and Ridge Regression
# The degree-2 expansion is learned from the training rows only and then
# applied unchanged to the test rows.
pr = PolynomialFeatures(degree=2)
x_train_pr = pr.fit(x_train).transform(x_train)
x_test_pr = pr.transform(x_test)
# The existing ridge_model is refitted here on the expanded features.
y_hat = ridge_model.fit(x_train_pr, y_train).predict(x_test_pr)
print("R^2 score for Ridge regression with polynomial features on test data:", r2_score(y_true=y_test, y_pred=y_hat))
