<a href="https://colab.research.google.com/github/Santimalli-Gowthami/RPS-by-Gowthami/blob/main/fcc_predict_health_costs_with_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries. You may or may not use all of these.
!pip install -q git+https://github.com/tensorflow/docs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

In [None]:
# Import data
!wget https://cdn.freecodecamp.org/project-data/health-costs/insurance.csv
dataset = pd.read_csv('insurance.csv')
dataset.tail()

In [None]:
# Health Costs Calculator - Complete Solution
# This cell sits between the data-import cell above and the test cell below: it
# prepares the data, then builds and trains the model that the test cell evaluates.

# ============================================
# STEP 1: DATA EXPLORATION AND PREPROCESSING
# ============================================

# Let's first look at the data
print("Dataset shape:", dataset.shape)
print("\nDataset info:")
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
dataset.info()
print("\nFirst few rows:")
print(dataset.head())
print("\nStatistical summary:")
print(dataset.describe())

# ============================================
# STEP 2: CONVERT CATEGORICAL DATA TO NUMBERS
# ============================================

# Two-valued text columns and the integer code assigned to each value:
#   'sex':    female -> 0, male -> 1
#   'smoker': no -> 0, yes -> 1
binary_encodings = {
    'sex': {'female': 0, 'male': 1},
    'smoker': {'no': 0, 'yes': 1},
}

for column, mapping in binary_encodings.items():
    # Only map columns that still hold text. Series.map() turns every value that
    # is missing from the mapping into NaN, so re-running this cell on the
    # already-encoded 0/1 values would otherwise wipe out the whole column.
    if not pd.api.types.is_numeric_dtype(dataset[column]):
        dataset[column] = dataset[column].map(mapping)

# Fail loudly on an unexpected label rather than feeding NaN into the model.
assert dataset[['sex', 'smoker']].notna().all().all(), \
    "Unexpected or missing value in the 'sex' or 'smoker' column"

# 'region': convert to categorical codes (one-hot encoding would be better but let's keep it simple)
# NOTE: integer codes impose an arbitrary ordering on the regions.
dataset['region'] = dataset['region'].astype('category').cat.codes

print("\nDataset after encoding:")
print(dataset.head())

# ============================================
# STEP 3: SPLIT DATA INTO TRAIN AND TEST SETS
# ============================================

# Shuffle into a separate frame instead of overwriting `dataset`: reshuffling an
# already-shuffled frame with the same seed yields a different row order, so
# reassigning `dataset` would change the train/test split whenever this cell is
# re-run without reloading the data.
shuffled_dataset = dataset.sample(frac=1, random_state=42).reset_index(drop=True)

# Split: 80% train, 20% test
train_size = int(0.8 * len(shuffled_dataset))

# .copy() gives each split its own data so later in-place edits (e.g. popping
# the label column) do not touch the shuffled frame.
train_dataset = shuffled_dataset[:train_size].copy()
test_dataset = shuffled_dataset[train_size:].copy()

print(f"\nTrain dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# ============================================
# STEP 4: SEPARATE LABELS FROM FEATURES
# ============================================

# 'expenses' is the prediction target: pop() removes it from each feature frame
# in place and hands the removed column back as the label Series.
train_labels, test_labels = (
    frame.pop('expenses') for frame in (train_dataset, test_dataset)
)

print(f"\nTrain features shape: {train_dataset.shape}")
print(f"Train labels shape: {train_labels.shape}")

# ============================================
# STEP 5: NORMALIZE THE DATA
# ============================================

# Scaling the inputs matters for neural networks.
# Summary statistics are taken from the training features only; .T flips
# describe()'s output so each feature is a row and each statistic a column.
train_stats = train_dataset.describe().T

def norm(x, stats=None):
    """Standardize the columns of `x` to (value - mean) / std.

    Args:
        x: DataFrame whose column names match the index of `stats`.
        stats: DataFrame indexed by column name that provides 'mean' and 'std'
            columns. Defaults to the module-level `train_stats`.

    Returns:
        A DataFrame with the same shape as `x` holding the standardized values.
        A column whose std is 0 comes back as all zeros instead of NaN.
    """
    if stats is None:
        stats = train_stats
    # A constant column has std == 0, and 0 / 0 would yield NaN. Dividing by 1
    # instead leaves such a column at 0 after the mean is subtracted.
    safe_std = stats['std'].replace(0, 1)
    return (x - stats['mean']) / safe_std

# Scale the train and test features with the same norm() helper
normed_train_data, normed_test_data = map(norm, (train_dataset, test_dataset))

# ============================================
# STEP 6: BUILD THE MODEL
# ============================================

def build_model():
    """Create and compile the regression network.

    The input width is the number of columns in the module-level
    `train_dataset`. Three ReLU hidden layers (64, 64 and 32 units) feed a
    single output unit that has no activation. The model is compiled with MSE
    loss, an RMSprop optimizer (learning rate 0.001) and MAE/MSE metrics.

    Returns:
        The compiled `keras.Sequential` model.
    """
    n_features = len(train_dataset.keys())

    stack = [
        layers.Dense(64, activation='relu', input_shape=[n_features]),
        layers.Dense(64, activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.Dense(1),
    ]
    network = keras.Sequential(stack)

    network.compile(
        optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001),
        loss='mse',
        metrics=['mae', 'mse'],
    )
    return network

# Seed NumPy and TensorFlow before the model is created so that weight
# initialization starts from the same random state on every run; without a
# seed the trained model and its test error differ from run to run.
# NOTE(review): some GPU ops may still be non-deterministic.
np.random.seed(42)
tf.random.set_seed(42)

model = build_model()

# Display model architecture
print("\nModel Summary:")
model.summary()

# ============================================
# STEP 7: TRAIN THE MODEL
# ============================================

# Early stopping to prevent overfitting: training halts once val_loss has not
# improved for `patience` consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss when stopping triggers; without it the model keeps the weights of
# the final epoch, i.e. 50 epochs after validation loss stopped improving.
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=50,
    restore_best_weights=True,
)

# Upper bound on training length; early stopping usually ends training sooner.
EPOCHS = 1000

# Train the model. validation_split holds out 20% of the training data to
# compute the val_* metrics that early stopping monitors. verbose=0 silences
# the per-epoch log; EpochDots is passed as the progress callback instead.
history = model.fit(
    normed_train_data,
    train_labels,
    epochs=EPOCHS,
    validation_split=0.2,
    verbose=0,
    callbacks=[early_stop, tfdocs.modeling.EpochDots()]
)

print("\n\nTraining complete!")

# ============================================
# STEP 8: VISUALIZE TRAINING HISTORY
# ============================================

# Tabulate the recorded training history and append the epoch numbers as a column
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)

print("\nTraining history (last 5 epochs):")
print(hist.tail())

def plot_history(history):
    """Plot training and validation curves for MAE and MSE side by side.

    Args:
        history: object exposing `.history` (a mapping that contains the
            'mae', 'val_mae', 'mse' and 'val_mse' series) and `.epoch`,
            such as the value returned by `model.fit` in this notebook.
    """
    curves = pd.DataFrame(history.history)
    curves['epoch'] = history.epoch

    # (training key, validation key, y-axis label) for the left and right panel
    panels = [
        ('mae', 'val_mae', 'Mean Abs Error [expenses]'),
        ('mse', 'val_mse', 'Mean Square Error [expenses^2]'),
    ]

    plt.figure(figsize=(12, 4))

    for position, (train_key, val_key, y_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.plot(curves['epoch'], curves[train_key], label='Train Error')
        plt.plot(curves['epoch'], curves[val_key], label='Val Error')
        plt.legend()

    plt.tight_layout()
    plt.show()

plot_history(history)

# ============================================
# STEP 9: PREPARE FOR EVALUATION
# ============================================

# The test cell evaluates `test_dataset`, so rebind that name to the normalized
# test features, matching the normalized data the model was fitted on.
test_dataset = normed_test_data

divider = "=" * 50
print("\n" + divider)
print("MODEL READY FOR EVALUATION")
print(divider)
print("\nNow run the test cell (cell 3) to evaluate your model!")

In [None]:
# RUN THIS CELL TO TEST YOUR MODEL. DO NOT MODIFY CONTENTS.
# Test model by checking how well the model generalizes using the test set.
# Expects `model`, `test_dataset` and `test_labels` to be defined by the cells above.
# evaluate() is unpacked into three values: the loss followed by the two
# metrics the model was compiled with ('mae', 'mse').
loss, mae, mse = model.evaluate(test_dataset, test_labels, verbose=2)

print("Testing set Mean Abs Error: {:5.2f} expenses".format(mae))

# Pass/fail threshold for the challenge: test MAE must be below 3500.
if mae < 3500:
  print("You passed the challenge. Great job!")
else:
  print("The Mean Abs Error must be less than 3500. Keep trying.")

# Plot predictions.
test_predictions = model.predict(test_dataset).flatten()

# Scatter of true vs. predicted expenses on equal-aspect axes.
a = plt.axes(aspect='equal')
plt.scatter(test_labels, test_predictions)
plt.xlabel('True values (expenses)')
plt.ylabel('Predictions (expenses)')
# Same fixed range on both axes.
lims = [0, 50000]
plt.xlim(lims)
plt.ylim(lims)
# Plotting lims against lims draws the y = x diagonal; points on it are exact predictions.
_ = plt.plot(lims,lims)
