<a href="https://colab.research.google.com/github/Thiru-gv/Medical-Insurance-Price-Prediction/blob/main/Another_copy_of_insurance_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!apt-get install openjdk-11-jdk
!pip install pyspark
!pip install numpy pandas matplotlib ipywidgets



Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libxt-dev libxtst6 libxxf86dga1 openjdk-11-jre
  x11-utils
Suggested packages:
  libxt-doc openjdk-11-demo openjdk-11-source visualvm mesa-utils
The following NEW packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libxt-dev libxtst6 libxxf86dga1 openjdk-11-jdk
  openjdk-11-jre x11-utils
0 upgraded, 10 newly installed, 0 to remove and 35 not upgraded.
Need to get 6,920 kB of archives.
After this operation, 16.9 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 fonts-dejavu-core all 2.37-2build1 [1,041 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 fonts-dejavu-extra all 2.37-2build1 [2,041 kB]
Get:3 http://archive.ubuntu.com/ubuntu jam

In [None]:
import os
import sys
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import functions as F
import ipywidgets as widgets
from IPython.display import display

# Point JAVA_HOME at the OpenJDK 11 install path before the session is created,
# then build the Spark session (getOrCreate reuses an existing one if present).
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
spark = (
    SparkSession.builder
    .appName("Medical Insurance Price Prediction")
    .getOrCreate()
)


In [None]:
from google.colab import files

# Prompt for a CSV upload; the returned mapping is keyed by uploaded filename
uploaded = files.upload()

# If nothing was uploaded (e.g. the dialog was dismissed) the lookup below would
# fail with an uninformative IndexError, so stop here with a clear message.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a CSV file.")

# Only the first uploaded file is read. The first CSV row is treated as the
# header and column types are inferred from the data.
csv_path = next(iter(uploaded))
data = spark.read.csv(csv_path, header=True, inferSchema=True)

# Display the first rows of the DataFrame
data.show()


Saving Medical_insurance.csv to Medical_insurance.csv
+---+------+------+--------+------+---------+-----------+
|age|   sex|   bmi|children|smoker|   region|    charges|
+---+------+------+--------+------+---------+-----------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|
| 33|  male|22.705|       0|    no|northwest|21984.47061|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|
| 31|female| 25.74|       0|    no|southeast|  3756.6216|
| 46|female| 33.44|       1|    no|southeast|  8240.5896|
| 37|female| 27.74|       3|    no|northwest|  7281.5056|
| 37|  male| 29.83|       2|    no|northeast|  6406.4107|
| 60|female| 25.84|       0|    no|northwest|28923.13692|
| 25|  male| 26.22|       0|    no|northeast|  2721.3208|
| 62|female| 26.29|       0|   yes|southeast| 27808.7251|
| 23|  male|  34.4|       0|    no|southwest|   1826.843|
| 56|female| 39.82

In [None]:
# Drop every row that contains at least one null value
data = data.na.drop()

# Encode the two binary categorical columns as 0/1. 'region' is left as-is and
# is not part of the model's feature list.
#
# The comparison is done on a string cast and also accepts "1" so that this cell
# is idempotent: after a first run the columns already hold 0/1, and a re-run
# that compared only against "yes"/"male" would no longer match any row and
# would collapse the whole column to 0.
data = data.withColumn(
    "smoker",
    F.when(F.col("smoker").cast("string").isin("yes", "1"), 1).otherwise(0),
)  # yes=1, anything else=0
data = data.withColumn(
    "sex",
    F.when(F.col("sex").cast("string").isin("male", "1"), 1).otherwise(0),
)  # Male=1, Female=0


In [None]:
# Columns combined into the single vector column consumed by the regression
feature_columns = ["age", "bmi", "children", "smoker", "sex"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Drop any "features" column left over from a previous run of this cell first:
# the assembler refuses to transform a DataFrame that already has its output
# column, so without this a re-run fails. drop() is a no-op when the column is
# absent, so the first run is unaffected.
data = assembler.transform(data.drop("features"))

In [None]:
# Estimator: ordinary linear regression from the assembled vector to 'charges'
lr = LinearRegression(labelCol="charges", featuresCol="features")

# 80/20 train/test partition with a fixed seed
train_data, test_data = data.randomSplit(weights=[0.8, 0.2], seed=1234)

# Train on the 80% partition, then score the held-out 20%
lr_model = lr.fit(train_data)
test_results = lr_model.evaluate(test_data)

# Report held-out error and goodness of fit
print(
    f"RMSE: {test_results.rootMeanSquaredError}",
    f"R2: {test_results.r2}",
    sep="\n",
)

RMSE: 5711.166987546053
R2: 0.7650846342172626


In [None]:
# Input controls, one per model feature
age = widgets.IntSlider(
    description='Age:',
    value=30,
    min=18,
    max=100,
)
bmi = widgets.FloatSlider(
    description='BMI:',
    value=25.0,
    min=10.0,
    max=50.0,
    step=0.1,
)
children = widgets.IntSlider(
    description='Children:',
    value=0,
    min=0,
    max=10,
)
# Dropdown values use the same 0/1 encoding as the training data
smoker = widgets.Dropdown(
    description='Smoker:',
    options=[('Non-Smoker', 0), ('Smoker', 1)],
)
sex = widgets.Dropdown(
    description='Sex:',
    options=[('Male', 1), ('Female', 0)],  # Male=1, Female=0
)

# Button that triggers a prediction when clicked
predict_button = widgets.Button(description='Predict Charges')

# Prediction callback for the button
def predict_charges(b):
    """Score the current widget selections and print the predicted charges.

    ``b`` is the argument passed by the button's click handler; it is not used.
    The inputs are read from the module-level widgets at call time.
    """
    # Map each feature column name to the widget that supplies its value
    widget_by_column = {
        'age': age,
        'bmi': bmi,
        'children': children,
        'smoker': smoker,
        'sex': sex,
    }
    # Single-row pandas frame holding the current selections
    single_row = pd.DataFrame(
        {column: [widget.value] for column, widget in widget_by_column.items()}
    )

    # Convert to Spark, build the feature vector, then apply the fitted model
    assembled = assembler.transform(spark.createDataFrame(single_row))
    scored = lr_model.transform(assembled)

    # The frame has exactly one row; take its prediction value
    prediction_value = scored.select("prediction").first()[0]

    print(f"Predicted Charges: {prediction_value:.2f}")

# Run predict_charges on every click of the button
predict_button.on_click(predict_charges)

# Render the input controls in order, followed by the button
controls = [age, bmi, children, smoker, sex, predict_button]
display(*controls)

IntSlider(value=30, description='Age:', min=18)

FloatSlider(value=25.0, description='BMI:', max=50.0, min=10.0)

IntSlider(value=0, description='Children:', max=10)

Dropdown(description='Smoker:', options=(('Non-Smoker', 0), ('Smoker', 1)), value=0)

Dropdown(description='Sex:', options=(('Male', 1), ('Female', 0)), value=1)

Button(description='Predict Charges', style=ButtonStyle())

Predicted Charges: 6730.00
Predicted Charges: 6730.00
Predicted Charges: 6737.24
