In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [2]:
# Load the insurance dataset into a DataFrame.
# NOTE(review): the path is an absolute Colab location (/content), so the
# notebook cannot run elsewhere without editing this line.
df= pd.read_csv('/content/insurance.csv')

In [3]:
# Show five randomly chosen rows; no random_state is given, so the rows differ per run.
df.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
52,18,116.7,1.57,2.96,False,Jalandhar,student,Medium
61,32,102.4,1.68,24.05,True,Chandigarh,unemployed,High
87,30,82.0,1.6,25.59837,False,Hyderabad,government_job,Low
86,35,66.0,1.89,37.38,False,Hyderabad,freelancer,Low
84,75,86.2,1.73,0.62,True,Jaipur,retired,High


In [4]:
# Work on a copy so the engineered columns added to df_feat do not modify df.
df_feat= df.copy()

In [5]:
# Body-mass index: weight divided by the square of height.
df_feat["bmi"] = df_feat["weight"].div(df_feat["height"].pow(2))

In [6]:
def age_group(age):
    """Bucket an age into one of four named bands.

    Each band's upper bound is exclusive: below 25 is "young", from 25 up to
    45 is "adult", from 45 up to 60 is "middle_aged", and anything that is
    not below 60 is "senior".
    """
    # (exclusive upper bound, label) pairs, checked in ascending order.
    bands = ((25, "young"), (45, "adult"), (60, "middle_aged"))
    for upper_bound, label in bands:
        if age < upper_bound:
            return label
    return "senior"


In [7]:
# Label every row with the band that age_group assigns to its age.
df_feat["age_group"] = df_feat["age"].map(age_group)

In [8]:
def lifestyle_risk(row):
    """Classify a record as "high", "medium" or "low" lifestyle risk.

    ``row`` is any mapping-like object exposing "smoker" and "bmi" keys.
    A smoker with BMI above 30 is "high"; otherwise a smoker, or anyone with
    BMI above 27, is "medium"; everyone else is "low".
    """
    is_smoker = row["smoker"]
    bmi = row["bmi"]

    if is_smoker and bmi > 30:
        return "high"
    if is_smoker or bmi > 27:
        return "medium"
    return "low"

In [9]:
# axis=1 hands each row to lifestyle_risk, which reads that row's "smoker" and "bmi" values.
df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk, axis=1)

In [10]:
# Cities that city_tier() maps to tier 1.
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
# Cities that city_tier() maps to tier 2; a city in neither list is treated as tier 3.
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]

In [11]:
def city_tier(city):
    """Return the tier number (1, 2 or 3) for a city name.

    The name is matched exactly against ``tier_1_cities`` and then
    ``tier_2_cities``; a name found in neither list is tier 3.
    """
    if city in tier_1_cities:
        return 1
    return 2 if city in tier_2_cities else 3

In [12]:
# Replace each city name with its tier number as decided by city_tier.
df_feat["city_tier"] = df_feat["city"].map(city_tier)

In [13]:
# Preview five random rows of the model-ready columns. Selecting the columns by
# name already leaves out the raw age/weight/height/smoker/city columns, so a
# separate drop() beforehand is unnecessary.
preview_columns = ['income_lpa', 'occupation', 'bmi', 'age_group', 'lifestyle_risk', 'city_tier', 'insurance_premium_category']
df_feat[preview_columns].sample(5)

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
23,23.71,unemployed,22.187855,adult,low,2,Medium
71,20.25,unemployed,16.513537,adult,low,2,Low
22,30.0,government_job,31.771627,middle_aged,medium,2,Low
62,35.67,business_owner,21.738481,adult,low,1,Low
11,10.899387,government_job,31.722551,adult,high,1,Low


In [14]:
# Separate the model inputs (X) from the label to predict (y).
feature_columns = ["bmi", "age_group", "lifestyle_risk", "city_tier", "income_lpa", "occupation"]
X = df_feat[feature_columns]
y = df_feat["insurance_premium_category"]

In [15]:
# Display the feature matrix.
X

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,49.227482,senior,medium,2,2.92000,retired
1,30.189017,adult,medium,1,34.28000,freelancer
2,21.118382,adult,low,2,36.64000,freelancer
3,45.535900,young,high,1,3.34000,student
4,24.296875,senior,medium,2,3.94000,retired
...,...,...,...,...,...,...
95,21.420747,adult,low,2,19.64000,business_owner
96,47.984483,adult,medium,1,34.01000,private_job
97,18.765432,middle_aged,low,1,44.86000,freelancer
98,30.521676,adult,medium,1,28.30000,business_owner


In [16]:
# Display the target labels.
y

Unnamed: 0,insurance_premium_category
0,High
1,Low
2,Low
3,Medium
4,High
...,...
95,Low
96,Low
97,Low
98,Low


In [17]:
# Columns the preprocessor one-hot encodes. city_tier holds integer tiers but is
# listed here, so it is encoded as a category rather than used as a number.
categorical_features = ["age_group", "lifestyle_risk", "occupation", "city_tier"]
# Columns the preprocessor passes through unchanged.
numeric_features = ["bmi", "income_lpa"]

In [18]:
# Column-wise preprocessing: one-hot encode the categorical columns and pass the
# numeric columns through unchanged.
# handle_unknown="ignore" makes a category that was absent when the encoder was
# fitted encode as all zeros instead of raising at transform time. The pipeline
# is fitted on a training split only, yet it is later asked to predict for
# held-out rows and for values chosen from dropdowns built from the full data.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)

In [19]:
# Chain preprocessing and the classifier so fit/predict take the feature frame directly.
# The forest's random_state is fixed so repeated runs build the same model.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42)),
]
pipeline = Pipeline(steps=pipeline_steps)

In [20]:
# Hold out 20% of the rows for evaluation; the split seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
# fit() is the cell's last expression, so its return value is shown as the output.
pipeline.fit(X_train, y_train)

In [21]:
# Predict the held-out rows and report the fraction of labels predicted correctly.
y_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9

In [25]:
pip install gradio



In [26]:
import gradio as gr

In [30]:
def predict_premium_category(age, weight, height, income_lpa, smoker, city, occupation):
    """Predict the insurance premium category for one applicant.

    Derives the engineered features the pipeline was trained on (BMI, age
    group, lifestyle risk, city tier) from the raw values, builds a one-row
    DataFrame and returns the pipeline's prediction for it.

    Args:
        age: Age, passed to ``age_group``.
        weight: Weight; the form labels this field in kilograms.
        height: Height; the form labels this field in metres.
        income_lpa: Annual income, forwarded to the model unchanged.
        smoker: Truthy if the applicant smokes.
        city: City name, passed to ``city_tier``.
        occupation: Occupation label, forwarded to the model unchanged.

    Returns:
        The first (and only) element of ``pipeline.predict`` for the input row.

    Raises:
        ValueError: If a required value is ``None``, or if weight or height is
            not greater than zero.
    """
    # A field the user left empty arrives as None; fail with a clear message
    # instead of an opaque TypeError from the arithmetic below.
    required_fields = {
        "age": age,
        "weight": weight,
        "height": height,
        "income_lpa": income_lpa,
        "city": city,
        "occupation": occupation,
    }
    missing = [name for name, value in required_fields.items() if value is None]
    if missing:
        raise ValueError("Missing required value(s): " + ", ".join(missing))

    # Zero height would divide by zero; negative sizes would give a meaningless BMI.
    if weight <= 0 or height <= 0:
        raise ValueError("weight and height must be greater than zero")

    bmi = weight / (height ** 2)

    # lifestyle_risk only indexes by key, so a plain dict is sufficient.
    input_df = pd.DataFrame([
        {
            "bmi": bmi,
            "age_group": age_group(age),
            "lifestyle_risk": lifestyle_risk({"smoker": smoker, "bmi": bmi}),
            "city_tier": city_tier(city),
            "income_lpa": income_lpa,
            "occupation": occupation,
        }
    ])

    return pipeline.predict(input_df)[0]

In [31]:
# Dropdown options are taken from the values present in the loaded data.
city_choices = df['city'].unique().tolist()
occupation_choices = X['occupation'].unique().tolist()

# Form controls, in the same order as predict_premium_category's parameters.
inputs = [
    gr.Number(label="Age"),
    gr.Number(label="Weight (kg)"),
    gr.Number(label="Height (m)"),
    gr.Number(label="Annual Income (LPA)"),
    gr.Checkbox(label="Are you a smoker?"),
    gr.Dropdown(choices=city_choices, label="City"),
    gr.Dropdown(choices=occupation_choices, label="Occupation"),
]

# Text box that shows the predicted label.
outputs = gr.Textbox(label="Predicted Insurance Premium Category")

# Wire the prediction function to the form.
iface = gr.Interface(
    fn=predict_premium_category,
    inputs=inputs,
    outputs=outputs,
    title="Insurance Premium Category Predictor",
    description="Enter your details below:",
)

In [32]:
# Start the Gradio app. In the recorded run Gradio detected a hosted notebook and
# set share=True automatically, producing a public URL.
iface.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://aa53aa15ed3618c88f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [23]:
import pickle

# Persist the pipeline object to disk with pickle.
# NOTE: only unpickle this file from a trusted source; pickle.load can run arbitrary code.
pickle_model_path = "model.pkl"
with open(pickle_model_path, mode="wb") as model_file:
    pickle.dump(pipeline, model_file)