Import Dataset

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 


In [3]:
# Location of the raw policy CSV. This is an absolute path on the author's
# machine; edit DATA_PATH to point at your own copy before running.
DATA_PATH = r"F:\ugp\home_insurance_premiums.csv"
df = pd.read_csv(DATA_PATH)

In [4]:
# Display the loaded frame; the notebook shows only the first and last rows.
df

Unnamed: 0,policy_id,home_value,home_age,num_floors,area_sqft,location_risk_zone,has_security_system,construction_type,prior_claims,deductible_amount,premium
0,1,171958,5,1,4830,Low,0,Wood,1,4215,5961.39
1,2,721155,94,2,1868,Low,1,Wood,1,8825,3392.41
2,3,181932,94,2,3081,High,0,Brick,0,1390,6962.26
3,4,415838,57,1,4368,Medium,1,Concrete,0,4037,6509.29
4,5,309178,17,2,4037,Low,1,Wood,3,1232,6269.64
...,...,...,...,...,...,...,...,...,...,...,...
995,996,362066,91,3,2046,Low,0,Concrete,0,3205,2751.24
996,997,668362,64,3,4063,Medium,0,Wood,2,4955,9331.23
997,998,737409,17,3,2105,Low,1,Brick,1,7805,3018.38
998,999,611419,12,2,4252,Low,1,Concrete,1,1404,7101.16


Step 1: Read data

In [5]:
# Print each column's non-null count and dtype, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   policy_id            1000 non-null   int64  
 1   home_value           1000 non-null   int64  
 2   home_age             1000 non-null   int64  
 3   num_floors           1000 non-null   int64  
 4   area_sqft            1000 non-null   int64  
 5   location_risk_zone   1000 non-null   object 
 6   has_security_system  1000 non-null   int64  
 7   construction_type    1000 non-null   object 
 8   prior_claims         1000 non-null   int64  
 9   deductible_amount    1000 non-null   int64  
 10  premium              1000 non-null   float64
dtypes: float64(1), int64(8), object(2)
memory usage: 86.1+ KB


In [6]:
# (number of rows, number of columns)
df.shape

(1000, 11)

In [7]:
# Peek at the first five rows to sanity-check the parsed values.
df.head()


Unnamed: 0,policy_id,home_value,home_age,num_floors,area_sqft,location_risk_zone,has_security_system,construction_type,prior_claims,deductible_amount,premium
0,1,171958,5,1,4830,Low,0,Wood,1,4215,5961.39
1,2,721155,94,2,1868,Low,1,Wood,1,8825,3392.41
2,3,181932,94,2,3081,High,0,Brick,0,1390,6962.26
3,4,415838,57,1,4368,Medium,1,Concrete,0,4037,6509.29
4,5,309178,17,2,4037,Low,1,Wood,3,1232,6269.64


In [8]:
# List the column labels exactly as they were read from the file.
df.columns

Index(['policy_id', 'home_value', 'home_age', 'num_floors', 'area_sqft',
       'location_risk_zone', 'has_security_system', 'construction_type',
       'prior_claims', 'deductible_amount', 'premium'],
      dtype='object')

Step 2: Understand column types

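Numeric variables (such as home_value and area_sqft) are measured on a numeric scale, so their relationship with the premium can be quantified directly.
Categorical variables (such as construction_type and location_risk_zone) take one of a small set of labels, so their effect is seen by comparing premiums across groups.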

In [9]:
# Split the column labels by storage dtype: 64-bit integer/float columns are
# treated as numeric, object (string) columns as categorical.
# Iterating a DataFrame yields its column labels, so list(...) gives the names.
# NOTE(review): policy_id ends up in the numeric list although it looks like a
# row identifier rather than a measurement - confirm whether it should be kept.
numeric_cols = list(df.select_dtypes(include=["int64", "float64"]))
categorical_cols = list(df.select_dtypes(include=["object"]))

print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)


Numeric columns: ['policy_id', 'home_value', 'home_age', 'num_floors', 'area_sqft', 'has_security_system', 'prior_claims', 'deductible_amount', 'premium']
Categorical columns: ['location_risk_zone', 'construction_type']


Step 4: Clean the data

In [11]:
# Count fully identical rows, drop them, then count again to confirm that
# none remain.
dupes_before = df.duplicated().sum()
print("Duplicates before:", dupes_before)
df = df.drop_duplicates()
dupes_after = df.duplicated().sum()
print("Duplicates after:", dupes_after)

Duplicates before: 0
Duplicates after: 0


In [13]:
# Missing values per column, smallest count first (ascending is the default order).
df.isna().sum().sort_values()

policy_id              0
home_value             0
home_age               0
num_floors             0
area_sqft              0
location_risk_zone     0
has_security_system    0
construction_type      0
prior_claims           0
deductible_amount      0
premium                0
dtype: int64

In [14]:
# Normalise the text labels of both categorical columns the same way: trim
# surrounding whitespace and convert to title case, so that variants such as
# "wood", "WOOD" or " Wood " all collapse to the single label "Wood".
# A hand-written replacement map only catches the spellings someone thought to
# list; case normalisation covers every casing variant of every label.
df["location_risk_zone"] = df["location_risk_zone"].str.strip().str.title()
df["construction_type"] = df["construction_type"].str.strip().str.title()


Step 5: Understand distributions

In [15]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for every
# column listed in numeric_cols.
print("\nSummary statistics for numeric columns:")
numeric_summary = df.loc[:, numeric_cols].describe()
print(numeric_summary)



Summary statistics for numeric columns:
         policy_id     home_value     home_age   num_floors    area_sqft  \
count  1000.000000    1000.000000  1000.000000  1000.000000  1000.000000   
mean    500.500000  505358.874000    49.899000     1.966000  2747.778000   
std     288.819436  274092.875806    29.017147     0.804669  1242.815311   
min       1.000000   50404.000000     1.000000     1.000000   506.000000   
25%     250.750000  257162.000000    25.000000     1.000000  1683.250000   
50%     500.500000  501142.000000    50.000000     2.000000  2723.000000   
75%     750.250000  742459.250000    74.000000     3.000000  3832.250000   
max    1000.000000  999597.000000    99.000000     3.000000  4999.000000   

       has_security_system  prior_claims  deductible_amount       premium  
count          1000.000000   1000.000000        1000.000000   1000.000000  
mean              0.564000      1.058000        5475.684000   5661.350740  
std               0.496135      1.014724      

In [16]:
# Summary of the categorical columns: non-null count, number of distinct
# labels, the most frequent label and how often it occurs.
# The heading names the categorical columns so it matches the table printed below it.
print("\nSummary statistics for categorical columns:")
print(df[categorical_cols].describe())



Summary statistics for numeric columns:
       location_risk_zone construction_type
count                1000              1000
unique                  3                 3
top                   Low          Concrete
freq                  477               511
