In [29]:
import pandas as pd
import numpy as np

data = pd.read_csv('Data/clean_insurance_data.csv')
print(data.head())

   claim_id policy_start_date  claim_date       city policy_type  \
0  CL211801        2023-01-12  2024-06-24      Delhi      Travel   
1  CL196074        2024-10-01  2025-09-20  Ahmedabad      Travel   
2  CL280133        2023-08-22  2025-08-12  Ahmedabad      Travel   
3  CL019432        2024-09-17  2024-10-01    Kolkata        Home   
4  CL232246        2024-02-19  2024-10-01    Chennai        Home   

  sales_channel vehicle_type  customer_age  annual_premium  claim_amount  \
0       Partner      Unknown          36.0         17228.0       53497.0   
1         Agent      Unknown          47.0          8940.0       41607.0   
2           App      Unknown          18.0         10861.0       58176.0   
3         Agent      Unknown          36.0          3091.0       43317.0   
4         Agent      Unknown          38.0          6660.0       79835.0   

   past_claims_count  days_since_policy_start  documents_submitted  is_fraud  
0                0.0                    239.0          

In [30]:

print("Data Info")
print(data.info())

print()

print("Data Description")
print(data.describe())

print()

print("Data Shape")
print(data.shape)

print()

Data Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   claim_id                 2500 non-null   object 
 1   policy_start_date        2500 non-null   object 
 2   claim_date               2500 non-null   object 
 3   city                     2500 non-null   object 
 4   policy_type              2500 non-null   object 
 5   sales_channel            2500 non-null   object 
 6   vehicle_type             2500 non-null   object 
 7   customer_age             2500 non-null   float64
 8   annual_premium           2500 non-null   float64
 9   claim_amount             2500 non-null   float64
 10  past_claims_count        2500 non-null   float64
 11  days_since_policy_start  2500 non-null   float64
 12  documents_submitted      2500 non-null   float64
 13  is_fraud                 2500 non-null   int64  
dtypes: float64(6),

In [31]:
data['policy_start_date'] = pd.to_datetime(
    data['policy_start_date'],
    errors= 'coerce'
)

data['claim_date'] = pd.to_datetime(
    data['claim_date'],
    errors= 'coerce'
)

In [32]:
data['policy_tenure_days'] = (data['claim_date'] - data['policy_start_date']).dt.days


In [33]:
data["policy_tenure_days"] = data["policy_tenure_days"].astype(float)

In [34]:
print(data['policy_tenure_days'].describe())

count    2500.000000
mean      259.128800
std       191.291044
min         0.000000
25%       107.000000
50%       234.500000
75%       379.000000
max      1059.000000
Name: policy_tenure_days, dtype: float64


In [35]:
print(data['policy_tenure_days'].isnull().sum())

0


In [36]:
data["claim_ratio"] = np.where(
    data["annual_premium"] == 0,
    np.nan,
    data["claim_amount"] / data["annual_premium"]
)

In [37]:
print(data['claim_ratio'])

0        3.105236
1        4.654027
2        5.356413
3       14.013911
4       11.987237
          ...    
2495     5.646752
2496    19.822125
2497     1.781437
2498     9.780300
2499    12.500000
Name: claim_ratio, Length: 2500, dtype: float64


In [38]:
print(data['claim_ratio'].describe())

count    2472.000000
mean        8.055174
std        19.593267
min         0.000000
25%         2.763263
50%         5.150937
75%         8.424009
max       572.469697
Name: claim_ratio, dtype: float64


In [39]:
data["claim_ratio"].fillna(
    data["claim_ratio"].median(),
    inplace=True
)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["claim_ratio"].fillna(


In [40]:
print(data['claim_ratio'].isnull().sum())

0


In [41]:
data['risk_score'] = (data['past_claims_count'] + data['days_since_policy_start'] + data['documents_submitted'])

In [42]:
print(data['risk_score'].describe())

count    2500.000000
mean      135.906400
std       111.848129
min         0.000000
25%        63.000000
50%       123.000000
75%       186.000000
max       817.000000
Name: risk_score, dtype: float64


In [40]:
print(data['claim_ratio'].isnull().sum())

0


In [43]:
print(data.head())

   claim_id policy_start_date claim_date       city policy_type sales_channel  \
0  CL211801        2023-01-12 2024-06-24      Delhi      Travel       Partner   
1  CL196074        2024-10-01 2025-09-20  Ahmedabad      Travel         Agent   
2  CL280133        2023-08-22 2025-08-12  Ahmedabad      Travel           App   
3  CL019432        2024-09-17 2024-10-01    Kolkata        Home         Agent   
4  CL232246        2024-02-19 2024-10-01    Chennai        Home         Agent   

  vehicle_type  customer_age  annual_premium  claim_amount  past_claims_count  \
0      Unknown          36.0         17228.0       53497.0                0.0   
1      Unknown          47.0          8940.0       41607.0                3.0   
2      Unknown          18.0         10861.0       58176.0                0.0   
3      Unknown          36.0          3091.0       43317.0                0.0   
4      Unknown          38.0          6660.0       79835.0                1.0   

   days_since_policy_start

In [44]:
data.to_csv("Data/Final_insurance_data.csv", index=False)