# Create Human-Readable Summaries

## 1. Load Cleaned Data

In [1]:
import pandas as pd
import numpy as np

# Read the cleaned policy table from disk.
cleaned_data_path = '../data/processed/cleaned_data.csv'
df = pd.read_csv(cleaned_data_path)

# Report how many rows were read, then preview the first rows as the cell output.
print(f"Loaded {df.shape[0]} policies")
df.head(5)

Loaded 58592 policies


Unnamed: 0,policy_id,subscription_length,vehicle_age,customer_age,region_code,region_density,segment,model,fuel_type,max_torque,...,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,claim_status,safety_score,age_risk,vehicle_age_category
0,POL045360,9.3,1.2,41,C8,8794,C2,M4,Diesel,250Nm@2750rpm,...,1,1,0,1,1,3,0,11,mature,new
1,POL016745,8.2,1.8,35,C2,27003,C1,M9,Diesel,200Nm@1750rpm,...,1,1,1,1,1,4,0,6,middle,new
2,POL007194,9.5,0.2,44,C8,8794,C2,M4,Diesel,250Nm@2750rpm,...,1,1,0,1,1,3,0,11,mature,new
3,POL018146,5.2,0.4,44,C10,73430,A,M1,CNG,60Nm@3500rpm,...,1,0,0,0,1,0,0,2,mature,new
4,POL049011,10.1,1.0,56,C13,5410,B2,M5,Diesel,200Nm@3000rpm,...,1,0,0,1,1,5,0,7,mature,new


## 2. Design Summary Template

In [2]:
def create_policy_summary(row):
    """Render one policy record as a short English description.

    Args:
        row: A mapping-like record (for example a DataFrame row) that
            supports ``row[key]`` for the keys ``customer_age``,
            ``region_code``, ``vehicle_age``, ``fuel_type``, ``model``,
            ``airbags``, ``ncap_rating``, ``subscription_length``,
            ``claim_status``, ``is_esc``, ``is_brake_assist`` and
            ``is_parking_sensors``.

    Returns:
        A single string of five sentences covering the driver and vehicle,
        the safety equipment, the NCAP rating, the policy length and
        whether a claim was filed.
    """
    # Flag column -> wording used in the sentence, in display order.
    flag_labels = (
        ('is_esc', 'ESC'),
        ('is_brake_assist', 'brake assist'),
        ('is_parking_sensors', 'parking sensors'),
    )
    present = [label for column, label in flag_labels if row[column] == 1]

    # An empty join yields '', so the fallback wording applies when no flag is set.
    safety_text = ', '.join(present) or 'basic safety'

    sentences = [
        f"A {row['customer_age']}-year-old driver in region {row['region_code']} "
        f"with a {row['vehicle_age']}-year-old {row['fuel_type']} {row['model']}.",
        f"Vehicle has {row['airbags']} airbags and {safety_text}.",
        f"NCAP rating: {row['ncap_rating']} stars.",
        f"Policy: {row['subscription_length']} months.",
        f"Claim filed: {'Yes' if row['claim_status'] == 1 else 'No'}.",
    ]
    return ' '.join(sentences)

# Smoke-test the template on the first policy before applying it to every row.
first_policy = df.iloc[0]
sample_summary = create_policy_summary(first_policy)
print("Sample summary:")
print(sample_summary)

Sample summary:
A 41-year-old driver in region C8 with a 1.2-year-old Diesel M4. Vehicle has 6 airbags and ESC, brake assist, parking sensors. NCAP rating: 3 stars. Policy: 9.3 months. Claim filed: No.


## 3. Generate All Summaries

In [7]:
# Build one summary per policy. The function is applied row by row,
# so this might take a minute on the full table.
print("Generating summaries...")
df['summary'] = df.apply(create_policy_summary, axis=1)

print(f"✓ Created {len(df)} summaries")

# Show the first few results, numbered from 1, as a quick eyeball check.
print("\nFirst 3 summaries:")
for position, text in enumerate(df['summary'].head(3), start=1):
    print(f"\n{position}. {text}")

Generating summaries...
✓ Created 58592 summaries

First 3 summaries:

1. A 41-year-old driver in region C8 with a 1.2-year-old Diesel M4. Vehicle has 6 airbags and ESC, brake assist, parking sensors. NCAP rating: 3 stars. Policy: 9.3 months. Claim filed: No.

2. A 35-year-old driver in region C2 with a 1.8-year-old Diesel M9. Vehicle has 2 airbags and parking sensors. NCAP rating: 4 stars. Policy: 8.2 months. Claim filed: No.

3. A 44-year-old driver in region C8 with a 0.2-year-old Diesel M4. Vehicle has 6 airbags and ESC, brake assist, parking sensors. NCAP rating: 3 stars. Policy: 9.5 months. Claim filed: No.


In [1]:
# Run this after the summaries have been generated: it needs `df` and
# its 'summary' column to exist in the kernel.

# Keep only the policies on which a claim was filed.
claims_yes = df.loc[df['claim_status'].eq(1)]

# Columns that give enough context to read each summary against its source values.
preview_columns = [
    'policy_id',
    'customer_age',
    'vehicle_age',
    'model',
    'fuel_type',
    'ncap_rating',
    'summary',
]

# Show the first few claimed policies alongside their summaries.
print("✅ Claims Filed (Sample):")
display(claims_yes[preview_columns].head(5))


NameError: name 'df' is not defined

In [6]:
# claim_status is compared against 1 elsewhere in this notebook; summing
# the column counts the claimed policies provided it only holds 0 and 1.
claim_flags = df['claim_status']
total_claims = claim_flags.sum()
print(f"Total claims filed: {total_claims}")

# Share of all policies that have a claim, expressed as a percentage.
policy_count = len(df)
claim_percent = (total_claims / policy_count) * 100
print(f"Percentage of policies with claims: {claim_percent:.2f}%")


Total claims filed: 3748
Percentage of policies with claims: 6.40%


## 4. Quality Check

In [8]:
# Character count of every generated summary, stored as a new column.
df['summary_length'] = df['summary'].str.len()

print("Summary statistics:")
length_stats = df['summary_length'].describe()
print(length_stats)

# Look up the row label of the longest summary and print its text.
longest_idx = df['summary_length'].idxmax()
print("\nLongest summary:")
print(df.loc[longest_idx, 'summary'])

Summary statistics:
count    58592.000000
mean       174.777171
std          9.989008
min        163.000000
25%        164.000000
50%        180.000000
75%        185.000000
max        189.000000
Name: summary_length, dtype: float64

Longest summary:
A 49-year-old driver in region C12 with a 1.6-year-old Petrol M11. Vehicle has 2 airbags and ESC, brake assist, parking sensors. NCAP rating: 5 stars. Policy: 12.0 months. Claim filed: Yes.


## 5. Save Enhanced Dataset

In [9]:
# Write the table, including any columns added in this notebook, to a new CSV.
# index=False keeps the row index out of the file.
output_path = '../data/processed/data_with_summaries.csv'
df.to_csv(output_path, index=False)

print(f"✓ Saved to {output_path}")

# List the columns that were written, as a record of the output schema.
saved_columns = list(df.columns)
print(f"Columns: {saved_columns}")

✓ Saved to ../data/processed/data_with_summaries.csv
Columns: ['policy_id', 'subscription_length', 'vehicle_age', 'customer_age', 'region_code', 'region_density', 'segment', 'model', 'fuel_type', 'max_torque', 'max_power', 'engine_type', 'airbags', 'is_esc', 'is_adjustable_steering', 'is_tpms', 'is_parking_sensors', 'is_parking_camera', 'rear_brakes_type', 'displacement', 'cylinder', 'transmission_type', 'steering_type', 'turning_radius', 'length', 'width', 'gross_weight', 'is_front_fog_lights', 'is_rear_window_wiper', 'is_rear_window_washer', 'is_rear_window_defogger', 'is_brake_assist', 'is_power_door_locks', 'is_central_locking', 'is_power_steering', 'is_driver_seat_height_adjustable', 'is_day_night_rear_view_mirror', 'is_ecw', 'is_speed_alert', 'ncap_rating', 'claim_status', 'safety_score', 'age_risk', 'vehicle_age_category', 'summary', 'summary_length']
