In [2]:
import pandas as pd


In [3]:
# Load the combined data from parquet file
combined_df = pd.read_parquet('combined_data.parquet')


In [4]:
avg_cars_per_household = combined_df.groupby('HH_ID')['CAR_ID'].nunique().mean()
print("Average number of cars per household:", avg_cars_per_household)


Average number of cars per household: 3.8065579511541507


In [5]:
cars_by_model_year = combined_df['Model Year'].value_counts()
print("Number of cars by model year:\n", cars_by_model_year)


Number of cars by model year:
 Model Year
2011    27932
2004    27902
2009    27851
2017    27835
2016    27771
        ...  
1992     4148
1966     4130
1963     4111
1980     4100
1974     4095
Name: count, Length: 73, dtype: int64


In [6]:
cars_by_make = combined_df['Make'].value_counts()
print("Number of cars by make:\n", cars_by_make)


Number of cars by make:
 Make
Manufacturer2    169311
Manufacturer1    168954
Manufacturer3    168834
Manufacturer6    110571
Manufacturer7    110503
Manufacturer4     58928
Manufacturer5     58704
Name: count, dtype: int64


In [7]:
# Assuming safety is defined by the 'Driver Safety Discount' and 'Vehicle Safety Discount'
safest_cars = combined_df[(combined_df['Driver Safety Discount'] == True) & 
                          (combined_df['Vehicle Safety Discount'] == True)]
print("Safest cars count:", safest_cars.shape[0])


Safest cars count: 71918


In [9]:
# Check the column names in the combined DataFrame
print(combined_df.columns)


Index(['CUST_ID', 'Date of Birth', 'Marital Status', 'Employment Type',
       'Income', 'HH_ID', 'CAR_ID', 'Active HH', 'HH Start Date',
       'Phone Number', 'ZIP ', 'State_x', 'Country', 'Referral Source',
       'Car ID', 'Status', 'State_y', 'Model Year', 'Make', 'Body Style',
       'Vehicle Value', 'Annual Miles Driven', 'Business Use',
       'Antique Vehicle', 'Lien', 'Lease', 'Driver Safety Discount',
       'Vehicle Safety Discount', 'Claim Payout', '6 Month Premium Amount'],
      dtype='object')


In [10]:
# Count the number of unique customers (CUST_ID) by state and sort
largest_households = combined_df.groupby('State_x')['CUST_ID'].nunique().sort_values(ascending=False)
print("States with the largest households:\n", largest_households)


States with the largest households:
 State_x
KY    10519
SC    10453
AK    10401
HI    10356
WY    10327
CT    10326
NE    10295
MI    10272
MA    10266
AR    10207
MN    10204
GA    10185
WA    10184
AL    10175
NY    10154
NV    10148
CA    10110
MS    10061
NJ    10058
MO    10044
NM    10040
IN    10030
MT    10029
AZ     9988
UT     9986
FL     9963
SD     9961
OH     9945
RI     9936
ND     9921
IL     9920
MD     9911
OR     9908
TX     9901
NC     9878
IA     9855
WV     9849
NH     9809
VA     9808
LA     9802
KS     9775
ME     9748
OK     9740
DE     9712
VT     9699
WI     9688
PA     9645
TN     9633
CO     9590
ID     9583
Name: CUST_ID, dtype: int64


In [11]:
# Count active households as of January 1, 2021
active_households = combined_df[combined_df['HH Start Date'] <= '2021-01-01']['HH_ID'].nunique()
print("Number of active households as of 1/1/2021:", active_households)


Number of active households as of 1/1/2021: 54344


In [12]:
# Calculate the average age of customers
combined_df['Age'] = (pd.Timestamp('now') - combined_df['Date of Birth']).dt.days // 365
avg_age = combined_df['Age'].mean()
print("Average age of customers:", avg_age)


Average age of customers: 6.673170529850261


In [15]:
# Calculate age variation by state
age_variation_by_state = combined_df.groupby('State_x')['Age'].std()
print("Age variation by state:\n", age_variation_by_state)


Age variation by state:
 State_x
AK    29.412447
AL    29.528914
AR    29.606095
AZ    29.485062
CA    29.204428
CO    29.837777
CT    29.303955
DE    29.397503
FL    29.324556
GA    29.306956
HI    29.473650
IA    29.469966
ID    29.576723
IL    29.631817
IN    29.540904
KS    29.568169
KY    29.272318
LA    29.406784
MA    29.357136
MD    29.415131
ME    29.441578
MI    29.628494
MN    29.425995
MO    29.238599
MS    29.736648
MT    29.495037
NC    29.388942
ND    29.357092
NE    29.365754
NH    29.419523
NJ    29.421301
NM    29.512413
NV    29.511876
NY    29.421128
OH    29.439442
OK    29.339468
OR    29.369126
PA    29.283669
RI    29.601361
SC    29.383838
SD    29.303073
TN    29.466235
TX    29.281396
UT    29.656893
VA    29.585059
VT    29.588658
WA    29.171349
WI    29.512184
WV    29.223402
WY    29.245575
Name: Age, dtype: float64


In [16]:
# Define age groups and calculate the most expensive claims by age group
age_groups = pd.cut(combined_df['Age'], bins=[0, 18, 35, 50, 65, 100], 
                    labels=['0-18', '19-35', '36-50', '51-65', '66-100'])
most_expensive_claims = combined_df.groupby(age_groups)['Claim Payout'].sum().sort_values(ascending=False)
print("Age group with the most expensive claims:\n", most_expensive_claims)


Age group with the most expensive claims:
 Age
19-35     314048598
36-50     195039309
0-18       62944899
51-65             0
66-100            0
Name: Claim Payout, dtype: int64


In [None]:
# Data Analysis

This notebook contains the analysis of the combined dataset created from the three input CSV files. 

### Average Number of Cars per Household
The average number of cars per household was calculated to assess vehicle ownership distribution.

### Count of Cars by Model Year
This analysis provides insights into the age distribution of cars within the dataset.

### Count of Cars by Make
Understanding the distribution of car makes can inform inventory and marketing strategies.

### Safest Cars
We identified the safest cars based on specific discount criteria, indicating their safety ratings.

### States with Largest Households
This analysis reveals geographical trends in household sizes.

### Active Households as of 1/1/2021
Understanding the count of active households helps in assessing market stability.

### Average Age of Customers
The average age calculation provides demographic insights into the customer base.

### Age Variation by Region
This metric helps in understanding the demographic diversity across regions.

### Age Group with the Most Expensive Claims
This analysis identifies which age group incurs the highest claims, useful for risk assessment.

