In [30]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os

In [31]:
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

In [32]:
output_dir = os.path.join('..', 'eda_plots')
os.makedirs(output_dir, exist_ok=True)

In [33]:
df = pd.read_csv(r'C:\Users\bless\OneDrive\Desktop\week-3-final\acis-insurance-analytics\data\MachineLearningRating_v3.txt', sep='|')

  df = pd.read_csv(r'C:\Users\bless\OneDrive\Desktop\week-3-final\acis-insurance-analytics\data\MachineLearningRating_v3.txt', sep='|')


In [34]:
if 'TransactionMonth' in df.columns:
    df['TransactionMonth'] = pd.to_datetime(df['TransactionMonth'], errors='coerce', format='%Y-%m-%d %H:%M:%S')

In [35]:
print("Descriptive Statistics:")
print(df[['TotalPremium', 'TotalClaims', 'SumInsured', 'CustomValueEstimate']].describe())

Descriptive Statistics:
       TotalPremium   TotalClaims    SumInsured  CustomValueEstimate
count  1.000098e+06  1.000098e+06  1.000098e+06         2.204560e+05
mean   6.190550e+01  6.486119e+01  6.041727e+05         2.255311e+05
std    2.302845e+02  2.384075e+03  1.508332e+06         5.645157e+05
min   -7.825768e+02 -1.200241e+04  1.000000e-02         2.000000e+04
25%    0.000000e+00  0.000000e+00  5.000000e+03         1.350000e+05
50%    2.178333e+00  0.000000e+00  7.500000e+03         2.200000e+05
75%    2.192982e+01  0.000000e+00  2.500000e+05         2.800000e+05
max    6.528260e+04  3.930921e+05  1.263620e+07         2.655000e+07


In [36]:
print("\nData Types:")
print(df.dtypes)


Data Types:
UnderwrittenCoverID                  int64
PolicyID                             int64
TransactionMonth            datetime64[ns]
IsVATRegistered                       bool
Citizenship                         object
LegalType                           object
Title                               object
Language                            object
Bank                                object
AccountType                         object
MaritalStatus                       object
Gender                              object
Country                             object
Province                            object
PostalCode                           int64
MainCrestaZone                      object
SubCrestaZone                       object
ItemType                            object
mmcode                             float64
VehicleType                         object
RegistrationYear                     int64
make                                object
Model                               objec

In [37]:
print("\nMissing Values:")
print(df.isnull().sum())


Missing Values:
UnderwrittenCoverID               0
PolicyID                          0
TransactionMonth                  0
IsVATRegistered                   0
Citizenship                       0
LegalType                         0
Title                             0
Language                          0
Bank                         145961
AccountType                   40232
MaritalStatus                  8259
Gender                         9536
Country                           0
Province                          0
PostalCode                        0
MainCrestaZone                    0
SubCrestaZone                     0
ItemType                          0
mmcode                          552
VehicleType                     552
RegistrationYear                  0
make                            552
Model                           552
Cylinders                       552
cubiccapacity                   552
kilowatts                       552
bodytype                        552
NumberOfDoo

In [38]:
df['TotalPremium'].fillna(df['TotalPremium'].median(), inplace=True)
df['TotalClaims'].fillna(0, inplace=True) 
if 'Gender' in df.columns:
    df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)
if 'Province' in df.columns:
    df['Province'].fillna(df['Province'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalPremium'].fillna(df['TotalPremium'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalClaims'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we a

In [39]:
df['LossRatio'] = df['TotalClaims'] / df['TotalPremium'].replace(0, np.nan)

In [40]:
sns.histplot(df['TotalClaims'], bins=50, kde=True)
plt.title('Distribution of Total Claims')
plt.xlabel('Total Claims (Rand)')
plt.savefig(os.path.join(output_dir, 'total_claims_histogram.png'))
plt.close()

In [41]:
if 'Province' in df.columns:
    sns.countplot(x='Province', data=df, order=df['Province'].value_counts().index)
    plt.title('Policy Count by Province')
    plt.xticks(rotation=45)
    plt.savefig(os.path.join(output_dir,'province_bar.png'))
    plt.close()


In [42]:
sns.boxplot(x='TotalClaims', data=df)
plt.title('Box Plot of Total Claims')
plt.savefig(os.path.join(output_dir,'total_claims_boxplot.png'))
plt.close()

In [43]:
if 'VehicleType' in df.columns:
    sns.scatterplot(x='TotalPremium', y='TotalClaims', hue='VehicleType', size='LossRatio', data=df)
    plt.title('Total Premium vs. Total Claims by Vehicle Type')
    plt.savefig(os.path.join(output_dir,'premium_vs_claims_scatter.png'))
    plt.close()

In [44]:
numerical_cols = ['TotalPremium', 'TotalClaims', 'SumInsured', 'CustomValueEstimate']
corr_matrix = df[numerical_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix of Numerical Features')
plt.savefig(os.path.join(output_dir,'correlation_matrix.png'))
plt.close()

In [45]:
if 'Province' in df.columns:
    sns.barplot(x='Province', y='LossRatio', data=df, ci=None, palette='viridis')
    plt.title('Average Loss Ratio by Province')
    plt.xticks(rotation=45)
    plt.savefig(os.path.join(output_dir,'loss_ratio_province.png'))
    plt.close()


The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(x='Province', y='LossRatio', data=df, ci=None, palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Province', y='LossRatio', data=df, ci=None, palette='viridis')


In [46]:
sns.histplot(df['TotalClaims'], bins=50, kde=True, color='teal')
plt.axvline(df['TotalClaims'].quantile(0.95), color='red', linestyle='--', label='95th Percentile')
plt.title('Total Claims Distribution with Outlier Threshold')
plt.xlabel('Total Claims (Rand)')
plt.legend()
plt.savefig(os.path.join(output_dir,'total_claims_with_outliers.png'))
plt.close()

In [47]:
if 'Gender' in df.columns:
    sns.scatterplot(x='TotalPremium', y='TotalClaims', hue='Gender', style='Gender', data=df, s=100)
    plt.title('Total Premium vs. Total Claims by Gender')
    plt.xlabel('Total Premium (Rand)')
    plt.ylabel('Total Claims (Rand)')
    plt.savefig(os.path.join(output_dir,'premium_vs_claims_gender.png'))
    plt.close()

In [48]:
if 'Province' in df.columns:
    premium_by_province = df.groupby('Province')['TotalPremium'].mean().reset_index()
    sns.barplot(x='Province', y='TotalPremium', data=premium_by_province)
    plt.title('Average Total Premium by Province')
    plt.xticks(rotation=45)
    plt.savefig(os.path.join(output_dir,'premium_by_province.png'))
    plt.close()

print("✅ EDA completed. Visualizations saved as PNG files.")

✅ EDA completed. Visualizations saved as PNG files.
