In [5]:
import pandas as pd

# Example load with delimiter specified and error handling
try:
    df = pd.read_csv("../data/MachineLearningRating_v3.txt", delimiter='\t')
except Exception as e:
    print("Error loading file:", e)
    try:
        df = pd.read_csv("../data/MachineLearningRating_v3.txt", delimiter=',')
    except Exception as e2:
        print("Fallback load also failed:", e2)
        df = None

if df is not None:
    # Split the single column into multiple columns
    df = df[df.columns[0]].str.split('|', expand=True)
    # Set column names from the header row
    header = [
        'UnderwrittenCoverID', 'PolicyID', 'TransactionMonth', 'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language', 'Bank', 'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province', 'PostalCode', 'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'mmcode', 'VehicleType', 'RegistrationYear', 'make', 'Model', 'Cylinders', 'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors', 'VehicleIntroDate', 'CustomValueEstimate', 'AlarmImmobiliser', 'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'WrittenOff', 'Rebuilt', 'Converted', 'CrossBorder', 'NumberOfVehiclesInFleet', 'SumInsured', 'TermFrequency', 'CalculatedPremiumPerTerm', 'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section', 'Product', 'StatutoryClass', 'StatutoryRiskType', 'TotalPremium', 'TotalClaims'
    ]
    df.columns = header
    # Remove header row if present in data
    if df.iloc[0].equals(pd.Series(header)):
        df = df.iloc[1:].reset_index(drop=True)
    # Convert numeric columns
    for col in ['TotalClaims', 'TotalPremium']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    # Preprocessing
    if 'TotalClaims' in df.columns and 'TotalPremium' in df.columns:
        df['ClaimFrequency'] = df['TotalClaims'] > 0
        # If TotalClaimAmount exists, use it; otherwise, set ClaimSeverity to NaN
        if 'TotalClaimAmount' in df.columns:
            df['TotalClaimAmount'] = pd.to_numeric(df['TotalClaimAmount'], errors='coerce')
            df['ClaimSeverity'] = df['TotalClaimAmount'] / df['TotalClaims']
            df['Margin'] = df['TotalPremium'] - df['TotalClaimAmount']
        else:
            df['ClaimSeverity'] = float('nan')
            df['Margin'] = float('nan')
    else:
        print("Required columns not found after splitting.")

In [6]:
# Example: build the two postal-code cohorts that the t-tests below compare.
# PostalCode is matched against string literals.
postal_codes = df['PostalCode']
group_a = df.loc[postal_codes.eq('12345')]
group_b = df.loc[postal_codes.eq('54321')]

In [8]:
from scipy.stats import ttest_ind

# Compare TotalPremium between the two postal code groups.
# TotalPremium is parsed with errors='coerce', so unparseable entries are NaN.
# The default nan_policy='propagate' turns the whole result into NaN as soon as
# either sample contains one, so drop those observations instead.
# NOTE(review): a group with no rows still yields NaN; check that both postal
# codes actually occur in the data.
ttest_ind(group_a['TotalPremium'], group_b['TotalPremium'], nan_policy='omit')

  return f(*args, **kwargs)


TtestResult(statistic=np.float64(nan), pvalue=np.float64(nan), df=np.float64(nan))

In [9]:
import scipy.stats as stats

# Kruskal-Wallis (non-parametric ANOVA) of ClaimFrequency across provinces.
# A single groupby pass replaces one full-frame filter per province, and rows
# with a missing Province are skipped instead of contributing an empty sample.
groups = [claims for _province, claims in df.groupby('Province')['ClaimFrequency']]
stat, p_value = stats.kruskal(*groups)


In [10]:
# Two-sample t-test on Margin for the two postal-code groups.
# Rebinds stat / p_value, replacing the Kruskal-Wallis values of the same name.
margin_a = group_a['Margin']
margin_b = group_b['Margin']
stat, p_value = ttest_ind(margin_a, margin_b)


In [11]:
# Chi-squared test of independence between Gender and ClaimFrequency.
# Rows of the table are Gender values, columns are the ClaimFrequency flags.
contingency_table = pd.crosstab(index=df['Gender'], columns=df['ClaimFrequency'])
# Only the statistic and p-value are kept; the other two results are discarded.
chi2, p, _, _ = stats.chi2_contingency(contingency_table)
