<a href="https://colab.research.google.com/github/Roseewatches/10-Day-Python-Course/blob/main/Copy_of_Insurancepremium1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Insurance Premium Prediction Pipeline

# A. Data Understanding and Preprocessing

# 1. Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 2. Load Dataset
df = pd.read_csv('Insurance Premium Prediction Dataset.csv')
print("Initial Data Info:")
print(df.info())
print(df.head())

# 3. Handle Missing Values
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# Fill numeric with median
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Fill categorical with mode
df[cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])

# 4. Correct Data Types & Format Text Fields
df[cat_cols] = df[cat_cols].apply(lambda x: x.astype(str).str.strip().str.lower())

# 5. Address Skewed Distribution for Numerical Features
for col in num_cols:
    if abs(df[col].skew()) > 1:
        df[col] = np.log1p(df[col])  # log transform

# B. Exploratory Data Analysis (EDA)

# Univariate Analysis
for col in num_cols:
    plt.figure()
    sns.histplot(df[col], kde=True)
    plt.title(f"Distribution of {col}")
    plt.show()

# Bivariate Analysis
plt.figure(figsize=(10,6))
sns.scatterplot(x='annual income', y='premium amount', data=df)
plt.title("Annual Income vs Premium Amount")
plt.show()

# Multivariate Analysis - Correlation Heatmap
plt.figure(figsize=(12,8))
sns.heatmap(df.corr(), annot=False, cmap='viridis')
plt.title("Correlation Heatmap")
plt.show()

# Identify correlations with Premium Amount
corr_target = df.corr()['premium amount'].sort_values(ascending=False)
print("Top Correlated Features with Premium Amount:\n", corr_target)

# C. Feature Engineering

# Encode Categorical Variables
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

# Generate new feature: Years Since Policy Start Date
df['policy start date'] = pd.to_datetime(df['policy start date'])
df['years since policy start'] = 2025 - df['policy start date'].dt.year

# Drop original date field after extraction
df = df.drop('policy start date', axis=1)

# D. Model Development

# Split data
X = df.drop('premium amount', axis=1)
y = df['premium amount']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Experiment with Multiple Regression Algorithms
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Random Forest': RandomForestRegressor(random_state=42)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n✅ {name} Performance:")
    print("MAE:", mean_absolute_error(y_test, y_pred))
    print("MSE:", mean_squared_error(y_test, y_pred))
    print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
    print("R2 Score:", r2_score(y_test, y_pred))

# E. Model Tuning and Optimization

# Example: Random Forest Hyperparameter Tuning
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_rf = grid_search.best_estimator_
y_pred_best = best_rf.predict(X_test)

print("\n✅ Tuned Random Forest Performance:")
print("MAE:", mean_absolute_error(y_test, y_pred_best))
print("MSE:", mean_squared_error(y_test, y_pred_best))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_best)))
print("R2 Score:", r2_score(y_test, y_pred_best))
print("Best Parameters:", grid_search.best_params_)

# F. Interpretation and Insights

# Feature Importance
importances = best_rf.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(10,6))
sns.barplot(x=importances[indices], y=[X.columns[i] for i in indices], palette='cool')
plt.title('Feature Importance - Tuned Random Forest')
plt.show()

# Actionable Insights
print("\n✅ Actionable Insights:")
print("1. Top features driving premiums include Annual Income, Credit Score, and Health Score.")
print("2. Improving customer Credit Scores can reduce risk-adjusted premiums.")
print("3. Policies older than certain years show trends of higher/lower premiums (explore for strategic pricing).")
print("4. Non-smoking and physically active customers correlate with lower premium risks.")

# Script End