In [1]:
import pandas as pd

In [2]:
df_train = pd.read_csv(r"D:\prakash\Smart_Premium_New\playground-series-s4e12 (3)\train.csv")

In [3]:
# Check current dataset size
print(f"Original dataset size: {df_train.shape}")
print()

# Calculate premium to income ratio
df_train['Premium_to_Income_Ratio'] = (df_train['Premium Amount'] / df_train['Annual Income']) * 100

# Remove records where Premium > 50% of Annual Income
df_train_cleaned = df_train[df_train['Premium_to_Income_Ratio'] <= 50].copy()

# Drop the temporary ratio column
df_train_cleaned = df_train_cleaned.drop('Premium_to_Income_Ratio', axis=1)
df_train = df_train.drop('Premium_to_Income_Ratio', axis=1)

print(f"Cleaned dataset size: {df_train_cleaned.shape}")
print(f"Removed records: {len(df_train) - len(df_train_cleaned)}")
print()

# Update df_train to cleaned version
df_train = df_train_cleaned.copy()

print(f"Final dataset size: {df_train.shape}")

Original dataset size: (1200000, 21)

Cleaned dataset size: (1051855, 21)
Removed records: 148145

Final dataset size: (1051855, 21)


In [4]:
# Check missing values before
print("Missing values before:")
print(df_train.isnull().sum())
print()

# Numerical columns - fill with median
numerical_cols = ['Age', 'Annual Income', 'Number of Dependents', 'Health Score', 
                  'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration']

for col in numerical_cols:
    df_train[col].fillna(df_train[col].median(), inplace=True)

# Categorical columns - fill with mode (most frequent value)
categorical_cols = ['Marital Status', 'Occupation', 'Customer Feedback']

for col in categorical_cols:
    df_train[col].fillna(df_train[col].mode()[0], inplace=True)

# Check missing values after
print("Missing values after:")
print(df_train.isnull().sum())
print()

print("All missing values handled!")

Missing values before:
id                           0
Age                      16404
Gender                       0
Annual Income                0
Marital Status           15822
Number of Dependents     96384
Education Level              0
Occupation              315017
Health Score             59415
Location                     0
Policy Type                  0
Previous Claims         322859
Vehicle Age                  4
Credit Score            122455
Insurance Duration           1
Policy Start Date            0
Customer Feedback        65992
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount               0
dtype: int64



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[col].fillna(df_train[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[col].fillna(df_train[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whi

Missing values after:
id                      0
Age                     0
Gender                  0
Annual Income           0
Marital Status          0
Number of Dependents    0
Education Level         0
Occupation              0
Health Score            0
Location                0
Policy Type             0
Previous Claims         0
Vehicle Age             0
Credit Score            0
Insurance Duration      0
Policy Start Date       0
Customer Feedback       0
Smoking Status          0
Exercise Frequency      0
Property Type           0
Premium Amount          0
dtype: int64

All missing values handled!


In [5]:
# Delete id column
df_train = df_train.drop('id', axis=1)

# Convert Policy Start Date to datetime
df_train['Policy Start Date'] = pd.to_datetime(df_train['Policy Start Date'])

# Check the changes
print("Columns after dropping id:")
print(df_train.columns.tolist())
print()

print("Data types:")
print(df_train.dtypes)
print()

print(f"Final shape: {df_train.shape}")

Columns after dropping id:
['Age', 'Gender', 'Annual Income', 'Marital Status', 'Number of Dependents', 'Education Level', 'Occupation', 'Health Score', 'Location', 'Policy Type', 'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration', 'Policy Start Date', 'Customer Feedback', 'Smoking Status', 'Exercise Frequency', 'Property Type', 'Premium Amount']

Data types:
Age                            float64
Gender                          object
Annual Income                  float64
Marital Status                  object
Number of Dependents           float64
Education Level                 object
Occupation                      object
Health Score                   float64
Location                        object
Policy Type                     object
Previous Claims                float64
Vehicle Age                    float64
Credit Score                   float64
Insurance Duration             float64
Policy Start Date       datetime64[ns]
Customer Feedback               o

In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1051855 entries, 0 to 1199997
Data columns (total 20 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   Age                   1051855 non-null  float64       
 1   Gender                1051855 non-null  object        
 2   Annual Income         1051855 non-null  float64       
 3   Marital Status        1051855 non-null  object        
 4   Number of Dependents  1051855 non-null  float64       
 5   Education Level       1051855 non-null  object        
 6   Occupation            1051855 non-null  object        
 7   Health Score          1051855 non-null  float64       
 8   Location              1051855 non-null  object        
 9   Policy Type           1051855 non-null  object        
 10  Previous Claims       1051855 non-null  float64       
 11  Vehicle Age           1051855 non-null  float64       
 12  Credit Score          1051855 non-null  float64

In [7]:
df_train.isnull().sum()

Age                     0
Gender                  0
Annual Income           0
Marital Status          0
Number of Dependents    0
Education Level         0
Occupation              0
Health Score            0
Location                0
Policy Type             0
Previous Claims         0
Vehicle Age             0
Credit Score            0
Insurance Duration      0
Policy Start Date       0
Customer Feedback       0
Smoking Status          0
Exercise Frequency      0
Property Type           0
Premium Amount          0
dtype: int64

In [8]:
df_train.to_csv('cleaned_df.csv', index=False)


In [2]:
import pandas as pd
df_train = pd.read_csv(r"D:\prakash\Smart_Premium_New\playground-series-s4e12 (3)\cleaned_df.csv")

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1051855 entries, 0 to 1051854
Data columns (total 20 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   Age                   1051855 non-null  float64
 1   Gender                1051855 non-null  object 
 2   Annual Income         1051855 non-null  float64
 3   Marital Status        1051855 non-null  object 
 4   Number of Dependents  1051855 non-null  float64
 5   Education Level       1051855 non-null  object 
 6   Occupation            1051855 non-null  object 
 7   Health Score          1051855 non-null  float64
 8   Location              1051855 non-null  object 
 9   Policy Type           1051855 non-null  object 
 10  Previous Claims       1051855 non-null  float64
 11  Vehicle Age           1051855 non-null  float64
 12  Credit Score          1051855 non-null  float64
 13  Insurance Duration    1051855 non-null  float64
 14  Policy Start Date     1051855 non-

In [6]:
print("="*60)
print("STEP 1: FEATURE ENGINEERING")
print("="*60)

# Extract datetime features from Policy Start Date
df_train['Policy Start Date'] = pd.to_datetime(df_train['Policy Start Date'])
df_train['Policy_Year'] = df_train['Policy Start Date'].dt.year
df_train['Policy_Month'] = df_train['Policy Start Date'].dt.month
df_train['Policy_Day'] = df_train['Policy Start Date'].dt.day
df_train = df_train.drop('Policy Start Date', axis=1)

# Create binned features
df_train['Age_Group'] = pd.cut(df_train['Age'], bins=[0, 25, 35, 45, 55, 100], 
                                labels=['18-25', '26-35', '36-45', '46-55', '55+'])
df_train['Income_Group'] = pd.cut(df_train['Annual Income'], bins=[0, 30000, 60000, 100000, 200000], 
                                   labels=['Low', 'Medium', 'High', 'Very High'])
df_train['Vehicle_Category'] = pd.cut(df_train['Vehicle Age'], bins=[-1, 3, 7, 15, 20], 
                                       labels=['New', 'Mid', 'Old', 'Very Old'])
df_train['Credit_Category'] = pd.cut(df_train['Credit Score'], bins=[0, 500, 650, 750, 850], 
                                      labels=['Poor', 'Fair', 'Good', 'Excellent'])

# Create derived features
df_train['Risk_Score'] = (100 - df_train['Health Score']) + (df_train['Previous Claims'] * 10)
df_train['Income_Per_Dependent'] = df_train['Annual Income'] / (df_train['Number of Dependents'] + 1)

# Interaction features
df_train['Age_Income'] = df_train['Age'] * df_train['Annual Income']
df_train['Age_Health'] = df_train['Age'] * df_train['Health Score']
df_train['Credit_Income'] = df_train['Credit Score'] * df_train['Annual Income']
df_train['Health_Claims'] = df_train['Health Score'] * df_train['Previous Claims']
df_train['Risk_Income'] = df_train['Risk_Score'] * df_train['Annual Income']

# Polynomial features
df_train['Age_Squared'] = df_train['Age'] ** 2
df_train['Income_Squared'] = df_train['Annual Income'] ** 2
df_train['Health_Squared'] = df_train['Health Score'] ** 2
df_train['Credit_Squared'] = df_train['Credit Score'] ** 2

# Log transformations
df_train['Log_Income'] = np.log1p(df_train['Annual Income'])
df_train['Log_Income_Per_Dep'] = np.log1p(df_train['Income_Per_Dependent'])

print(f"Features after engineering: {df_train.shape[1]}")
print()

STEP 1: FEATURE ENGINEERING
Features after engineering: 40



In [7]:
df_train.shape

(1051855, 40)

In [9]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
print("="*60)
print("STEP 2: ENCODING CATEGORICAL VARIABLES")
print("="*60)

categorical_cols = ['Gender', 'Marital Status', 'Education Level', 'Occupation', 
                    'Location', 'Policy Type', 'Smoking Status', 'Exercise Frequency', 
                    'Property Type', 'Customer Feedback', 'Age_Group', 'Income_Group',
                    'Vehicle_Category', 'Credit_Category']

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col].astype(str))
    label_encoders[col] = le

print(f"Encoded {len(categorical_cols)} categorical columns")
print()


STEP 2: ENCODING CATEGORICAL VARIABLES
Encoded 14 categorical columns



In [10]:
df_train.head(5)

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,...,Policy_Month,Policy_Day,Age_Group,Income_Group,Vehicle_Category,Credit_Category,Risk_Score,Income_Per_Dependent,Risk_Income,Log_Income_Per_Dep
0,19.0,0,10049.0,1,1.0,0,1,22.598761,2,2,...,12,23,0,1,3,3,97.401239,5024.5,978785.1,8.52228
1,39.0,0,31678.0,0,3.0,2,0,15.569731,0,1,...,6,12,2,2,2,2,94.430269,7919.5,2991362.0,8.97721
2,23.0,1,25602.0,0,3.0,1,1,47.177549,1,2,...,9,30,0,1,2,1,62.822451,6400.5,1608380.0,8.764288
3,21.0,1,141855.0,1,2.0,0,0,10.938144,0,0,...,6,12,0,3,1,3,99.061856,47285.0,14052420.0,10.76397
4,21.0,1,39651.0,2,1.0,0,1,20.376094,0,2,...,12,1,0,2,2,1,79.623906,19825.5,3157168.0,9.894775


In [11]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1051855 entries, 0 to 1051854
Data columns (total 40 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   Age                   1051855 non-null  float64
 1   Gender                1051855 non-null  int64  
 2   Annual Income         1051855 non-null  float64
 3   Marital Status        1051855 non-null  int64  
 4   Number of Dependents  1051855 non-null  float64
 5   Education Level       1051855 non-null  int64  
 6   Occupation            1051855 non-null  int64  
 7   Health Score          1051855 non-null  float64
 8   Location              1051855 non-null  int64  
 9   Policy Type           1051855 non-null  int64  
 10  Previous Claims       1051855 non-null  float64
 11  Vehicle Age           1051855 non-null  float64
 12  Credit Score          1051855 non-null  float64
 13  Insurance Duration    1051855 non-null  float64
 14  Customer Feedback     1051855 non-

In [12]:
X = df_train.drop('Premium Amount', axis=1)
y = df_train['Premium Amount']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print()


X_train shape: (841484, 39)
X_test shape: (210371, 39)



In [13]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Scaling completed")
print()


Scaling completed



In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_squared_log_error
import numpy as np

print("Training Linear Regression...")
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)

y_pred_train = lr_model.predict(X_train_scaled)
y_pred_test = lr_model.predict(X_test_scaled)

# Check for negative predictions
print(f"Min prediction: {y_pred_test.min():.2f}")
print(f"Max prediction: {y_pred_test.max():.2f}")
print(f"Negative predictions: {(y_pred_test < 0).sum()}")
print()

# Clip negative predictions to 0 for RMSLE calculation
y_pred_test_clipped = np.maximum(y_pred_test, 0)

# Metrics for test set
rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred_test_clipped))
mae = mean_absolute_error(y_test, y_pred_test)
r2 = r2_score(y_test, y_pred_test)

# Metrics for train set
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
r2_train = r2_score(y_train, y_pred_train)

print("="*50)
print("LINEAR REGRESSION RESULTS")
print("="*50)
print("\nTraining Set:")
print(f"  RMSE: {rmse_train:.2f}")
print(f"  R² Score: {r2_train:.4f}")

print("\nTest Set:")
print(f"  RMSE: {rmse:.2f}")
print(f"  RMSLE: {rmsle:.4f}")
print(f"  MAE: {mae:.2f}")
print(f"  R² Score: {r2:.4f}")
print("="*50)

Training Linear Regression...
Min prediction: -773.35
Max prediction: 1658.26
Negative predictions: 71

LINEAR REGRESSION RESULTS

Training Set:
  RMSE: 803.52
  R² Score: 0.0374

Test Set:
  RMSE: 801.70
  RMSLE: 1.1347
  MAE: 607.72
  R² Score: 0.0380


In [16]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_squared_log_error
import numpy as np

# Store results
results = []

# 1. Decision Tree
print("1. Training Decision Tree...")
dt_model = DecisionTreeRegressor(max_depth=15, min_samples_split=20, random_state=42)
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

dt_rmse = np.sqrt(mean_squared_error(y_test, dt_pred))
dt_rmsle = np.sqrt(mean_squared_log_error(y_test, np.maximum(dt_pred, 0)))
dt_mae = mean_absolute_error(y_test, dt_pred)
dt_r2 = r2_score(y_test, dt_pred)

results.append(['Decision Tree', dt_rmse, dt_rmsle, dt_mae, dt_r2])
print(f"   RMSE: {dt_rmse:.2f}, RMSLE: {dt_rmsle:.4f}, MAE: {dt_mae:.2f}, R²: {dt_r2:.4f}")

# 2. Random Forest
print("\n2. Training Random Forest...")
rf_model = RandomForestRegressor(n_estimators=200, max_depth=20, min_samples_split=10, 
                                  min_samples_leaf=4, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_rmsle = np.sqrt(mean_squared_log_error(y_test, np.maximum(rf_pred, 0)))
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)

results.append(['Random Forest', rf_rmse, rf_rmsle, rf_mae, rf_r2])
print(f"   RMSE: {rf_rmse:.2f}, RMSLE: {rf_rmsle:.4f}, MAE: {rf_mae:.2f}, R²: {rf_r2:.4f}")

# 3. XGBoost
print("\n3. Training XGBoost...")
xgb_model = XGBRegressor(n_estimators=200, max_depth=10, learning_rate=0.05, 
                         subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_pred))
xgb_rmsle = np.sqrt(mean_squared_log_error(y_test, np.maximum(xgb_pred, 0)))
xgb_mae = mean_absolute_error(y_test, xgb_pred)
xgb_r2 = r2_score(y_test, xgb_pred)

results.append(['XGBoost', xgb_rmse, xgb_rmsle, xgb_mae, xgb_r2])
print(f"   RMSE: {xgb_rmse:.2f}, RMSLE: {xgb_rmsle:.4f}, MAE: {xgb_mae:.2f}, R²: {xgb_r2:.4f}")

# Summary Table
print("\n" + "="*80)
print("MODEL COMPARISON SUMMARY")
print("="*80)
print(f"{'Model':<20} {'RMSE':<12} {'RMSLE':<12} {'MAE':<12} {'R² Score':<12}")
print("-"*80)

for result in results:
    print(f"{result[0]:<20} {result[1]:<12.2f} {result[2]:<12.4f} {result[3]:<12.2f} {result[4]:<12.4f}")

print("="*80)

1. Training Decision Tree...
   RMSE: 797.56, RMSLE: 1.0995, MAE: 588.75, R²: 0.0479

2. Training Random Forest...
   RMSE: 783.54, RMSLE: 1.0939, MAE: 581.90, R²: 0.0811

3. Training XGBoost...
   RMSE: 783.60, RMSLE: 1.0955, MAE: 582.81, R²: 0.0809

MODEL COMPARISON SUMMARY
Model                RMSE         RMSLE        MAE          R² Score    
--------------------------------------------------------------------------------
Decision Tree        797.56       1.0995       588.75       0.0479      
Random Forest        783.54       1.0939       581.90       0.0811      
XGBoost              783.60       1.0955       582.81       0.0809      
