In [1]:
import pandas as pd
import numpy as np

In [2]:
df_train = pd.read_csv(r"D:\prakash\Smart_Premium_New\playground-series-s4e12 (3)\train.csv")

In [3]:
# Check current dataset size
print(f"Original dataset size: {df_train.shape}")
print()

# Calculate premium to income ratio
df_train['Premium_to_Income_Ratio'] = (df_train['Premium Amount'] / df_train['Annual Income']) * 100

# Remove records where Premium > 50% of Annual Income
df_train_cleaned = df_train[df_train['Premium_to_Income_Ratio'] <= 50].copy()

# Drop the temporary ratio column
df_train_cleaned = df_train_cleaned.drop('Premium_to_Income_Ratio', axis=1)
df_train = df_train.drop('Premium_to_Income_Ratio', axis=1)

print(f"Cleaned dataset size: {df_train_cleaned.shape}")
print(f"Removed records: {len(df_train) - len(df_train_cleaned)}")
print()

# Update df_train to cleaned version
df_train = df_train_cleaned.copy()

print(f"Final dataset size: {df_train.shape}")

Original dataset size: (1200000, 21)

Cleaned dataset size: (1051855, 21)
Removed records: 148145

Final dataset size: (1051855, 21)


In [4]:
import pandas as pd

# For numerical columns → replace missing values with median
num_cols = df_train.select_dtypes(include=['number']).columns
df_train[num_cols] = df_train[num_cols].fillna(df_train[num_cols].median())

# For categorical columns → replace missing values with mode (most frequent value)
cat_cols = df_train.select_dtypes(exclude=['number']).columns
df_train[cat_cols] = df_train[cat_cols].apply(lambda x: x.fillna(x.mode()[0]))

print("✅ Missing values handled successfully!")


✅ Missing values handled successfully!


In [5]:
df_train.isnull().sum()

id                      0
Age                     0
Gender                  0
Annual Income           0
Marital Status          0
Number of Dependents    0
Education Level         0
Occupation              0
Health Score            0
Location                0
Policy Type             0
Previous Claims         0
Vehicle Age             0
Credit Score            0
Insurance Duration      0
Policy Start Date       0
Customer Feedback       0
Smoking Status          0
Exercise Frequency      0
Property Type           0
Premium Amount          0
dtype: int64

In [6]:
# Take a copy of 'id' column before dropping it
id_copy = df_train['id'].copy()

print("✅ ID column copied successfully!")
print(id_copy.head())


✅ ID column copied successfully!
0    0
1    1
2    2
3    3
4    4
Name: id, dtype: int64


In [7]:
df_train = df_train.drop(columns=['id'])

In [8]:
df_train['Policy Start Date'] = pd.to_datetime(df_train['Policy Start Date'], errors='coerce')

In [9]:
df_train['Policy_Year'] = df_train['Policy Start Date'].dt.year
df_train['Policy_Month'] = df_train['Policy Start Date'].dt.month
df_train['Policy_Day'] = df_train['Policy Start Date'].dt.day

In [10]:
df_train = df_train.drop(columns=['Policy Start Date'])

In [11]:
df_train.duplicated().sum()

np.int64(0)

In [None]:
# from sklearn.preprocessing import LabelEncoder

# # Identify categorical columns
# cat_cols = df_train.select_dtypes(include=['object']).columns

# # Initialize label encoder
# le = LabelEncoder()

# # Apply label encoding to each categorical column
# for col in cat_cols:
#     df_train[col] = le.fit_transform(df_train[col])

# print("✅ All categorical columns encoded successfully!")
# print("Encoded columns:", list(cat_cols))





✅ All categorical columns encoded successfully!
Encoded columns: ['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location', 'Policy Type', 'Customer Feedback', 'Smoking Status', 'Exercise Frequency', 'Property Type']


In [11]:
from sklearn.preprocessing import LabelEncoder
import joblib

# Identify categorical columns
cat_cols = df_train.select_dtypes(include=['object']).columns

# Dictionary to store encoders
label_encoders = {}

# Apply label encoding to each categorical column
for col in cat_cols:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col])
    label_encoders[col] = le   # store encoder for that column

print("✅ All categorical columns encoded successfully!")
print("Encoded columns:", list(cat_cols))

# Save all encoders in one .pkl file
joblib.dump(label_encoders, "label_encoders.pkl")
print("💾 Saved all label encoders to label_encoders.pkl")


✅ All categorical columns encoded successfully!
Encoded columns: ['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location', 'Policy Type', 'Customer Feedback', 'Smoking Status', 'Exercise Frequency', 'Property Type']
💾 Saved all label encoders to label_encoders.pkl


In [13]:
# Select only numeric columns
num_cols = df_train.select_dtypes(include=['number']).columns

# Loop through each numeric column
for col in num_cols:
    Q1 = df_train[col].quantile(0.25)
    Q3 = df_train[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = df_train[(df_train[col] < lower_bound) | (df_train[col] > upper_bound)]

    print(f"{col}: {len(outliers)} outliers")


Age: 0 outliers
Gender: 0 outliers
Annual Income: 70612 outliers
Marital Status: 0 outliers
Number of Dependents: 0 outliers
Education Level: 0 outliers
Occupation: 0 outliers
Health Score: 0 outliers
Location: 0 outliers
Policy Type: 0 outliers
Previous Claims: 52509 outliers
Vehicle Age: 0 outliers
Credit Score: 0 outliers
Insurance Duration: 0 outliers
Customer Feedback: 0 outliers
Smoking Status: 0 outliers
Exercise Frequency: 0 outliers
Property Type: 0 outliers
Premium Amount: 48924 outliers
Policy_Year: 0 outliers
Policy_Month: 0 outliers
Policy_Day: 0 outliers


In [12]:
import numpy as np

# List of columns with outliers
cols_with_outliers = ['Annual Income', 'Previous Claims']

# Apply log(1 + x) transform to handle zeros safely
for col in cols_with_outliers:
    df_train[col] = np.log1p(df_train[col])

print("✅ Log transformation applied to outlier columns successfully!")


✅ Log transformation applied to outlier columns successfully!


In [13]:
from sklearn.model_selection import train_test_split


X = df_train.drop(columns=['Premium Amount'])
y = df_train['Premium Amount']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,      
    random_state=42     
)

In [14]:
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [15]:
import joblib

# Save the fitted scaler to a file
joblib.dump(scaler, "scaler.pkl")

print("✅ Scaler saved successfully as 'scaler.pkl'")


✅ Scaler saved successfully as 'scaler.pkl'


In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

lr_model = LinearRegression()

# Training the model on scaled training data
lr_model.fit(X_train_scaled, y_train)

# Make predictions on scaled test data
y_pred_lr = lr_model.predict(X_test_scaled)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred_lr))
mae = mean_absolute_error(y_test, y_pred_lr)
r2 = r2_score(y_test, y_pred_lr)

print("✅ Linear Regression Model Results:")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R² Score: {r2:.4f}")


✅ Linear Regression Model Results:
Root Mean Squared Error (RMSE): 809.5278
Mean Absolute Error (MAE): 616.4562
R² Score: 0.0191


In [17]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Initialize the Decision Tree model with hyperparameters to prevent overfitting
dt_model = DecisionTreeRegressor(
    max_depth=10,            # Limit tree depth
    min_samples_split=20,    # Minimum samples required to split a node
    min_samples_leaf=10,     # Minimum samples required in a leaf node
    random_state=42
)

# Train the model on original (unscaled) training data
dt_model.fit(X_train, y_train)

# Make predictions on original (unscaled) test data
y_pred_dt = dt_model.predict(X_test)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred_dt))
mae = mean_absolute_error(y_test, y_pred_dt)
r2 = r2_score(y_test, y_pred_dt)

print("✅ Decision Tree Regression Model Results:")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R² Score: {r2:.4f}")


✅ Decision Tree Regression Model Results:
Root Mean Squared Error (RMSE): 784.4845
Mean Absolute Error (MAE): 581.9518
R² Score: 0.0788


In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np


rf_model = RandomForestRegressor(
    n_estimators=100,        # Number of trees in the forest
    max_depth=10,            # Maximum depth of each tree
    min_samples_split=20,    # Minimum samples required to split a node
    min_samples_leaf=10,     # Minimum samples required in a leaf node
    random_state=42,
    n_jobs=-1                # Use all CPU cores for faster training
)

# Train the model on original (unscaled) training data
rf_model.fit(X_train, y_train)

# Make predictions on original (unscaled) test data
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
mae = mean_absolute_error(y_test, y_pred_rf)
r2 = r2_score(y_test, y_pred_rf)

print("✅ Random Forest Regression Model Results:")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R² Score: {r2:.4f}")


✅ Random Forest Regression Model Results:
Root Mean Squared Error (RMSE): 782.5511
Mean Absolute Error (MAE): 580.9309
R² Score: 0.0834


In [18]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

xgb_model = XGBRegressor(
    n_estimators=200,        # Number of boosting rounds (trees)
    learning_rate=0.05,      # Step size shrinkage
    max_depth=10,            # Maximum depth of each tree
    subsample=0.8,           # Randomly sample 80% of data for each tree
    colsample_bytree=0.8,    # Use 80% of features per tree
    reg_lambda=1.0,          # L2 regularization
    random_state=42,
    n_jobs=-1,               # Use all CPU cores
    verbosity=0              # Suppress training logs
)

# Train the model on original (unscaled) training data
xgb_model.fit(X_train, y_train)

# Make predictions on original (unscaled) test data
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
mae = mean_absolute_error(y_test, y_pred_xgb)
r2 = r2_score(y_test, y_pred_xgb)

print("✅ XGBoost Regression Model Results:")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R² Score: {r2:.4f}")


✅ XGBoost Regression Model Results:
Root Mean Squared Error (RMSE): 783.6993
Mean Absolute Error (MAE): 583.6715
R² Score: 0.0807


In [22]:
import joblib

# ✅ Save the best Random Forest model
joblib.dump(rf_model, "randomforest_model.pkl")


print("✅ Random Forest model saved successfully!")


✅ Random Forest model saved successfully!


In [None]:
rf_model = joblib.load("randomforest_model.pkl")

In [23]:
import joblib

joblib.dump(X.columns.tolist(), "regression_feature_cols.pkl")
print("✅ Feature columns saved as regression_feature_cols.pkl")


✅ Feature columns saved as regression_feature_cols.pkl


In [19]:
rf_model = joblib.load("randomforest_model.pkl")
feature_cols = joblib.load("regression_feature_cols.pkl")

In [20]:
X_test = df_train.reindex(columns=feature_cols, fill_value=0)


In [21]:
predictions = rf_model.predict(X_test)


In [22]:
submission = pd.DataFrame({
    "id": id_copy,
    "Premium Amount": np.round(predictions, 6)
})

submission.to_csv("submission.csv", index=False)
print("📁 submission.csv created successfully!")
print(submission.head(10))


📁 submission.csv created successfully!
   id  Premium Amount
0   0     1216.513681
1   1     1052.359743
2   2     1107.406885
3   3     1107.096844
4   4     1065.489001
5   5     1103.737773
6   6     1102.322442
7   7     1111.169333
8   8      316.155418
9   9      256.029272
