In [2]:
# Attach Google Drive to the Colab filesystem so its files can be read by path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [9]:
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
warnings.filterwarnings("ignore")

In [5]:
# Read the claims CSV from the mounted Drive into a DataFrame.
# (Once Drive is mounted, the file pane on the left shows the folder tree,
# which is handy for confirming the path below.)
filepath = "/content/drive/MyDrive/claims_data (1).csv"
df = pd.read_csv(filepath_or_buffer=filepath)

In [11]:
# Quick look at the raw frame: its size, the first rows, the column list, and
# summary statistics of the regression target ('loss').
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
# The leading "\n" puts a blank line between this header and the table above,
# matching the other section headers in this cell.
print("\nColumn names:")
print(df.columns.tolist())
print("\nTarget variable distribution:")
print(df['loss'].describe())

Dataset shape: (188318, 132)

First few rows:
   id cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9  ...     cont6     cont7  \
0   1    A    B    A    B    A    A    A    A    B  ...  0.718367  0.335060   
1   2    A    B    A    A    A    A    A    A    B  ...  0.438917  0.436585   
2   5    A    B    A    A    B    A    A    A    B  ...  0.289648  0.315545   
3  10    B    B    A    B    A    A    A    A    B  ...  0.440945  0.391128   
4  11    A    B    A    B    A    A    A    A    B  ...  0.178193  0.247408   

     cont8    cont9   cont10    cont11    cont12    cont13    cont14     loss  
0  0.30260  0.67135  0.83510  0.569745  0.594646  0.822493  0.714843  2213.18  
1  0.60087  0.35127  0.43919  0.338312  0.366307  0.611431  0.304496  1283.60  
2  0.27320  0.26076  0.32446  0.381398  0.373424  0.195709  0.774425  3005.09  
3  0.31796  0.32128  0.44467  0.327915  0.321570  0.605077  0.602642   939.85  
4  0.24564  0.22089  0.21230  0.204687  0.202213  0.246011  0.432606  2763.85  

In [None]:
# Data Preprocessing
def preprocess_data(df):
    """Split the claims frame into features/target and make the features numeric.

    Steps, in order:
      1. Copy ``df`` and separate the ``'loss'`` column (target) from the rest.
      2. Classify feature columns by name prefix: ``cat*`` are categorical,
         ``cont*`` are continuous.
      3. Impute missing values: column mean for continuous columns, most
         frequent value for categorical columns.
      4. Label-encode every categorical column (values are cast to ``str``).

    Imputation happens *before* encoding so that a missing categorical value is
    replaced by the column mode instead of becoming its own encoded category.
    All statistics (means, modes) are computed over every row passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain a ``'loss'`` column. It is not modified.

    Returns
    -------
    X : pandas.DataFrame
        Every column of ``df`` except ``'loss'``, with categorical columns
        replaced by integer codes and missing numeric values filled.
    y : pandas.Series
        The ``'loss'`` column.
    cat_cols : list of str
        Names of the columns starting with ``'cat'``.
    cont_cols : list of str
        Names of the columns starting with ``'cont'``.
    label_encoders : dict
        Maps each categorical column name to its fitted ``LabelEncoder``.

    Raises
    ------
    KeyError
        If ``df`` has no ``'loss'`` column.
    """
    data = df.copy()  # work on a copy so the caller's frame stays untouched

    # Separate features and target
    X = data.drop('loss', axis=1)
    y = data['loss']

    # Column roles are inferred purely from the name prefix.
    # NOTE(review): columns matching neither prefix (e.g. an 'id' column) stay
    # in X and are therefore fed to the model - confirm that is intended.
    cat_cols = [col for col in X.columns if col.startswith('cat')]
    cont_cols = [col for col in X.columns if col.startswith('cont')]

    print(f"Categorical columns: {len(cat_cols)}")
    print(f"Continuous columns: {len(cont_cols)}")

    # Continuous features: fill gaps with the column mean.
    if cont_cols:
        X[cont_cols] = X[cont_cols].fillna(X[cont_cols].mean())

    # Categorical features: fill gaps with the most frequent value, then encode.
    label_encoders = {}
    for col in cat_cols:
        if X[col].isna().any():
            modes = X[col].mode()
            # An all-missing column has no mode; it is left as-is and ends up
            # encoded as a single category (code 0) below.
            if not modes.empty:
                X[col] = X[col].fillna(modes.iloc[0])

        le = LabelEncoder()
        # astype(str) gives the encoder a uniform string dtype to sort and map.
        X[col] = le.fit_transform(X[col].astype(str))
        label_encoders[col] = le

    # Any remaining numeric columns (neither cat* nor cont*) get a mean fill;
    # numeric_only avoids an error if a non-numeric column is still present.
    X = X.fillna(X.mean(numeric_only=True))

    return X, y, cat_cols, cont_cols, label_encoders

# Build the model inputs from the raw frame.
X, y, cat_cols, cont_cols, label_encoders = preprocess_data(df)

print(f"Final feature matrix shape: {X.shape}")

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Standardised copies of both splits. Only the continuous columns are rescaled;
# the scaler is fitted on the training rows and then applied to the test rows.
# NOTE(review): the Random Forest fits further down use X_train / X_test, not
# these scaled copies - confirm whether the scaled frames are still needed.
scaler = StandardScaler()
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()

if cont_cols:
    cont_indices = [X.columns.get_loc(name) for name in cont_cols]
    train_cont = X_train.iloc[:, cont_indices]
    test_cont = X_test.iloc[:, cont_indices]
    X_train_scaled.iloc[:, cont_indices] = scaler.fit_transform(train_cont)
    X_test_scaled.iloc[:, cont_indices] = scaler.transform(test_cont)

# Baseline: a 100-tree forest with otherwise default settings, using every CPU core.
print("\nTraining Basic Random Forest Model...")
rf_basic = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_basic.fit(X_train, y_train)

# Score the baseline on the held-out rows.
y_pred_basic = rf_basic.predict(X_test)

r2_basic = r2_score(y_test, y_pred_basic)
mae_basic = mean_absolute_error(y_test, y_pred_basic)
mse_basic = mean_squared_error(y_test, y_pred_basic)
rmse_basic = np.sqrt(mse_basic)  # square root puts the error back in target units

print("\nBasic Random Forest Performance:")
print(f"RMSE: {rmse_basic:.2f}")
print(f"MAE: {mae_basic:.2f}")
print(f"R^2 Score: {r2_basic:.4f}")

# Hyperparameter tuning: exhaustive grid search with 3-fold cross-validation,
# scored by (negated) mean squared error.
print("\nPerforming Hyperparameter Tuning...")
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    # 1.0 = consider every feature at each split. This is what the legacy
    # 'auto' value meant for regressors; 'auto' was deprecated in scikit-learn
    # 1.1 and removed in 1.3, where it makes those grid candidates fail.
    'max_features': [1.0, 'sqrt']
}

rf_tuned = RandomForestRegressor(random_state=42, n_jobs=-1)

# The grid has 48 candidates x 3 folds, so search on a 1000-row subsample of
# the training set when it is larger than that; otherwise use all of it.
if len(X_train) > 1000:
    X_train_sample, _, y_train_sample, _ = train_test_split(
        X_train, y_train, train_size=1000, random_state=42
    )
else:
    X_train_sample, y_train_sample = X_train, y_train

grid_search = GridSearchCV(
    rf_tuned, param_grid, cv=3, scoring='neg_mean_squared_error',
    n_jobs=-1, verbose=1
)
grid_search.fit(X_train_sample, y_train_sample)

print(f"Best parameters: {grid_search.best_params_}")

# Take the winning configuration from the grid search and refit it on the
# complete training split (the search itself only saw a subsample).
rf_final = grid_search.best_estimator_
rf_final.fit(X_train, y_train)

# Held-out predictions from the tuned forest.
y_pred_final = rf_final.predict(X_test)

# Same metrics as for the baseline model.
r2_final = r2_score(y_test, y_pred_final)
mae_final = mean_absolute_error(y_test, y_pred_final)
mse_final = mean_squared_error(y_test, y_pred_final)
rmse_final = np.sqrt(mse_final)

print("\nTuned Random Forest Performance:")
print(f"RMSE: {rmse_final:.2f}")
print(f"MAE: {mae_final:.2f}")
print(f"R² Score: {r2_final:.4f}")

# Impurity-based importances of the tuned forest, one row per feature,
# ordered from most to least important.
print("\nAnalyzing Feature Importance...")
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': rf_final.feature_importances_}
)
feature_importance = feature_importance.sort_values('importance', ascending=False)

print("\nTop 20 Most Important Features:")
print(feature_importance.head(20))

# Diagnostic figure: a 2x2 grid of panels summarising the tuned model.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
ax_importance, ax_fit = axes[0]
ax_resid, ax_target = axes[1]

# Panel 1 - the 15 largest feature importances, biggest at the top.
top_features = feature_importance.head(15)
ax_importance.barh(top_features['feature'], top_features['importance'])
ax_importance.set_xlabel('Feature Importance')
ax_importance.set_title('Top 15 Most Important Features')
ax_importance.invert_yaxis()

# Panel 2 - predictions against the truth; the dashed red diagonal marks a perfect fit.
loss_range = [y_test.min(), y_test.max()]
ax_fit.scatter(y_test, y_pred_final, alpha=0.5)
ax_fit.plot(loss_range, loss_range, 'r--', lw=2)
ax_fit.set_xlabel('Actual Loss')
ax_fit.set_ylabel('Predicted Loss')
ax_fit.set_title(f'Actual vs Predicted (R² = {r2_final:.3f})')

# Panel 3 - residuals (actual minus predicted) against the predicted value.
residuals = y_test - y_pred_final
ax_resid.scatter(y_pred_final, residuals, alpha=0.5)
ax_resid.axhline(y=0, color='r', linestyle='--')
ax_resid.set_xlabel('Predicted Values')
ax_resid.set_ylabel('Residuals')
ax_resid.set_title('Residual Plot')

# Panel 4 - histogram of the target over the whole data set.
ax_target.hist(y, bins=50, alpha=0.7, edgecolor='black')
ax_target.set_xlabel('Loss Amount')
ax_target.set_ylabel('Frequency')
ax_target.set_title('Distribution of Loss Amounts')

fig.tight_layout()
plt.show()

# 5-fold cross-validation of the tuned configuration over the full data set.
# The scorer returns negated MSE, so flip the sign before taking the root.
print("\nPerforming Cross-Validation...")
cv_scores = cross_val_score(rf_final, X, y, cv=5, scoring='neg_mean_squared_error')
cv_rmse_scores = np.sqrt(-cv_scores)

cv_rmse_mean = cv_rmse_scores.mean()
cv_rmse_spread = cv_rmse_scores.std() * 2  # reported as +/- two standard deviations

print(f"Cross-Validation RMSE scores: {cv_rmse_scores}")
print(f"Mean CV RMSE: {cv_rmse_mean:.2f} (+/- {cv_rmse_spread:.2f})")

# Permutation importance on the held-out rows: each feature is shuffled
# 10 times and the mean effect on the model's score is recorded.
print("\nCalculating Permutation Importance...")
perm_importance = permutation_importance(
    rf_final, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1
)

perm_importance_df = (
    pd.DataFrame({'feature': X.columns, 'importance': perm_importance.importances_mean})
    .sort_values('importance', ascending=False)
)

print("\nTop 10 Features by Permutation Importance:")
print(perm_importance_df.head(10))

# Total impurity-based importance carried by each feature family.
cat_mask = feature_importance['feature'].isin(cat_cols)
cont_mask = feature_importance['feature'].isin(cont_cols)
cat_importance = feature_importance.loc[cat_mask, 'importance'].sum()
cont_importance = feature_importance.loc[cont_mask, 'importance'].sum()

print(f"\nCategorical Features Total Importance: {cat_importance:.3f}")
print(f"Continuous Features Total Importance: {cont_importance:.3f}")

# Headline findings: the strongest features within each family.
banner = "=" * 50
print("\n" + banner)
print("BUSINESS INSIGHTS")
print(banner)

# feature_importance is already sorted, so head(5) of each subset is its top 5.
top_cat_features = feature_importance[feature_importance['feature'].isin(cat_cols)].head(5)
print("\nTop 5 Most Important Categorical Features:")
for name, score in zip(top_cat_features['feature'], top_cat_features['importance']):
    print(f"  {name}: {score:.4f}")

top_cont_features = feature_importance[feature_importance['feature'].isin(cont_cols)].head(5)
print("\nTop 5 Most Important Continuous Features:")
for name, score in zip(top_cont_features['feature'], top_cont_features['importance']):
    print(f"  {name}: {score:.4f}")

# Persist the full importance table (all features, not just the top ones).
feature_importance.to_csv('random_forest_feature_importance.csv', index=False)
print("\nFeature importance saved to 'random_forest_feature_importance.csv'")

# Model summary: final held-out metrics plus the size of the fitted model.
print("\n" + "="*50)
print("MODEL SUMMARY")
print("="*50)
print(f"Final R² Score: {r2_final:.4f}")
print(f"Final RMSE: {rmse_final:.2f}")
print(f"Final MAE: {mae_final:.2f}")
print(f"Number of features used: {len(X.columns)}")
print(f"Number of trees in forest: {rf_final.n_estimators}")

# Persist the tuned model together with the fitted preprocessing objects
# (per-column label encoders and the continuous-feature scaler) so they can be
# reloaded later. joblib is imported with the other dependencies at the top of
# the notebook.
joblib.dump(rf_final, 'random_forest_claims_model.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')
joblib.dump(scaler, 'feature_scaler.pkl')
print("\nModel and preprocessing objects saved to disk.")




Categorical columns: 116
Continuous columns: 14
Final feature matrix shape: (188318, 131)
Training set: (150654, 131)
Test set: (37664, 131)

Training Basic Random Forest Model...


# New Section