In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

insurance2 = pd.read_csv('insurance2.csv')
insurance3r2 = pd.read_csv('insurance3r2.csv')

print("insurance2 dataset:")
print(insurance2.head())

print("\ninsurance3r2 dataset:")
print(insurance3r2.head())

df = pd.concat([insurance2, insurance3r2], ignore_index=True)

print("\nMissing values in each column:")
print(df.isnull().sum())

print("\nData types of columns:")
print(df.dtypes)

print("\nColumn names:")
print(df.columns)  
target_column = 'ClaimApproved'  
if target_column not in df.columns:
    raise KeyError(f"Target column '{target_column}' not found in the DataFrame")

X = df.drop(columns=[target_column])
y = df[target_column]


categorical_features = ['Gender', 'MaritalStatus', 'Occupation'] 
numerical_features = ['Age', 'Income', 'ClaimAmount']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


insurance2 dataset:
   age  sex     bmi  children  smoker  region      charges  insuranceclaim
0   19    0  27.900         0       1       3  16884.92400               1
1   18    1  33.770         1       0       2   1725.55230               1
2   28    1  33.000         3       0       2   4449.46200               0
3   33    1  22.705         0       0       1  21984.47061               0
4   32    1  28.880         0       0       1   3866.85520               1

insurance3r2 dataset:
   age  sex     bmi  steps  children  smoker  region      charges  \
0   19    0  27.900   3009         0       1       3  16884.92400   
1   18    1  33.770   3008         1       0       2   1725.55230   
2   28    1  33.000   3009         3       0       2   4449.46200   
3   33    1  22.705  10009         0       0       1  21984.47061   
4   32    1  28.880   8010         0       0       1   3866.85520   

   insuranceclaim  
0               1  
1               1  
2               0  
3           

KeyError: "Target column 'ClaimApproved' not found in the DataFrame"