In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pickle

In [2]:
# Step 1: Read the already-preprocessed CSV and separate predictors from the target.
# No additional preprocessing is applied in this cell.
data = pd.read_csv('PreProcessed.csv')

# Target column.
y = data.loc[:, 'charges']
# Predictor columns, in the order the models will receive them.
X = data.loc[:, ['age', 'bmi', 'smoker_yes']]

In [6]:
# Step 2: Hold out 20% of the rows as a test set.
# random_state is pinned so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [7]:
# Step 3: Univariate feature selection with SelectKBest.
# k=2 keeps the two columns that score highest under f_regression.
# The selector is fitted on the training split only, then applied as-is to both splits.
selector = SelectKBest(score_func=f_regression, k=2)
selector.fit(X_train, y_train)
X_train_kbest = selector.transform(X_train)
X_test_kbest = selector.transform(X_test)

In [8]:
# Step 4: Instantiate (but do not yet fit) the random forest that will be trained
# on the SelectKBest-reduced features.
model_kbest = RandomForestRegressor(
    n_estimators=10,
    random_state=0,  # fixed seed for repeatable results
)

In [9]:
# Step 5: Fit the forest on the SelectKBest-reduced training features.
# The call is left as the cell's last expression, so the fitted estimator is echoed.
model_kbest.fit(X=X_train_kbest, y=y_train)

RandomForestRegressor(n_estimators=10, random_state=0)

In [10]:
# Step 6: Score the SelectKBest model on the held-out split.
# Predictions use the test features reduced by the same fitted selector.
y_pred_kbest = model_kbest.predict(X=X_test_kbest)
r2_kbest = r2_score(y_true=y_test, y_pred=y_pred_kbest)
mse_kbest = mean_squared_error(y_true=y_test, y_pred=y_pred_kbest)

In [11]:
# Report the SelectKBest test metrics: a heading line, then MSE, then R-squared.
# One print call emits the three lines; each label/value pair is joined by a single space.
print(
    'SelectKBest:',
    ' '.join(['Mean Squared Error:', str(mse_kbest)]),
    ' '.join(['R-squared:', str(r2_kbest)]),
    sep='\n',
)

SelectKBest:
Mean Squared Error: 44798665.88170708
R-squared: 0.7184774933144176


In [12]:
# Step 7: Serialize the fitted SelectKBest forest to disk with pickle.
# NOTE(review): only the forest is written here; the fitted `selector` is not saved by
# this cell, so a consumer of the pickle must supply the same selected columns - confirm
# this is intended.
with open('insurance_charges_model_kbest.pkl', 'wb') as file:
    pickle.dump(obj=model_kbest, file=file)

In [13]:
# Step 8: Recursive Feature Elimination (RFE), ranking features with a random forest.
# n_features_to_select=2 stops the elimination once two columns remain.
# The selector is fitted on the training split only, then applied as-is to both splits.
rfe = RFE(
    estimator=RandomForestRegressor(n_estimators=10, random_state=0),
    n_features_to_select=2,
).fit(X_train, y_train)
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

In [14]:
# Step 9: Instantiate (but do not yet fit) the random forest that will be trained
# on the RFE-reduced features.
model_rfe = RandomForestRegressor(
    n_estimators=10,
    random_state=0,  # fixed seed for repeatable results
)

In [15]:
# Step 10: Fit the forest on the RFE-reduced training features.
# The call is left as the cell's last expression, so the fitted estimator is echoed.
model_rfe.fit(X=X_train_rfe, y=y_train)

RandomForestRegressor(n_estimators=10, random_state=0)

In [16]:
# Step 11: Score the RFE model on the held-out split.
# Predictions use the test features reduced by the same fitted RFE selector.
y_pred_rfe = model_rfe.predict(X=X_test_rfe)
r2_rfe = r2_score(y_true=y_test, y_pred=y_pred_rfe)
mse_rfe = mean_squared_error(y_true=y_test, y_pred=y_pred_rfe)

In [17]:
# Report the RFE test metrics: a heading line, then MSE, then R-squared.
# One print call emits the three lines; each label/value pair is joined by a single space.
print(
    'Recursive Feature Elimination (RFE):',
    ' '.join(['Mean Squared Error:', str(mse_rfe)]),
    ' '.join(['R-squared:', str(r2_rfe)]),
    sep='\n',
)

Recursive Feature Elimination (RFE):
Mean Squared Error: 43002495.791517116
R-squared: 0.7297649344975807


In [18]:
# Step 12: Serialize the fitted RFE forest to disk with pickle.
# NOTE(review): only the forest is written here; the fitted `rfe` selector is not saved by
# this cell, so a consumer of the pickle must supply the same selected columns - confirm
# this is intended.
with open('insurance_charges_model_rfe.pkl', 'wb') as file:
    pickle.dump(obj=model_rfe, file=file)

In [19]:
# Step 13: Fit a random forest on every column of X_train (no feature-selection step).
# NOTE(review): although this step is labelled "Feature Importance", the model's
# feature_importances_ attribute is never read in this cell; it is a plain fit on all columns.
model_importance = RandomForestRegressor(
    n_estimators=10,
    random_state=0,  # fixed seed for repeatable results
)
# Left as the cell's last expression so the fitted estimator is echoed.
model_importance.fit(X=X_train, y=y_train)

RandomForestRegressor(n_estimators=10, random_state=0)

In [20]:
# Step 14: Score the all-features model on the held-out split.
y_pred_importance = model_importance.predict(X=X_test)
r2_importance = r2_score(y_true=y_test, y_pred=y_pred_importance)
mse_importance = mean_squared_error(y_true=y_test, y_pred=y_pred_importance)

In [21]:
# Report the test metrics of the model trained on all columns: heading, MSE, R-squared.
# One print call emits the three lines; each label/value pair is joined by a single space.
print(
    'Feature Importance:',
    ' '.join(['Mean Squared Error:', str(mse_importance)]),
    ' '.join(['R-squared:', str(r2_importance)]),
    sep='\n',
)

Feature Importance:
Mean Squared Error: 24775764.083663587
R-squared: 0.8443048454098783


In [23]:
# Step 15: Serialize the forest trained on all columns of X_train to disk with pickle.
# This model takes the unreduced feature matrix, so no selector accompanies it.
with open('finalized_model.pkl', 'wb') as file:
    pickle.dump(obj=model_importance, file=file)