In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [36]:
df = pd.read_csv("F:\ML-PROJECTS\Regression\insurance.csv")

In [37]:
target_feature = "charges"

In [38]:
# Define the numerical and categorical columns

numeric_features = [feature for feature in df.columns if df[feature].dtype != "O"]
categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O']


# Print columns

print("We have {} numerical feature : {}".format(len(numeric_features),numeric_features))
print("We have {} categorical_feautures : {}".format(len(categorical_features),categorical_features))

We have 4 numerical feature : ['age', 'bmi', 'children', 'charges']
We have 3 categorical_feautures : ['sex', 'smoker', 'region']


In [39]:
df1 = df.copy()


In [40]:
 import statsmodels.api as sm

 import pandas

 from patsy import dmatrices


In [41]:
# Checking the multicolinearty of the data




for i in range(0,len(numeric_features)):
    x = df1.loc[:,df1.columns==numeric_features[i]]
    y = df1.charges

    model = sm.OLS(x,y)
    result = model.fit()
    rsq = result.rsquared
    vif = round(1/ (1-rsq), 2)

    print(
        "Varience Inflation Factor of {} column is {} \n".format(
            numeric_features[i],vif
        )
    )


Varience Inflation Factor of age column is 2.4 

Varience Inflation Factor of bmi column is 2.29 

Varience Inflation Factor of children column is 1.39 

Varience Inflation Factor of charges column is inf 



  vif = round(1/ (1-rsq), 2)


In [67]:
# Model Building


from sklearn.preprocessing import OneHotEncoder, StandardScaler,PowerTransformer,LabelEncoder, OrdinalEncoder , RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.model_selection import RandomizedSearchCV,train_test_split,GridSearchCV
from xgboost import XGBRegressor
from catboost import Pool, CatBoostRegressor
from sklearn.svm import SVR


In [43]:
df1.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [44]:
X = df.drop(columns=['charges'], axis=1)
y =np.abs(df['charges'])

In [11]:
y = np.log1p(y)

In [71]:
numeric_features_pipline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler())

])

categorical_features_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
])



preprocessor = ColumnTransformer(
    [
        ('Numeric Pipeline', numeric_features_pipline,numeric_features),
        ('categorical features pipeline', categorical_features_pipeline, categorical_features)
    ]

)

In [72]:
print(f"These are categorical column {categorical_features}")
print(numeric_features)

These are categorical column ['sex', 'smoker', 'region']
['age', 'bmi']


In [73]:
numeric_features=numeric_features[0:2]

In [74]:
numeric_features

['age', 'bmi']

In [75]:
X_train, X_test , y_train, y_test = train_test_split(X,y, test_size=0.2,random_state=42)

In [76]:
X_train.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region'], dtype='object')

In [77]:
X_test.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region'], dtype='object')

In [78]:
numeric_features

['age', 'bmi']

In [79]:
print("Columns in X_train:")
print(X_train.columns)


Columns in X_train:
Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region'], dtype='object')


In [80]:
X_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
560,46,female,19.95,2,no,northwest
1285,47,female,24.32,0,no,northeast
1142,52,female,24.86,0,no,southeast
969,39,female,34.32,5,no,southeast
486,54,female,21.47,3,no,northwest


In [81]:
X_train= preprocessor.fit_transform(X_train)

In [82]:
X_test = preprocessor.fit_transform(X_test)

In [21]:
print(X_train.columns)

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region'], dtype='object')


In [56]:
# Model Selection

def evaluate_req(true,predicted):
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true,predicted)
    rmse = np.sqrt(mean_squared_error(true,predicted))
    r2_square = r2_score(true,predicted)
    return mae, rmse, r2_score

In [68]:
models = {
    "LinearRegression": LinearRegression(),
    "K-Neigobrs": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose= False, max_depth=5),
    "Adaboost Regressor": AdaBoostRegressor(),
    "SVR": SVR()
}

In [83]:
# Fuction which can evaluate models and return a report

def evaluate_models(X_train,X_test,y_train, y_test, models):
    """
    This function takes in  X and y models dictory as input
    it splits the data into train test split
    iterates through the given model dictonary and evaluates the metrics
    Retruns : Dataframe which contains report of all models metrics with cost
    """

    models_list = []

    r2_list = []

    for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(X_train,X_test)

        # Make the prediction

        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Eualuate the train and test dataset

        model_train_mae , model_train_rmse , model_train_r2 = evaluate_req(y_train,y_train_pred)

        model_test_mae, model_test_rmse, model_test_r2 = evaluate_req(y_test,y_test_pred)

        print(list(models.keys())[i])
        models_list.append(list(models.keys())[i])

        print("Model performance for training set")
        print("Root Mean squared Error : {:.4f}".format(model_train_rmse))
        print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
        print("- R2 Score: {:.4f}".format(model_test_r2))
        r2_list.append(model_test_r2)

        print("="*35)
        print('\n')

    report = pd.DataFrame(list(zip(models_list, r2_list)), columns=['Model Name','r2_score']).sort_values(by['r2_score'],ascending=True)

    return report       

In [84]:
base_report = evaluate_models(X_train,X_test,y_train,y_test,models)

ValueError: could not convert string to float: 'female'

In [86]:
preprocessor