In [77]:
import pandas as pd
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LinearRegression 
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import root_mean_squared_error, r2_score

In [45]:
df = pd.read_csv("src/insurance.csv")

# Check for null values in dataset

In [46]:
print("Taille du df : ", df.shape, "\n")
print("Présence de valeur nul:", df.isnull().any())

Taille du df :  (1338, 7) 

Présence de valeur nul: age         False
sex         False
bmi         False
children    False
smoker      False
region      False
charges     False
dtype: bool


In [66]:
X = df.drop("charges", axis=1)
y = df["charges"]
X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.900,0,yes,southwest
1,18,male,33.770,1,no,southeast
2,28,male,33.000,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.880,0,no,northwest
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest
1334,18,female,31.920,0,no,northeast
1335,18,female,36.850,0,no,southeast
1336,21,female,25.800,0,no,southwest


In [59]:
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns.tolist()
categorical_cols

['sex', 'smoker', 'region']

In [49]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

""" Pour Utiliser cette function mettre 'from pipeline import pipeline_create'
"""
def pipeline_create(X_train, model):  

    numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = X_train.select_dtypes(include=['object', 'bool']).columns.tolist()


    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])

    # Categorical pipeline
    cat_pipeline = Pipeline([
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Full preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_pipeline, numerical_cols),
            ('cat', cat_pipeline, categorical_cols),
        ])

    full_pipeline = make_pipeline(preprocessor, model)
    return full_pipeline

In [50]:
kf = KFold(n_splits=5, shuffle = True, random_state=42)

In [68]:
param_grid = {"fit_intercept" : [True, False]}
lr_model = LinearRegression()
model = pipeline_create(X, lr_model)


In [67]:
model = GridSearchCV(model, cv=kf, param_grid=param_grid)

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [70]:
model.fit(X_train, y_train)

In [75]:
y_pred = model.predict(X_test)

In [78]:
root_mean_squared_error(y_test, y_pred)
r2_score(y_test, y_pred)

0.7835929767120723