In [2]:
#import modules and load the data
import pandas as pd
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import absolute
from sklearn.preprocessing import MinMaxScaler

#Location of the insurance dataset (raw CSV hosted on GitHub)
url = 'https://raw.githubusercontent.com/Patrick0481/Intro-to-modeling/refs/heads/main/insurance.csv'

#Download the CSV into a DataFrame and preview the first five rows
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
#Prepare the data
df = df.dropna() #remove missings
y = df['charges'] #select Target

#Select features: everything except the target.
#'Unnamed: 0' is the row index that was saved into the CSV (see the df.head() output);
#it carries no information about charges, so it must not be used as a predictor.
#errors='ignore' keeps this cell working if the file is ever re-exported without that column.
X = df.drop('charges', axis=1).drop(columns='Unnamed: 0', errors='ignore')

#Turn sex, smoker, and region into dummy variables
#(drop_first=True removes one level per variable to avoid perfectly collinear dummies)
X = pd.get_dummies(X, drop_first=True)


In [6]:
#Prediction with all features
#Cross-validation scheme: 5 folds, repeated 10 times (the RepeatedKFold default),
#with a fixed seed so the splits are the same on every run
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=420)

#Ordinary linear regression, scored once per fold
lm = LinearRegression()
lmscores = cross_val_score(
    estimator=lm, #type of model
    X=X,          #features
    y=y,          #target
    scoring='neg_mean_absolute_error', #mean absolute error, reported by sklearn as a negative number
    cv=cv,        #the cross-validation scheme defined above
)

#Remove the sign and average over all folds to get the overall mean absolute error
lmMAE = absolute(lmscores).mean()
print('the average prediction error with full data is: %.0f' % lmMAE) #Print the result

the average prediction error with full data is: 4213


In [5]:
#In class exercise 1

#Same linear model and same cross-validation, but without the sex dummy
Xnosex = X.drop(columns='sex_male')
lmscoresnc = cross_val_score(
    estimator=lm, #type of model
    X=Xnosex,     #features without sex
    y=y,          #target
    scoring='neg_mean_absolute_error', #mean absolute error, reported by sklearn as a negative number
    cv=cv,        #the cross-validation scheme defined earlier
)

#Remove the sign and average over all folds to get the overall mean absolute error
lmMAEnc = absolute(lmscoresnc).mean()
print('the average prediction error will cut data is: %.0f' % lmMAEnc) #Print the result

the average prediction error will cut data is: 4209


In [16]:
#Lasso and Ridge: step 1 normalizing the data
columns = X.columns #create index with column names (needed for last step)
scaler = MinMaxScaler() #initiate the scaler

#Scale the data and turn the result back into a dataframe.
#Pass index=X.index as well as the column names: otherwise the new frame gets a
#fresh 0..n-1 index, which no longer matches y's index if dropna() removed any rows.
X = pd.DataFrame(scaler.fit_transform(X), columns=columns, index=X.index)

In [17]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

#Both penalised models use the same penalty strength (alpha=10)
LassoModel = Lasso(alpha=10)
RidgeModel = Ridge(alpha=10)

#Run the cross fold validation again (same as earlier)
#NOTE(review): X was scaled on the full dataset before this step, so each test fold
#influenced the scaler; putting scaler + model in a Pipeline would scale inside each fold.
scoreslasso = cross_val_score(LassoModel, X, y, scoring='neg_mean_absolute_error', cv=cv)
scoresridge = cross_val_score(RidgeModel, X, y, scoring='neg_mean_absolute_error', cv=cv)

#Evaluate the model
#Print the linear-regression baseline from lmMAE (computed in the earlier cell) instead of
#a hard-coded number, so the comparison stays correct if the data or features change
print('the average prediction error with linear regression was: %.0f' % lmMAE)
lassoMAE = mean(absolute(scoreslasso)) #Calculate the overall mean absolute error
print('the average prediction error with lasso is: %.0f' % lassoMAE) #Print the result
ridgeMAE = mean(absolute(scoresridge)) #Calculate the overall mean absolute error
print('the average prediction error with ridge is: %.0f' % ridgeMAE) #Print the result

the average prediction error with linear regression was: 4213
the average prediction error with lasso is: 4200
the average prediction error with ridge is: 4223
