In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the data from the csv file into a DataFrame object.
# The path is relative, so 'insurance.csv' has to be in the notebook's working directory.
dataset = pd.read_csv('insurance.csv')

# The data set comes from here:
# https://github.com/stedy/Machine-Learning-with-R-datasets
# It records age, sex, body mass index (bmi), number of children, whether a person smokes,
# where in the US they live, and the costs of their health insurance.
# Take a look at the raw csv file, or at the first rows displayed here:
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
# DataFrames have methods producing the standard summary statistics of their data,
# e.g. count, mean, standard deviation, min/max and quartiles:
print(dataset.describe())
# Only the numerical columns are shown, because 'mean' etc. makes no sense for the
# text columns (sex, smoker, region).

# ----- See the DataFrame documentation for more:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html

               age          bmi     children       charges
count  1338.000000  1338.000000  1338.000000   1338.000000
mean     39.207025    30.663397     1.094918  13270.422265
std      14.049960     6.098187     1.205493  12110.011237
min      18.000000    15.960000     0.000000   1121.873900
25%      27.000000    26.296250     0.000000   4740.287150
50%      39.000000    30.400000     1.000000   9382.033000
75%      51.000000    34.693750     2.000000  16639.912515
max      64.000000    53.130000     5.000000  63770.428010


In [None]:
# Build two new objects by picking columns out of the full table:
#   X - the numerical input features (one row per person)
#   y - the quantity we want to explain (the insurance charges)
# We will suppose that the numbers in y depend linearly on those in X.
X = dataset.loc[:, ['age', 'bmi', 'children']]
y = dataset.loc[:, 'charges']

In [None]:
# Preview the first rows of the feature table X.
X.head()

Unnamed: 0,age,bmi,children
0,19,27.9,0
1,18,33.77,1
2,28,33.0,3
3,33,22.705,0
4,32,28.88,0


In [None]:
# Preview the first entries of the target column y (the insurance charges).
y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

In [None]:
# Linear regression with scikit-learn: build a LinearRegression object and train it on our data.
from sklearn.linear_model import LinearRegression

# fit() performs the regression, i.e. it computes the 'best' linear function of the
# columns of X that approximates y. Each row of X (a 3-tuple: age, bmi, children) is
# interpreted as one input and the matching entry of y as its output; together they
# are the training data. fit() returns the fitted estimator itself, so creating and
# training the model can be written as one chained expression.
our_model = LinearRegression().fit(X, y)

# The coefficients of the fitted model are stored in our_model.coef_, one per column of X.
# We put this information into a new dataframe (for no good reason, just so you see how it's done):
coeff_df = pd.DataFrame({'Coefficient': our_model.coef_}, index=X.columns)
print(coeff_df)

# It should look like this:
#           Coefficient
# age        239.994474
# bmi        332.083365
# children   542.864652
#
#
# This means that our approximate function is
#   charges = 239.994474*age + 332.083365*bmi + 542.864652*children + some constant


          Coefficient
age        239.994474
bmi        332.083365
children   542.864652


In [None]:
# Here we didn't make use of the non-numerical columns, but surely they influence the insurance costs
# To take this into account we encode them by numbers:
def sex_numerical(s):
    """Encode a value of the 'sex' column as a number for the regression.

    Parameters
    ----------
    s : str
        Either 'female' or 'male' (exact match).

    Returns
    -------
    int
        1 for 'female', 0 for 'male'.

    Raises
    ------
    ValueError
        If ``s`` is anything else. Previously such values fell through and
        returned None, which only showed up later as missing/object data in
        the feature table.
    """
    if s=='female':
        return 1
    elif s=='male':
        return 0
    raise ValueError(f"cannot encode sex value {s!r}: expected 'female' or 'male'")

def smoker_numerical(s):
    """Encode a value of the 'smoker' column as a number for the regression.

    Returns 1 for 'yes' and 0 for 'no'. For any other value a message is
    printed and None is returned.
    """
    for label, code in (('yes', 1), ('no', 0)):
        if s == label:
            return code
    # Neither 'yes' nor 'no': report it and fall through with no encoding.
    print("Schrödinger's smoker")
    return None

# Feature table for the second model: the numerical columns plus the two yes/no columns.
# .copy() is applied AFTER the column selection, so X is an independent frame. Copying
# first and selecting afterwards duplicates the whole table for nothing, and assigning
# into the selection result can trigger pandas' SettingWithCopyWarning.
X = dataset[['age', 'bmi', 'children', 'smoker', 'sex']].copy()

# Replace the text labels by their 0/1 encoding, element by element.
X['sex'] = dataset['sex'].map(sex_numerical)
X['smoker'] = dataset['smoker'].map(smoker_numerical)
X.head()

Unnamed: 0,age,bmi,children,smoker,sex
0,19,27.9,0,1,1
1,18,33.77,1,0,0
2,28,33.0,3,0,0
3,33,22.705,0,0,0
4,32,28.88,0,0,0


In [None]:
# Train the same model object again, this time on the current feature table X
# (which at this point in the notebook also contains the encoded smoker and sex columns).
our_model.fit(X, y)

# our_model.coef_ holds one fitted coefficient per column of X;
# label them with the column names so the printout is readable.
coeff_df = pd.DataFrame({'Coefficient': our_model.coef_}, index=X.columns)
print(coeff_df)

           Coefficient
age         257.734988
bmi         322.364214
children    474.411121
smoker    23823.392531
sex         128.639854


This means that our approximate function is
  $$charge = 257.734988*age + 322.364214*bmi + 474.411121*children + 23823.392531*smoker + 128.639854*sex + some\  constant$$
 Think about what these coefficients tell us:
 e.g. smoking drives up your insurance cost a lot,
 and the fact that the last coefficient is positive tells us that women pay more (because 'female' was encoded as 1 and 'male' as 0).

In [None]:
# The predictions of our model can be computed with the .predict() method.
# You can pass any data matrix with the same columns as the training data - here we
# just took the training data itself, to compare with the actual values.
print('Model predictions: ',our_model.predict(X))
# Actual values for comparison:
print('Actual values:',y)
# Summary: The order of magnitude of the predictions is ok, but it's not super precise

Model predictions:  [25661.85689486  3818.77858262  7096.73025538 ...  4465.88909619
  1676.96948913 36863.89250792]
Actual values: 0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64


In [None]:
# Find out the constant term of our model: it is the prediction for an input in which
# every feature is zero.
# The all-zero row is built from X's own columns instead of a hard-coded [[0,0,0,0,0]],
# so it always has the right width and carries the column names the model was fitted
# with (recent scikit-learn versions warn when a plain list is passed to such a model).
zero_row = pd.DataFrame(0, index=[0], columns=X.columns)
print(our_model.predict(zero_row))
# The same number is also available directly as our_model.intercept_.

[-12181.10183923]




In [None]:
# Check that the coefficients really are used as claimed above.
# The constant term and the five coefficients below are copied by hand from the printed
# outputs of the previous cells, so they are rounded and belong to the model fitted on
# age, bmi, children, smoker and sex.
predictions=-12181.10183923 + 257.734988*X['age'] + 322.364214*X['bmi'] + 474.411121*X['children'] + 23823.392531*X['smoker'] + 128.639854*X['sex']
predictions
# The numbers should be the same (up to the rounding of the copied values) as those
# from the model predictions above.

0       25661.856888
1        3818.778573
2        7096.730250
3        3643.432244
4        5376.296277
            ...     
1333    12112.500631
1334     2876.633510
1335     4465.889085
1336     1676.969484
1337    36863.892515
Length: 1338, dtype: float64

## (a) also use the location data (4 points)

$\to$ Your task: Now also use the location data for the linear model

 Again you will have to encode it by numbers. For 'sex' and 'smoker' we only had two possible values each, which we could encode by 0 or 1. In this way in the linear model we get an extra coefficient which e.g. is used for smokers (value 1) but not for non-smokers.

1) For the region we have several possible values. Find out how many - either by programming or by just looking into the .csv file.

2) If there are n possible values for 'region', you could create n new columns whose value you set to 1 if that is the region, and to 0 if not. This is slightly redundant: You can also get away with n-1 columns. Then there will be one region without an assigned column, and if all region values are 0 it means it is that left out region. This is what we did with 'sex' and 'smoker'. Choose either way.

3) Check whether the predictions get better

In [None]:
# Step 1: find the distinct region names (unique() keeps the order of first appearance).
regions = dataset["region"].unique()
print(regions)

# Step 2: one 0/1 indicator column per region (1 if the row belongs to that region).
region_indicators = {
    region: (dataset['region'] == region).astype(int) for region in regions
}

# Step 3: attach the indicator columns and drop the first one to avoid multicollinearity;
# the dropped region is then the one for which all remaining indicators are 0.
# assign() and drop() both return new frames, so the previous X object is not modified
# in place and re-running this cell gives the same result.
X = X.assign(**region_indicators).drop(columns=regions[0])

# Display the modified dataset
X.head()

['southwest' 'southeast' 'northwest' 'northeast']


Unnamed: 0,age,bmi,children,smoker,sex,southeast,northwest,northeast
0,19,27.9,0,1,1,0,0,0
1,18,33.77,1,0,0,1,0,0
2,28,33.0,3,0,0,1,0,0
3,33,22.705,0,0,0,0,1,0
4,32,28.88,0,0,0,0,1,0


In [None]:
# Refit the same model object on the current feature table X
# (at this point in the notebook it also contains the region indicator columns).
our_model.fit(X, y)
print('Model predictions: ',our_model.predict(X))
# Actual values for comparison:
print('Actual values:',y)
# Summary: The order of magnitude of the predictions is ok, but it's not super precise
# NOTE(review): comparing the two printouts by eye does not show whether the predictions
# got better. A numeric measure such as our_model.score(X, y) (R^2), evaluated before and
# after adding the region columns, would answer task 3) directly.

Model predictions:  [25293.7130284   3448.60283431  6706.9884907  ...  4149.13248568
  1246.58493898 37085.62326757]
Actual values: 0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64


## (b) Linear regression via pseudoinverse (2 points)

 $\to$ Your task: Do the linear regression again by computing a pseudoinverse

1)  A linear model also has a constant term (it is an affine function in math terminology).
The data matrix would encode a linear function, i.e. with constant term 0.

To accommodate the constant term, you should first add a column of 1s to the dataframe - think about why this will produce a constant! You can use the dataframe methods .insert() or .assign()
 https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.insert.html

2) There is a method computing pseudoinverses:  numpy.linalg.pinv
Compare the coefficients you obtain from this second regression method with the ones from above. They should be equal, if you use the version with $n-1$ new columns (where $n$ is the number of possible values of 'region')!

In [None]:
# Tabulate the coefficients of the most recent fit, labelled by the columns of X.
coeff_df = pd.DataFrame({'Coefficient': our_model.coef_}, index=X.columns)
print(coeff_df)


            Coefficient
age          256.856353
bmi          339.193454
children     475.500545
smoker     23848.534542
sex          131.314359
southeast    -74.971058
northwest    607.087092
northeast    960.050991


In [None]:
# Linear regression via the pseudoinverse.
# Prepend a column of ones, so that the weight belonging to it acts as the constant term:
#   prediction = w0*1 + w1*x1 + ... + wk*xk
# The column is added to a copy: X itself stays unchanged, so this cell works on a fresh
# kernel and can be re-run (insert() raises if the column already exists).
X_design = X.copy()
X_design.insert(loc=0, column='constant', value=1)

# Least-squares weights: W = pinv(A) @ y, with A the design matrix including the ones column.
W = np.linalg.pinv(X_design.to_numpy()) @ y.to_numpy()

# The ones column sits at position 0, so W[0] is the constant term and W[1:] are the
# coefficients in the column order of X (compare them with our_model.coef_).
intercept = W[0]; coef_ = W[1:]
print(intercept)
print(coef_)

[-13029.90392686    256.85635254    339.19345361    475.50054515
  23848.53454191    131.3143594     -74.97105809    607.08709188]
