# Modeling

## Multiple Linear Regression

In [1]:
import pandas as pd
import numpy as np

import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from instools import utils

In [2]:
# Read the insurance data; the path is relative to the notebook's working directory.
ins = pd.read_csv(filepath_or_buffer="data/insurance.csv")
ins

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [3]:
# One-hot encode every non-numeric column (sex, smoker, region).
# dtype=int forces 0/1 integer indicators: pandas >= 2.0 defaults to bool
# dummies, and a frame that mixes bool and numeric columns is converted to an
# object array, which statsmodels' OLS rejects.
ins_lm = pd.get_dummies(ins, dtype=int)
ins_lm

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.900,0,16884.92400,1,0,0,1,0,0,0,1
1,18,33.770,1,1725.55230,0,1,1,0,0,0,1,0
2,28,33.000,3,4449.46200,0,1,1,0,0,0,1,0
3,33,22.705,0,21984.47061,0,1,1,0,0,1,0,0
4,32,28.880,0,3866.85520,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,0,1,1,0,0,1,0,0
1334,18,31.920,0,2205.98080,1,0,1,0,1,0,0,0
1335,18,36.850,0,1629.83350,1,0,1,0,0,0,1,0
1336,21,25.800,0,2007.94500,1,0,1,0,0,0,0,1


In [4]:
# Remove one indicator per categorical variable so the remaining dummies are
# not perfectly collinear with the intercept added later; the dropped levels
# (female, non-smoker, southwest) act as the reference categories.
reference_dummies = ['sex_female', 'smoker_no', 'region_southwest']
ins_lms = ins_lm.drop(columns=reference_dummies)

In [5]:
# Design matrix: numeric predictors plus the retained dummy indicators.
# NOTE(review): 'sex_male' is still present in ins_lms but is not listed here,
# so sex is excluded from the regression — confirm this is intentional.
predictor_cols = [
    'age', 'bmi', 'children', 'smoker_yes',
    'region_northeast', 'region_northwest', 'region_southeast',
]
x = ins_lms[predictor_cols]
# Response variable: the insurance charges column.
y = ins_lms['charges']

In [6]:
# Prepend an intercept column ('const'); statsmodels' OLS does not add one itself.
x = sm.add_constant(x, prepend=True)

In [7]:
# Fit ordinary least squares of charges on the predictors and print the full report.
ols_model = sm.OLS(endog=y, exog=x)
results = ols_model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.751
Model:                            OLS   Adj. R-squared:                  0.750
Method:                 Least Squares   F-statistic:                     572.7
Date:                Wed, 10 May 2023   Prob (F-statistic):               0.00
Time:                        00:48:03   Log-Likelihood:                -13548.
No. Observations:                1338   AIC:                         2.711e+04
Df Residuals:                    1330   BIC:                         2.715e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const            -1.295e+04   1012.403  

The results show that age, BMI, number of children, smoking status, and some regions are all significantly associated with health insurance charges.

## More Modeling with Classification Techniques

In [8]:
# Work on an independent deep copy so later changes to ins_mod do not alter `ins`.
ins_mod = ins.copy(deep=True)
ins_mod.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [9]:
# Convert the categorical columns to numeric values using the project helper.
# NOTE(review): which number each category receives depends on
# utils.label_encoder, which is not shown here — check it before interpreting the codes.
categorical_cols = ['sex', 'smoker', 'region']
ins_mod = utils.label_encoder(ins_mod, categorical_cols)

In [10]:
# Discretise the continuous 'charges' column into 10 quantile-based bins
# (labels=False yields the integer bin index) so it can serve as a class label.
new_charges_col = pd.qcut(ins_mod['charges'], q=10, labels=False)
# Build the classification frame: same columns as ins_mod, with 'charges' replaced by its bin.
charge_df = ins_mod.assign(charges=new_charges_col)

In [11]:
# Decision tree: classify each record into its 'charges' bin from the other columns.
# NOTE(review): despite their names, x_test / y_test hold the feature and target
# column names handed to the helper, not a held-out split.
# NOTE(review): the tree is created without random_state, so results may vary between runs.
y_test = 'charges'
x_test = ['age', 'sex', 'bmi', 'children', 'smoker', 'region']
tree_clf = DecisionTreeClassifier()
dt = utils.model_accuracy(tree_clf, charge_df, x_test, y_test)

DecisionTreeClassifier Accuracy estimating charges
Training Accuracy: 1.0
Testing Accuracy: 0.7238805970149254


In [12]:
# k-nearest neighbours: predict 'sex' from the remaining columns.
# NOTE(review): KNN is distance-based and 'charges' spans a far larger numeric
# range than the other features; unless utils.model_accuracy rescales the
# inputs, 'charges' will dominate the distances — TODO confirm.
g_df = ins_mod.copy()
sex_features = ['age', 'charges', 'bmi', 'children', 'smoker', 'region']
knn = utils.model_accuracy(KNeighborsClassifier(), g_df, sex_features, 'sex')

KNeighborsClassifier Accuracy estimating sex
Training Accuracy: 0.7654205607476635
Testing Accuracy: 0.6380597014925373
