## Day 6 Intro to Linear Regression

In [3]:
import pandas as pd
import statsmodels.api as sm

df = pd.read_csv('../data/insurance.csv')

df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [4]:
df.corr(numeric_only=True)

Unnamed: 0,age,bmi,children,charges
age,1.0,0.109272,0.042469,0.299008
bmi,0.109272,1.0,0.012759,0.198341
children,0.042469,0.012759,1.0,0.067998
charges,0.299008,0.198341,0.067998,1.0


In [5]:
y = df['charges']

X = df[['age', 'bmi', 'children']] # Manual way to get numeric columns

X= df.select_dtypes(include='number').assign(const=1) # Automatic way to get numeric columns
X.drop(columns=['charges'], inplace=True)

X.head()

Unnamed: 0,age,bmi,children,const
0,19,27.9,0,1
1,18,33.77,1,1
2,28,33.0,3,1
3,33,22.705,0,1
4,32,28.88,0,1


In [6]:
model = sm.OLS(y, X).fit() # Ordinary Least Squares
results = model.summary()
print(results)

                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.120
Model:                            OLS   Adj. R-squared:                  0.118
Method:                 Least Squares   F-statistic:                     60.69
Date:                Thu, 01 Feb 2024   Prob (F-statistic):           8.80e-37
Time:                        08:48:55   Log-Likelihood:                -14392.
No. Observations:                1338   AIC:                         2.879e+04
Df Residuals:                    1334   BIC:                         2.881e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
age          239.9945     22.289     10.767      0.0

In [7]:
df_dummy = pd.get_dummies(
    df, columns=["sex", "smoker", "region"], drop_first=True
).astype(int)

df_dummy = pd.get_dummies(
    columns=df.select_dtypes(include='object').columns, data=df, drop_first=True
    ).astype(int) # Automatic way to get categorical columns

df_dummy.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27,0,16884,0,1,0,0,1
1,18,33,1,1725,1,0,0,1,0
2,28,33,3,4449,1,0,0,1,0
3,33,22,0,21984,1,0,1,0,0
4,32,28,0,3866,1,0,1,0,0


In [8]:
y = df_dummy['charges']
X = df_dummy.drop(columns=['charges']).assign(const=1) # Automatic way to get numeric columns


model = sm.OLS(y, X).fit() # Ordinary Least Squares

results = model.summary()
print(results)


                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.751
Model:                            OLS   Adj. R-squared:                  0.749
Method:                 Least Squares   F-statistic:                     500.7
Date:                Thu, 01 Feb 2024   Prob (F-statistic):               0.00
Time:                        08:48:55   Log-Likelihood:                -13548.
No. Observations:                1338   AIC:                         2.711e+04
Df Residuals:                    1329   BIC:                         2.716e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
age                256.9773     11.898  

In [10]:
# Region: northwest
# Age: 38
# BMI: 35
# Children: 3
# Smoker: no
# Sex: male

charges = (
    38 * 256.9773
    + 35 * 337.9905
    + 3 * 478.2346
    + 1 * -132.4209
    + 1 * -346.4653
    + -1.174e04
)
print(charges)

print(model.predict([38, 35, 3, 1, 0, 1, 0, 0, 0]))

10810.622499999998
[22550.62081335]
