In [1]:
import numpy as np
import pandas as pd
from model import LinearRegression

In [3]:
# Read the raw insurance dataset from the local data directory into a
# DataFrame; the bare `df` on the last line renders it as the cell output.
df = pd.read_csv('./data/insurance.csv')
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [4]:
# One-hot encode the three categorical columns. Each get_dummies call
# yields one indicator column per distinct value of the source column.
sex_dummies = pd.get_dummies(df['sex'])
smoker_dummies = pd.get_dummies(df['smoker'])
region_dummies = pd.get_dummies(df['region'])

# Swap the original text columns for their indicator columns in a single
# step: the remaining columns come first, followed by the sex, smoker and
# region indicators, in that order.
df = pd.concat(
    [
        df.drop(columns=['sex', 'smoker', 'region']),
        sex_dummies,
        smoker_dummies,
        region_dummies,
    ],
    axis=1,
)

df

Unnamed: 0,age,bmi,children,charges,female,male,no,yes,northeast,northwest,southeast,southwest
0,19,27.900,0,16884.92400,1,0,0,1,0,0,0,1
1,18,33.770,1,1725.55230,0,1,1,0,0,0,1,0
2,28,33.000,3,4449.46200,0,1,1,0,0,0,1,0
3,33,22.705,0,21984.47061,0,1,1,0,0,1,0,0
4,32,28.880,0,3866.85520,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,0,1,1,0,0,1,0,0
1334,18,31.920,0,2205.98080,1,0,1,0,1,0,0,0
1335,18,36.850,0,1629.83350,1,0,1,0,0,0,1,0
1336,21,25.800,0,2007.94500,1,0,1,0,0,0,0,1


In [5]:
# Shuffle all rows (frac=1 samples the whole frame without replacement) so
# the positional slices taken later for the test/train files are random.
# A fixed seed makes the shuffle, and therefore every derived split file,
# reproducible across kernel restarts.
SHUFFLE_SEED = 42
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df

Unnamed: 0,age,bmi,children,charges,female,male,no,yes,northeast,northwest,southeast,southwest
0,51,31.635,0,9174.13565,0,1,1,0,0,1,0,0
1,40,25.080,0,5415.66120,0,1,1,0,0,0,1,0
2,31,31.065,3,5425.02335,0,1,1,0,0,1,0,0
3,23,23.180,2,14426.07385,1,0,1,0,0,1,0,0
4,49,27.170,0,8601.32930,1,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,63,31.445,0,13974.45555,0,1,1,0,1,0,0,0
1334,24,29.300,0,1977.81500,0,1,1,0,0,0,0,1
1335,59,34.800,2,36910.60803,1,0,1,0,0,0,0,1
1336,50,28.160,3,10702.64240,1,0,1,0,0,0,1,0


In [6]:
# Feature matrix: every column of the encoded frame except the target.
X = df.drop(columns=['charges'])
X

Unnamed: 0,age,bmi,children,female,male,no,yes,northeast,northwest,southeast,southwest
0,51,31.635,0,0,1,1,0,0,1,0,0
1,40,25.080,0,0,1,1,0,0,0,1,0
2,31,31.065,3,0,1,1,0,0,1,0,0
3,23,23.180,2,1,0,1,0,0,1,0,0
4,49,27.170,0,1,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
1333,63,31.445,0,0,1,1,0,1,0,0,0
1334,24,29.300,0,0,1,1,0,0,0,0,1
1335,59,34.800,2,1,0,1,0,0,0,0,1
1336,50,28.160,3,1,0,1,0,0,0,1,0


In [7]:
# Target: select 'charges' with a list so the result stays a one-column
# DataFrame (not a Series) and is written out with its own header later.
y = df.loc[:, ['charges']]
y

Unnamed: 0,charges
0,9174.13565
1,5415.66120
2,5425.02335
3,14426.07385
4,8601.32930
...,...
1333,13974.45555
1334,1977.81500
1335,36910.60803
1336,10702.64240


In [8]:
# Persist the feature and target frames as CSV (without the index column).
X.to_csv('./data/preprocessed_X.csv', index=False)
y.to_csv('./data/preprocessed_y.csv', index=False)

# Read the files back as raw text lines so the splits below can be made by
# plain list slicing. NOTE: from here on X and y are lists of strings, not
# DataFrames; element 0 of each list is the CSV header line.
# Context managers ensure the file handles are closed deterministically.
with open('./data/preprocessed_X.csv', 'r') as features_file:
    X = features_file.readlines()
with open('./data/preprocessed_y.csv', 'r') as target_file:
    y = target_file.readlines()

In [9]:
# reserve 20% for testing, split the rest into 4 parts
# num_rows counts every line of the CSV text, including the header line,
# hence the "- 1" when computing the number of data rows.
num_rows = len(X)
test_data_size = (num_rows - 1) // 5

# Slices start at index 1 to skip the header, so the test files contain
# data rows only. `with` guarantees the output is flushed and closed.
with open('./data/test_X.csv', 'w+') as test_features_file:
    test_features_file.writelines(X[1:test_data_size+1])
with open('./data/test_y.csv', 'w+') as test_target_file:
    test_target_file.writelines(y[1:test_data_size+1])

In [11]:
# Split the rows that follow the test slice into equally sized training
# chunks (chunk size = remaining data rows // 4) and write each chunk to
# ./data4/train_{X,y}_<n>.csv. Rows left over after the last full chunk
# are not written anywhere.
filename = 0
train_data_size=(num_rows-test_data_size-1)//4
for i in range(test_data_size+1, num_rows, train_data_size):
    # Stop once a full-sized chunk no longer fits.
    if i + train_data_size > num_rows:
        break
    # `with` guarantees each chunk file is flushed and closed before the
    # next one is opened.
    with open('./data4/train_X_' + str(filename) + '.csv', 'w+') as train_features_file:
        train_features_file.writelines(X[i:i+train_data_size])
    with open('./data4/train_y_' + str(filename) + '.csv', 'w+') as train_target_file:
        train_target_file.writelines(y[i:i+train_data_size])
    filename += 1

In [12]:
# Split the rows that follow the test slice into equally sized training
# chunks (chunk size = remaining data rows // 5) and write each chunk to
# ./data5/train_{X,y}_<n>.csv. Rows left over after the last full chunk
# are not written anywhere.
filename = 0
train_data_size=(num_rows-test_data_size-1)//5
for i in range(test_data_size+1, num_rows, train_data_size):
    # Stop once a full-sized chunk no longer fits.
    if i + train_data_size > num_rows:
        break
    # `with` guarantees each chunk file is flushed and closed before the
    # next one is opened.
    with open('./data5/train_X_' + str(filename) + '.csv', 'w+') as train_features_file:
        train_features_file.writelines(X[i:i+train_data_size])
    with open('./data5/train_y_' + str(filename) + '.csv', 'w+') as train_target_file:
        train_target_file.writelines(y[i:i+train_data_size])
    filename += 1