In [205]:
# Importing the libraries.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import pickle 

In [207]:
# Load the insurance records from disk into a DataFrame and preview them.
dataset = pd.read_csv(filepath_or_buffer="insurance.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
97,55,male,38.280,0,no,southeast,10226.28420
98,56,male,19.950,0,yes,northeast,22412.64850
99,38,male,19.300,0,yes,southwest,15820.69900
100,58,female,31.600,0,no,southwest,6186.12700


In [209]:
# One-hot encode the text columns (sex, smoker, region). drop_first=True
# omits the first level of each category, so that level is implied whenever
# all of the category's indicator columns are False.
dataset = pd.get_dummies(data=dataset, drop_first=True)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.900,0,16884.92400,False,True,False,False,True
1,18,33.770,1,1725.55230,True,False,False,True,False
2,28,33.000,3,4449.46200,True,False,False,True,False
3,33,22.705,0,21984.47061,True,False,True,False,False
4,32,28.880,0,3866.85520,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...
97,55,38.280,0,10226.28420,True,False,False,True,False
98,56,19.950,0,22412.64850,True,True,False,False,False
99,38,19.300,0,15820.69900,True,True,False,False,True
100,58,31.600,0,6186.12700,False,False,False,False,True


In [211]:
# Show the column labels of the encoded frame, for reference when selecting
# the feature and target columns.
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes',
       'region_northwest', 'region_southeast', 'region_southwest'],
      dtype='object')

In [213]:
# The datasets are assigned to new variables. 
independent = dataset[['age', 'bmi', 'children', 'sex_male', 'smoker_yes',
       'region_northwest', 'region_southeast', 'region_southwest']]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.900,0,False,True,False,False,True
1,18,33.770,1,True,False,False,True,False
2,28,33.000,3,True,False,False,True,False
3,33,22.705,0,True,False,True,False,False
4,32,28.880,0,True,False,True,False,False
...,...,...,...,...,...,...,...,...
97,55,38.280,0,True,False,False,True,False
98,56,19.950,0,True,True,False,False,False
99,38,19.300,0,True,True,False,False,True
100,58,31.600,0,False,False,False,False,True


In [215]:
# Target: 'charges', selected with a list so it stays a one-column DataFrame
# (2-D) rather than a Series.
dependent = dataset.loc[:, ['charges']]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
97,10226.28420
98,22412.64850
99,15820.69900
100,6186.12700


In [217]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    independent,
    dependent,
    test_size=0.20,
    random_state=0,
)

In [219]:
# Fit a linear regression model on the training split.
regressor = LinearRegression()
regressor.fit(X=x_train, y=y_train)

In [221]:
# Learned coefficients, one per feature column of x_train. The array is 2-D
# (a single row) because the target was passed as a one-column DataFrame.
weight = regressor.coef_
weight

array([[  271.37401574,   500.83363381,   149.32267316,  3078.23525924,
        23429.30351539,  2293.41909319,  -899.55233498,   486.1400911 ]])

In [223]:
# Learned intercept: the constant term of the fitted linear model.
bias = regressor.intercept_
bias

array([-20091.80986752])

In [225]:
# Predict charges for the held-out test rows.
y_pred = regressor.predict(X=x_test)
y_pred

array([[ 8566.49756085],
       [37814.63115205],
       [ 6660.82343277],
       [43255.79319721],
       [ 9151.92468357],
       [31592.1281244 ],
       [ 9584.62087299],
       [16829.38561224],
       [ 7873.45482891],
       [ 7988.76763482],
       [32529.34315097],
       [37166.30883116],
       [14007.5732613 ],
       [14148.77797745],
       [ 6583.54082956],
       [30701.77470819],
       [ 4050.03225302],
       [ 9655.19100127],
       [16555.00764994],
       [ 8265.77661711],
       [ 4773.79764768]])

In [227]:
# R-squared of the predictions against the true test-set charges.
r_score = r2_score(y_true=y_test, y_pred=y_pred)
r_score

0.9112501559417566

In [229]:
# File name under which the trained model is saved.
mymodel = "finalized_model_linear.sav"

In [231]:
# Serialize the fitted regressor to disk. The context manager closes the file
# handle (flushing any buffered bytes) even if pickling raises.
with open(mymodel, 'wb') as model_file:
    pickle.dump(regressor, model_file)

In [233]:
# Reload the saved model from disk; the context manager closes the file handle
# as soon as the model has been read.
# NOTE: unpickling can execute arbitrary code, so only load model files that
# you created yourself or otherwise trust.
with open("finalized_model_linear.sav", 'rb') as model_file:
    loaded_model = pickle.load(model_file)
loaded_model

In [235]:
# Predict the charges for one hypothetical customer. The sample is built as a
# DataFrame whose columns carry the same names, in the same order, as the
# training features: this documents what each value means and avoids the
# "X does not have valid feature names" warning scikit-learn emits when a
# model fitted on a DataFrame is given a bare list.
sample = pd.DataFrame([{
    'age': 25,
    'bmi': 33,
    'children': 3,
    'sex_male': 1,
    'smoker_yes': 1,
    'region_northwest': 0,
    'region_southeast': 1,
    'region_southwest': 0,
}])
result = loaded_model.predict(sample)



In [237]:
# Display the predicted charges for the sample input.
result

array([[29276.00490093]])