In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from scipy.stats import skew
import json
import pickle
import matplotlib.pyplot as plt

In [2]:
# Load the raw insurance records; the path is relative to the working directory.
df = pd.read_csv("medical_insurance.csv")
# Bare expression on the last line so the notebook renders the frame.
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [3]:
# Column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [4]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [5]:
# Count missing values per column.
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [6]:
# Category frequencies for `sex` (the column is encoded numerically in the next cell).
df["sex"].value_counts()

male      676
female    662
Name: sex, dtype: int64

In [7]:
# Encode `sex` as integers (female -> 0, male -> 1).
# The result is assigned back to the column instead of calling
# `df["sex"].replace(..., inplace=True)`: that form mutates an intermediate Series,
# which pandas deprecates and which leaves `df` untouched once copy-on-write is active.
# `astype("int64")` pins the dtype and fails loudly if an unmapped label remains.
# Re-running the cell is safe: already-encoded 0/1 values pass through unchanged.
df["sex"] = df["sex"].replace({"female": 0, "male": 1}).astype("int64")

In [8]:
# Re-display the frame to check the encoded `sex` column.
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,yes,southwest,16884.92400
1,18,1,33.770,1,no,southeast,1725.55230
2,28,1,33.000,3,no,southeast,4449.46200
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,no,northwest,10600.54830
1334,18,0,31.920,0,no,northeast,2205.98080
1335,18,0,36.850,0,no,southeast,1629.83350
1336,21,0,25.800,0,no,southwest,2007.94500


In [9]:

# Category frequencies for `smoker` (the column is encoded numerically in the next cell).
df["smoker"].value_counts()

no     1064
yes     274
Name: smoker, dtype: int64

In [10]:
# Encode `smoker` as integers. NOTE: the coding is no -> 1, yes -> 0 (the reverse of
# the usual convention); it is kept because the same mapping is reused for inference
# inputs and written to project_data.json later in this notebook.
# The result is assigned back to the column instead of calling
# `df["smoker"].replace(..., inplace=True)`: that form mutates an intermediate Series,
# which pandas deprecates and which leaves `df` untouched once copy-on-write is active.
# `astype("int64")` pins the dtype and fails loudly if an unmapped label remains.
df["smoker"] = df["smoker"].replace({"no": 1, "yes": 0}).astype("int64")
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,0,southwest,16884.92400
1,18,1,33.770,1,1,southeast,1725.55230
2,28,1,33.000,3,1,southeast,4449.46200
3,33,1,22.705,0,1,northwest,21984.47061
4,32,1,28.880,0,1,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,1,northwest,10600.54830
1334,18,0,31.920,0,1,northeast,2205.98080
1335,18,0,36.850,0,1,southeast,1629.83350
1336,21,0,25.800,0,1,southwest,2007.94500


In [11]:
# Category frequencies for `region` (the column is one-hot encoded in the next cell).
df["region"].value_counts()

southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64

In [12]:
# One-hot encode `region` into one indicator column per region value
# (region_northeast, region_northwest, region_southeast, region_southwest).
# The dtype is pinned to uint8 because the get_dummies default changed to bool in
# pandas 2.0; pinning keeps the schema identical across pandas versions.
# All four indicators are kept (no drop_first) because later cells locate the
# region column by name and persist the full column list.
df = pd.get_dummies(df, columns=["region"], dtype=np.uint8)

In [13]:
# Re-check the schema after one-hot encoding `region`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1338 non-null   int64  
 1   sex               1338 non-null   int64  
 2   bmi               1338 non-null   float64
 3   children          1338 non-null   int64  
 4   smoker            1338 non-null   int64  
 5   charges           1338 non-null   float64
 6   region_northeast  1338 non-null   uint8  
 7   region_northwest  1338 non-null   uint8  
 8   region_southeast  1338 non-null   uint8  
 9   region_southwest  1338 non-null   uint8  
dtypes: float64(2), int64(4), uint8(4)
memory usage: 68.1 KB


In [14]:
# Keep only the non-object columns, then separate the target from the features.
df = df.select_dtypes(exclude=[object])
y = df["charges"]
x = df.drop(columns="charges")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=34,
)

In [15]:
# Inspect the training features.
x_train

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
414,19,0,35.150,0,1,0,1,0,0
1279,25,0,26.790,2,1,0,1,0,0
647,40,0,23.370,3,1,1,0,0,0
764,45,0,25.175,2,1,1,0,0,0
1133,52,0,18.335,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...
453,20,1,29.735,0,1,0,1,0,0
324,29,1,27.200,0,1,0,0,0,1
1109,45,1,20.350,3,1,0,0,1,0
490,19,0,32.900,0,1,0,0,0,1


In [16]:
# Fit a linear regression (scikit-learn LinearRegression, default settings) on the training split.
model = LinearRegression()
model.fit(x_train,y_train)

In [17]:
# Testing Data: score the fitted model on the held-out split.
y_pred = model.predict(x_test)

# Compute every metric first ...
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as `charges`
mae = mean_absolute_error(y_test, y_pred)
r2_scores = r2_score(y_test, y_pred)

# ... then report them in one place.
print("mean squared error is :", mse)
print("Root mean squaed error is :", rmse)
print("Mean absolute error is:", mae)
print("R2_score is ", r2_scores)




mean squared error is : 41587655.82548301
Root mean squaed error is : 6448.849186132593
Mean absolute error is: 4420.543383291356
R2_score is  0.7442111491697808


In [18]:
# Training Data: the same error measure on the rows the model was fitted on,
# for comparison with the test-split value.
y_tain_prob = model.predict(x_train)

# Note: this rebinds `mse`, so from here on it holds the training-split value.
mse = mean_squared_error(y_true=y_train, y_pred=y_tain_prob)
print("Mean sqquared error is :", mse)



Mean sqquared error is : 35291933.482411794


In [19]:
# Persist the fitted estimator to disk (binary pickle in the working directory).
with open("model.pickel", mode="wb") as model_file:
    pickle.dump(model, model_file)

In [20]:
# Show the first feature row transposed (.T), so each feature name appears on its own line.
x.head(1).T

Unnamed: 0,0
age,19.0
sex,0.0
bmi,27.9
children,0.0
smoker,0.0
region_northeast,0.0
region_northwest,0.0
region_southeast,0.0
region_southwest,1.0


In [21]:

# Raw, un-encoded inputs for a single hand-made prediction.
age      = 19.0
sex      = "male"       # raw label; encoded to a number before prediction
bmi      = 27.9
children = 2
smoker   = "yes"        # raw label; encoded to a number before prediction
region   = "southeast"  # raw label; mapped to a one-hot column before prediction

In [22]:
# Number of feature columns in `x`.
len(x.columns)

9

In [23]:
# Zero-filled feature vector with one slot per training column.
test_array = np.zeros(x_train.shape[1])

In [24]:
# Display the freshly initialised feature vector.
test_array

array([0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [25]:
# Fill the numeric inputs by column position: 0 = age, 2 = bmi, 3 = children.
test_array[0] = age
test_array[2]= bmi
test_array[3]= children

# Positions 1 (sex), 4 (smoker) and 5-8 (region_*) are left at 0 in this cell:
# their raw inputs are strings and have to be converted to numbers before they
# can be stored in this float array.


In [26]:
# Display the feature vector after the numeric slots were filled.
test_array

array([19. ,  0. , 27.9,  2. ,  0. ,  0. ,  0. ,  0. ,  0. ])

In [27]:
 # Lookup tables that turn the raw `sex` / `smoker` labels into numeric codes.
 # NOTE(review): this cell starts with a stray one-space indent; a notebook appears
 # to tolerate it, but a plain .py export would likely fail with "unexpected indent" - confirm and dedent.
 label_encoding ={"sex" :{"female":0,"male":1},"smoker":{"no":1,"yes":0}}

In [28]:
# Write the five scalar features in column order (age, sex, bmi, children, smoker)
# in a single slice assignment; the labels go through the lookup tables first.
test_array[:5] = [
    age,
    label_encoding["sex"][sex],
    bmi,
    children,
    label_encoding["smoker"][smoker],
]




In [29]:
# Turn the raw region label into its one-hot column name ("region_<label>").
# Guarded so the cell is idempotent: without the check, a second run would yield
# "region_region_<label>", which matches no column and breaks the index lookup below.
if not region.startswith("region_"):
    region = "region_" + region

In [30]:

# Display the derived one-hot column name.
region

'region_southeast'

In [31]:
# Keep the feature names (a pandas Index) so the region column can be located by name.
colunms = x.columns
colunms

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region_northeast',
       'region_northwest', 'region_southeast', 'region_southwest'],
      dtype='object')

In [32]:
# np.where returns a tuple of index arrays marking where the column name matches.
np.where(colunms == region)

(array([7], dtype=int64),)

In [33]:
# [0] picks the index array out of the tuple that np.where returns.
np.where(colunms == region)[0]

array([7], dtype=int64)

In [34]:
# Position of the selected region's indicator column within the feature columns.
# Index.get_loc is the direct lookup; for an unknown region it raises a KeyError
# naming the missing column, instead of the opaque IndexError produced by
# indexing into an empty np.where result.
region_index = colunms.get_loc(region)

In [35]:
# Build the complete feature vector for the sample.
# Reset first: without it, re-running this cell after choosing a different region
# would leave the previous region's indicator set to 1 (two "hot" columns).
test_array[:] = 0

# Scalar features in column order: age, sex, bmi, children, smoker.
test_array[:5] = [
    age,
    label_encoding["sex"][sex],
    bmi,
    children,
    label_encoding["smoker"][smoker],
]

# Switch on the indicator column of the selected region.
test_array[region_index] = 1

In [36]:
# Display the completed feature vector.
test_array

array([19. ,  1. , 27.9,  2. ,  0. ,  0. ,  0. ,  1. ,  0. ])

In [37]:
# Predict the charge for the hand-built sample.
# The vector is wrapped in a one-row DataFrame carrying the training column names:
# the model was fitted on a DataFrame, so passing a bare list makes scikit-learn warn
# that X has no valid feature names and silently relies on positional column order.
model.predict(pd.DataFrame([test_array], columns=x.columns))



array([25638.68299908])

A data type is the classification or categorization of a data item. It represents the kind of value a piece of data holds and determines what operations can be performed on it. Since everything is an object in Python, data types are actually classes, and variables are instances (objects) of these classes.

In [40]:
# Metadata needed at inference time: the label -> code tables plus the exact
# feature column order. Built from the existing `label_encoding` tables rather than
# repeating the literals; the resulting dict has the same keys, order and values.
project_data = {**label_encoding, "columns": x.columns.tolist()}

In [41]:
# Display the metadata dict before it is written to disk.
project_data

{'sex': {'female': 0, 'male': 1},
 'smoker': {'no': 1, 'yes': 0},
 'columns': ['age',
  'sex',
  'bmi',
  'children',
  'smoker',
  'region_northeast',
  'region_northwest',
  'region_southeast',
  'region_southwest']}

In [42]:
# Write the inference metadata next to the pickled model as JSON.
with open("project_data.json", mode="w") as json_file:
    json.dump(project_data, json_file)