In [20]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
sns.set(style='white', palette='muted', color_codes=True)
import matplotlib.pyplot as plt 
plt.style.use('ggplot')
import plotly.express as px

from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer 
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier 
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [21]:
data = pd.read_csv(r'c:\Github\Fullstack-Data-Analyst\Learning\the_data\data-lab-3-insurance.csv')
# data.info()
# data.head(1)

In [22]:
data.duplicated().sum()
data.drop_duplicates(subset=None, keep='first', ignore_index=False, inplace=True)
data.duplicated().sum()

0

In [23]:
data.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

# CREATE VARIABLE FOR DICTIONARY

In [24]:
#Assign new variables for the columns to be encoded
sex_val = data['sex']
smok_val = data['smoker']
reg_val = data['region']
# ori_data = data['sex', 'smoker', 'region']


#ENCODE THE DATA AND CREATE A DICTIONARY

In [25]:
cat_val = ['sex', 'smoker', 'region']
data[cat_val] = data[cat_val].apply(LabelEncoder().fit_transform)
data.head(1)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924


In [26]:
le_sex = data['sex']

In [27]:
ori_data_dic = np.unique(sex_val)
le_data_dic = np.unique(le_sex)
data_dic = dict(zip(ori_data_dic, le_data_dic))
data_dic

{'female': 0, 'male': 1}

In [28]:
X = data.drop(columns=['charges'], axis=1)
y = data['charges']


In [29]:
X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.900,0,1,3
1,18,1,33.770,1,0,2
2,28,1,33.000,3,0,2
3,33,1,22.705,0,0,1
4,32,1,28.880,0,0,1
...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1
1334,18,0,31.920,0,0,0
1335,18,0,36.850,0,0,2
1336,21,0,25.800,0,0,3


# SCALE THE DATA

In [30]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
y = scaler.fit_transform(np.array(y).reshape(-1,1))
X


array([[-1.44041773, -1.00977099, -0.45315959, -0.90923416,  1.96966039,
         1.34316271],
       [-1.51164747,  0.99032355,  0.50942165, -0.07944162, -0.50770174,
         0.43801727],
       [-0.79935006,  0.99032355,  0.3831546 ,  1.58014347, -0.50770174,
         0.43801727],
       ...,
       [-1.51164747, -1.00977099,  1.01448983, -0.90923416, -0.50770174,
         0.43801727],
       [-1.29795825, -1.00977099, -0.79752426, -0.90923416, -0.50770174,
         1.34316271],
       [ 1.55123139, -1.00977099, -0.26129928, -0.90923416,  1.96966039,
        -0.46712816]])

#SPLIT THE DATA

In [31]:
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=0)

# LINEAR REGRESSION MODEL -- COZ Y IS NUMBERS FROM ORIGINAL DATA

In [32]:
#Assign Model to use
linreg = LinearRegression()
linreg.fit(x_train, y_train)

#Predict the model
y_train_pred = linreg.predict(x_train)
y_test_pred = linreg.predict(x_test)

#Measure performance of model
lin_train_r2 = r2_score(y_train, y_train_pred)
lin_test_r2 = r2_score(y_test, y_test_pred)
lin_train_mse = mean_squared_error(y_train,y_train_pred)
lin_test_mse = mean_squared_error(y_test, y_test_pred)

In [33]:
model_perf = pd.DataFrame([ 
    'LinearRegression', lin_train_r2, lin_test_r2, lin_train_mse, lin_test_mse
]).transpose()

model_perf.columns = ['Method', 'Training R2', 'Test R2', 'Training MSE', 'Test MSE']
model_perf


Unnamed: 0,Method,Training R2,Test R2,Training MSE,Test MSE
0,LinearRegression,0.742396,0.763172,0.244325,0.264637


# RANDOM FOREST REGRESSION 

In [34]:
forest = RandomForestRegressor(n_estimators = 100, criterion = 'mse', random_state = 1, n_jobs = -1)

forest.fit(x_train,y_train) # The actual training
forest_train_pred = forest.predict(x_train)
forest_test_pred = forest.predict(x_test)

forest_mse_train = mean_squared_error(y_train,forest_train_pred)
forest_mse_test = mean_squared_error(y_test,forest_test_pred)
forest_r2_train = r2_score(y_train,forest_train_pred)
forest_r2_test =  r2_score(y_test,forest_test_pred)

  forest.fit(x_train,y_train) # The actual training
  warn(


In [35]:
forest_model_perf = pd.DataFrame([ 
    'RandomForestRegressor', forest_mse_train, forest_mse_test, forest_r2_train, forest_r2_test
]).transpose()


forest_model_perf.columns = ['Method', 'Forest Mse Train', 'Forest Mse Test', 'Forest R2 Train', 'Forest R2 Test']
forest_model_perf

Unnamed: 0,Method,Forest Mse Train,Forest Mse Test,Forest R2 Train,Forest R2 Test
0,RandomForestRegressor,0.02407,0.164888,0.974622,0.852439
