In [8]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
sns.set(style='white', palette='muted', color_codes=True)
import matplotlib.pyplot as plt 
plt.style.use('ggplot')
import plotly.express as px

from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer 
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier 
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer


# ---- Load -------------------------------------------------------------------
# Read the raw insurance data set from its location on the local machine.
data = pd.read_csv(r'c:\Github\Fullstack-Data-Analyst\Learning\the_data\data-lab-3-insurance.csv')

# ---- Clean ------------------------------------------------------------------
# Count fully duplicated rows, drop them (keeping the first occurrence and the
# original index labels), then count again to confirm none remain.
# Pass subset=['colname', 'colname2'] to drop_duplicates to compare on
# selected columns only; the default compares whole rows.
data.duplicated().sum()
data.drop_duplicates(inplace=True)
data.duplicated().sum()

# Share of missing values per column, in percent, listed worst-first.
mis_col = data.isna().sum().mul(100).div(len(data))
mis_col.sort_values(ascending=False)

# Persist the de-duplicated frame (index included) for later practice.
data.to_csv('practice_data.csv')

                                          #Encode Data with a dictionary

# Keep handles on the original (un-encoded) columns before they are replaced,
# so the original labels can still be paired with their integer codes below.
sex_val, smok_val, reg_val = data['sex'], data['smoker'], data['region']

# Replace each categorical column with its integer label encoding.
# The encoder is fitted separately for every column.
cat_val = ['sex', 'smoker', 'region']
for col_name in cat_val:
    data[col_name] = LabelEncoder().fit_transform(data[col_name])
data.head(1)

# Build a lookup {original label -> encoded value} for the 'sex' column.
# Both arrays come back sorted from np.unique, so they are paired by position.
le_sex = data['sex']

ori_data_dic = np.unique(sex_val)
le_data_dic = np.unique(le_sex)
data_dic = {label: code for label, code in zip(ori_data_dic, le_data_dic)}
data_dic

                                # EDA
# Pairwise correlation matrix of the (already encoded) columns.
# Computed once and reused for both the ranking and the heatmap.
corr = data.corr()

# Correlation of every column with the target, weakest to strongest.
corr['charges'].sort_values()

# Correlation heatmap.
f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr,
    # All-False mask, i.e. every cell is shown. The builtin `bool` is used
    # because the `np.bool` alias was removed from NumPy (AttributeError on 1.24+).
    mask=np.zeros_like(corr, dtype=bool),
    cmap=sns.diverging_palette(240, 10, as_cmap=True),
    square=True,
    ax=ax,
)

# Distribution of the target column.
# Drawn on its own figure: without an explicit `ax`, histplot uses the current
# axes, which at this point would be the heatmap above.
_, dist_ax = plt.subplots()
sns.histplot(data['charges'], color='c', ax=dist_ax)

# Distribution of the target within a single region.
# Encoded region values:
# {'northeast': 0, 'northwest': 1, 'southeast': 2, 'southwest': 3}
f, ax = plt.subplots(figsize=(10, 8))

sns.histplot(data.loc[data['region'] == 0, 'charges'], color='y', ax=ax)
# Region code 0 is 'northeast' per the mapping above, so the title says so.
ax.set_title('Northeast charges count')


                                    #MODEL CREATION
# Features are every column except the target; the target is 'charges'.
X = data.drop(columns=['charges'])
y = data['charges']

# Standardise features and target with SEPARATE scalers.
# Re-fitting one scaler on the target would overwrite the feature statistics,
# making it impossible to transform new feature rows later.
feature_scaler = StandardScaler()
X = feature_scaler.fit_transform(X)

# `scaler` keeps its original final state (fitted on the target), so it can be
# used to map scaled predictions back with scaler.inverse_transform(...).
# StandardScaler needs 2-D input, hence the reshape to a single column.
scaler = StandardScaler()
y = scaler.fit_transform(np.array(y).reshape(-1, 1))

# NOTE(review): both scalers are fitted on the full data set before the split,
# so test-set statistics influence the scaling. Consider fitting on the
# training split only.

# Hold out 30% of the rows for testing; fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

            #Create Model
# Fit a linear regression on the training split (fit returns the estimator).
linreg = LinearRegression().fit(x_train, y_train)

# Predictions for the rows the model was trained on and for the held-out rows.
y_train_pred = linreg.predict(x_train)
y_test_pred = linreg.predict(x_test)

# Score both splits: R2 and mean squared error, training first, then test.
lin_train_r2 = r2_score(y_train, y_train_pred)
lin_train_mse = mean_squared_error(y_train, y_train_pred)
lin_test_r2 = r2_score(y_test, y_test_pred)
lin_test_mse = mean_squared_error(y_test, y_test_pred)

# Collect the scores into a one-row summary table.
# The flat list becomes a single column, which is transposed into a single row.
model_perf = pd.DataFrame(
    ['LinearRegression', lin_train_r2, lin_test_r2, lin_train_mse, lin_test_mse]
).T
model_perf.columns = ['Method', 'Training R2', 'Test R2', 'Training MSE', 'Test MSE']
model_perf

# Repeat the same workflow with a random forest.
# The split criterion is left at its default (squared error): the old 'mse'
# spelling was removed in scikit-learn 1.2 and now raises on construction,
# while the default means the same thing in every version.
forest = RandomForestRegressor(n_estimators=100, random_state=1, n_jobs=-1)

# The actual training. The target is flattened to 1-D because a column vector
# of shape (n, 1) makes scikit-learn emit a DataConversionWarning.
forest.fit(x_train, np.ravel(y_train))
forest_train_pred = forest.predict(x_train)
forest_test_pred = forest.predict(x_test)

# Score both splits: mean squared error and R2.
forest_mse_train = mean_squared_error(y_train, forest_train_pred)
forest_mse_test = mean_squared_error(y_test, forest_test_pred)
forest_r2_train = r2_score(y_train, forest_train_pred)
forest_r2_test = r2_score(y_test, forest_test_pred)

# Collect the scores into a one-row summary table.
# The flat list becomes a single column, which is transposed into a single row.
forest_model_perf = pd.DataFrame([
    'RandomForestRegressor', forest_mse_train, forest_mse_test, forest_r2_train, forest_r2_test
]).transpose()

forest_model_perf.columns = ['Method', 'Forest Mse Train', 'Forest Mse Test', 'Forest R2 Train', 'Forest R2 Test']
forest_model_perf

0