### Linear Regression - Medical Insurence

In [22]:
import pandas as pd
import numpy as np

from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import shapiro, kstest, normaltest,skew
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import json

import warnings
warnings.filterwarnings('ignore')

!pip install statsmodels

!pip install matplotlib

!pip install seaborn

### 1. PRoblem Statement

### 2. Data Gathering

In [9]:
df_co2 = pd.read_csv(r'data/CO2.csv')

df_co2

Unnamed: 0,Plant,Type,Treatment,conc,uptake
0,Qn1,Quebec,nonchilled,95,16.0
1,Qn1,Quebec,nonchilled,175,30.4
2,Qn1,Quebec,nonchilled,250,34.8
3,Qn1,Quebec,nonchilled,350,37.2
4,Qn1,Quebec,nonchilled,500,35.3
...,...,...,...,...,...
79,Mc3,Mississippi,chilled,250,17.9
80,Mc3,Mississippi,chilled,350,17.9
81,Mc3,Mississippi,chilled,500,17.9
82,Mc3,Mississippi,chilled,675,18.9


In [10]:
df_co2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Plant      84 non-null     object 
 1   Type       84 non-null     object 
 2   Treatment  84 non-null     object 
 3   conc       84 non-null     int64  
 4   uptake     84 non-null     float64
dtypes: float64(1), int64(1), object(3)
memory usage: 3.4+ KB


In [11]:
df_co2.isna().sum()

Plant        0
Type         0
Treatment    0
conc         0
uptake       0
dtype: int64

### 3. EDA

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   gender    1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [12]:
df_co2['Type'].value_counts()

Type
Quebec         42
Mississippi    42
Name: count, dtype: int64

In [13]:
df_co2['Type'].replace({'Quebec':1, 'Mississippi':0 }, inplace=True)

In [14]:
df_co2['Treatment'].value_counts()

Treatment
nonchilled    42
chilled       42
Name: count, dtype: int64

In [15]:
df_co2.replace({'nonchilled':0, 'chilled':1}, inplace=True)

In [16]:
df_encoded = pd.get_dummies(df_co2, columns = ['Plant'], dtype = int, prefix = 'Plant', drop_first=True)
df_encoded

Unnamed: 0,Type,Treatment,conc,uptake,Plant_Mc2,Plant_Mc3,Plant_Mn1,Plant_Mn2,Plant_Mn3,Plant_Qc1,Plant_Qc2,Plant_Qc3,Plant_Qn1,Plant_Qn2,Plant_Qn3
0,1,0,95,16.0,0,0,0,0,0,0,0,0,1,0,0
1,1,0,175,30.4,0,0,0,0,0,0,0,0,1,0,0
2,1,0,250,34.8,0,0,0,0,0,0,0,0,1,0,0
3,1,0,350,37.2,0,0,0,0,0,0,0,0,1,0,0
4,1,0,500,35.3,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,0,1,250,17.9,0,1,0,0,0,0,0,0,0,0,0
80,0,1,350,17.9,0,1,0,0,0,0,0,0,0,0,0
81,0,1,500,17.9,0,1,0,0,0,0,0,0,0,0,0
82,0,1,675,18.9,0,1,0,0,0,0,0,0,0,0,0


In [17]:
df_combined = pd.concat([df_co2.drop('Plant', axis = 1), df_encoded], axis=1)
df_combined

Unnamed: 0,Type,Treatment,conc,uptake,Type.1,Treatment.1,conc.1,uptake.1,Plant_Mc2,Plant_Mc3,Plant_Mn1,Plant_Mn2,Plant_Mn3,Plant_Qc1,Plant_Qc2,Plant_Qc3,Plant_Qn1,Plant_Qn2,Plant_Qn3
0,1,0,95,16.0,1,0,95,16.0,0,0,0,0,0,0,0,0,1,0,0
1,1,0,175,30.4,1,0,175,30.4,0,0,0,0,0,0,0,0,1,0,0
2,1,0,250,34.8,1,0,250,34.8,0,0,0,0,0,0,0,0,1,0,0
3,1,0,350,37.2,1,0,350,37.2,0,0,0,0,0,0,0,0,1,0,0
4,1,0,500,35.3,1,0,500,35.3,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,0,1,250,17.9,0,1,250,17.9,0,1,0,0,0,0,0,0,0,0,0
80,0,1,350,17.9,0,1,350,17.9,0,1,0,0,0,0,0,0,0,0,0
81,0,1,500,17.9,0,1,500,17.9,0,1,0,0,0,0,0,0,0,0,0
82,0,1,675,18.9,0,1,675,18.9,0,1,0,0,0,0,0,0,0,0,0


In [18]:
df_co2

Unnamed: 0,Plant,Type,Treatment,conc,uptake
0,Qn1,1,0,95,16.0
1,Qn1,1,0,175,30.4
2,Qn1,1,0,250,34.8
3,Qn1,1,0,350,37.2
4,Qn1,1,0,500,35.3
...,...,...,...,...,...
79,Mc3,0,1,250,17.9
80,Mc3,0,1,350,17.9
81,Mc3,0,1,500,17.9
82,Mc3,0,1,675,18.9


In [19]:
df_co2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Plant      84 non-null     object 
 1   Type       84 non-null     int64  
 2   Treatment  84 non-null     int64  
 3   conc       84 non-null     int64  
 4   uptake     84 non-null     float64
dtypes: float64(1), int64(3), object(1)
memory usage: 3.4+ KB


In [20]:
x = df_combined.drop('uptake', axis =1)
y = df_combined.uptake

x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=1, test_size= 0.2)
x_train.shape
df_combined

Unnamed: 0,Type,Treatment,conc,uptake,Type.1,Treatment.1,conc.1,uptake.1,Plant_Mc2,Plant_Mc3,Plant_Mn1,Plant_Mn2,Plant_Mn3,Plant_Qc1,Plant_Qc2,Plant_Qc3,Plant_Qn1,Plant_Qn2,Plant_Qn3
0,1,0,95,16.0,1,0,95,16.0,0,0,0,0,0,0,0,0,1,0,0
1,1,0,175,30.4,1,0,175,30.4,0,0,0,0,0,0,0,0,1,0,0
2,1,0,250,34.8,1,0,250,34.8,0,0,0,0,0,0,0,0,1,0,0
3,1,0,350,37.2,1,0,350,37.2,0,0,0,0,0,0,0,0,1,0,0
4,1,0,500,35.3,1,0,500,35.3,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,0,1,250,17.9,0,1,250,17.9,0,1,0,0,0,0,0,0,0,0,0
80,0,1,350,17.9,0,1,350,17.9,0,1,0,0,0,0,0,0,0,0,0
81,0,1,500,17.9,0,1,500,17.9,0,1,0,0,0,0,0,0,0,0,0
82,0,1,675,18.9,0,1,675,18.9,0,1,0,0,0,0,0,0,0,0,0


In [23]:
rf_reg = RandomForestRegressor(random_state=1)
rf_reg.fit(x_train, y_train)

In [24]:
rf_reg.feature_importances_

array([0.20184372, 0.04733904, 0.2450652 , 0.13892525, 0.06202529,
       0.20966579, 0.04403458, 0.00387539, 0.00081778, 0.00559994,
       0.00280676, 0.01192355, 0.0023249 , 0.0041089 , 0.00358298,
       0.00213024, 0.01393069])

In [25]:
y_pred_train = rf_reg.predict(x_train)
y_pred_train[50]

array([11.64, 11.64])

In [26]:
y_pred_train = rf_reg.predict(x_train)       #train data Evaluation

mse = mean_squared_error(y_train, y_pred_train)
print("MSE for ranf_training :",mse)

rmse = np.sqrt(mse)
print("RMSE ranf_training :",rmse)

mae = mean_absolute_error(y_train, y_pred_train)
print('MAE ranf_training :',mae)

r2_value = r2_score(y_train, y_pred_train)
print("R-squared Value ranf_training :",r2_value)

MSE for ranf_training : 0.8265828358208923
RMSE ranf_training : 0.9091660111447701
MAE ranf_training : 0.7182388059701488
R-squared Value ranf_training : 0.9930918979321328


In [27]:
# Test Data Evaluation
y_pred = rf_reg.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
print("MSE ranf_testing :",mse)

rmse = np.sqrt(mse)
print("RMSE ranf_testing:",rmse)

mae = mean_absolute_error(y_test, y_pred)
print('MAE ranf_testing :',mae)

r2_value = r2_score(y_test, y_pred)
print("R-squared Value ranf_testing :",r2_value)

MSE ranf_testing : 4.001520823529393
RMSE ranf_testing: 2.0003801697500885
MAE ranf_testing : 1.640235294117641
R-squared Value ranf_testing : 0.9596144182495804


### Model Training

In [29]:
with open("random_forest_model.pkl", 'wb') as f:
    pickle.dump(rf_reg,f)

In [30]:
df_co2

Unnamed: 0,Plant,Type,Treatment,conc,uptake
0,Qn1,1,0,95,16.0
1,Qn1,1,0,175,30.4
2,Qn1,1,0,250,34.8
3,Qn1,1,0,350,37.2
4,Qn1,1,0,500,35.3
...,...,...,...,...,...
79,Mc3,0,1,250,17.9
80,Mc3,0,1,350,17.9
81,Mc3,0,1,500,17.9
82,Mc3,0,1,675,18.9


df1

In [31]:
column_data = {
                "type": {'Quebec':1, 'Mississippi':0},
                "treatment" : {'nonchilled ':1, 'chilled ':0}
                }
with open("column_data.json", 'w') as f:
    json.dump(column_data,f)