In [None]:
# Use Case: Predicting the Profit of an Insurance Company Using Different Policy Types
# Model used: Multiple Linear Regression 

# Importing the libraries and dataset

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
# Load the sample workbook into a DataFrame.
ds = pd.read_excel('Insurance_Sample_Data.xlsx')

# Split by position: the last column is the target, every column before it is a feature.
feature_frame = ds.iloc[:, :-1]
target_series = ds.iloc[:, -1]
X = feature_frame.values
Y = target_series.values
print(ds)

            Policy Type  Premium Amount  Claims Amount  Number of Policies  \
0        Auto Insurance            1200            300                 100   
1        Home Insurance            1500            500                  80   
2        Life Insurance            2000            200                  50   
3      Health Insurance            2500            600                  70   
4      Travel Insurance             800            100                 120   
5  Commercial Insurance            3000           1000                  30   
6         Pet Insurance             600             50                 200   

   Customer Satisfaction Score  Profit (in $)  
0                          4.5            900  
1                          4.0           1000  
2                          4.8           1800  
3                          3.5           1900  
4                          4.2            700  
5                          4.6           2000  
6                          4.1            550  

In [42]:
# Feature matrix: every column of ds except the last.
print(X)

[['Auto Insurance' 1200 300 100 4.5]
 ['Home Insurance' 1500 500 80 4.0]
 ['Life Insurance' 2000 200 50 4.8]
 ['Health Insurance' 2500 600 70 3.5]
 ['Travel Insurance' 800 100 120 4.2]
 ['Commercial Insurance' 3000 1000 30 4.6]
 ['Pet Insurance' 600 50 200 4.1]]


In [43]:
# Target vector: the last column of ds.
print(Y)

[ 900 1000 1800 1900  700 2000  550]


# Taking care of missing data
If we have a large dataset and only about 1% of the values are missing, we can simply ignore those rows;
otherwise, we replace each missing value with the mean of its column.

In [44]:
from sklearn.impute import SimpleImputer

# Replace missing numeric values with the mean of their column.
# Column 0 holds the policy-type strings, so imputation starts at column 1.
# The slice runs to the end of the row (1:) rather than stopping at 1:-1, which
# would leave the last numeric feature (Customer Satisfaction Score) un-imputed.
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer.fit(X[:, 1:])
X[:, 1:] = imputer.transform(X[:, 1:])
print(X)

[['Auto Insurance' 1200.0 300.0 100.0 4.5]
 ['Home Insurance' 1500.0 500.0 80.0 4.0]
 ['Life Insurance' 2000.0 200.0 50.0 4.8]
 ['Health Insurance' 2500.0 600.0 70.0 3.5]
 ['Travel Insurance' 800.0 100.0 120.0 4.2]
 ['Commercial Insurance' 3000.0 1000.0 30.0 4.6]
 ['Pet Insurance' 600.0 50.0 200.0 4.1]]


# Encoding Categorical Data 
If any feature contains categorical data (i.e., string values), it must be converted to a numeric format.

In [46]:
# Encoding the independent variable
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (policy type) and pass the numeric columns through unchanged.
# sparse_threshold=0 makes the transformer always return a dense array. With the default
# threshold, a low-density result comes back as a SciPy sparse matrix, which np.array()
# would wrap in a 0-d object array instead of converting it to a 2-D array.
ct = ColumnTransformer(
    transformers = [('encoder', OneHotEncoder(), [0])],
    remainder = 'passthrough',
    sparse_threshold = 0,
)
X = np.array(ct.fit_transform(X))
print(X)

[[1.0 0.0 0.0 0.0 0.0 0.0 0.0 1200.0 300.0 100.0 4.5]
 [0.0 0.0 0.0 1.0 0.0 0.0 0.0 1500.0 500.0 80.0 4.0]
 [0.0 0.0 0.0 0.0 1.0 0.0 0.0 2000.0 200.0 50.0 4.8]
 [0.0 0.0 1.0 0.0 0.0 0.0 0.0 2500.0 600.0 70.0 3.5]
 [0.0 0.0 0.0 0.0 0.0 0.0 1.0 800.0 100.0 120.0 4.2]
 [0.0 1.0 0.0 0.0 0.0 0.0 0.0 3000.0 1000.0 30.0 4.6]
 [0.0 0.0 0.0 0.0 0.0 1.0 0.0 600.0 50.0 200.0 4.1]]


# Splitting the dataset into the Training and Test set

In [47]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
split_parts = train_test_split(X, Y, test_size = 0.2, random_state = 1)
X_train, X_test, Y_train, Y_test = split_parts
print(X_train)

[[0.0 0.0 0.0 1.0 0.0 0.0 0.0 1500.0 500.0 80.0 4.0]
 [1.0 0.0 0.0 0.0 0.0 0.0 0.0 1200.0 300.0 100.0 4.5]
 [0.0 0.0 0.0 0.0 0.0 0.0 1.0 800.0 100.0 120.0 4.2]
 [0.0 0.0 1.0 0.0 0.0 0.0 0.0 2500.0 600.0 70.0 3.5]
 [0.0 1.0 0.0 0.0 0.0 0.0 0.0 3000.0 1000.0 30.0 4.6]]


In [48]:
# Feature rows held out for testing.
print(X_test)

[[0.0 0.0 0.0 0.0 0.0 1.0 0.0 600.0 50.0 200.0 4.1]
 [0.0 0.0 0.0 0.0 1.0 0.0 0.0 2000.0 200.0 50.0 4.8]]


In [49]:
# Targets for the training rows.
print(Y_train)

[1000  900  700 1900 2000]


In [50]:
# Targets for the held-out test rows.
print(Y_test)

[ 550 1800]


# Feature Scaling

In [51]:
#feature scaling is not required here: ordinary least-squares linear regression learns a separate coefficient for each feature, which compensates for differences in feature scale

# Training the Multiple linear regression model on the Training set

In [52]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression of Y_train on all columns of X_train.
# NOTE(review): X_train has only 5 rows for 11 columns, so the fit is underdetermined
# and can match the training targets exactly; interpret the coefficients with caution.
regressor = LinearRegression()
regressor.fit(X_train, Y_train)

# Predicting the Test set result

In [53]:
# Predict profit for the held-out rows and print the predictions next to the true values.
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)

# column_stack turns the two 1-D arrays into the columns of one 2-D array:
# column 0 = predicted profit, column 1 = actual profit.
comparison = np.column_stack((y_pred, Y_test))
print(comparison)

[[ 557.42  550.  ]
 [1794.04 1800.  ]]


In [56]:
# Predict the profit for a single new policy.
# The raw feature row is run through the fitted ColumnTransformer (ct) so the policy
# type is one-hot encoded with the same column order the model was trained on, instead
# of hand-writing the 0/1 indicator columns.
# NOTE(review): a premium of 45000 is far above the largest premium in the data (3000),
# so this prediction is an extrapolation.
new_policy = [['Auto Insurance', 45000, 2000, 150, 4.2]]
print(regressor.predict(np.array(ct.transform(new_policy))))


[43021.21]


In [None]:
# For an Auto Insurance policy with a premium amount of $45,000, a claims amount of $2,000, 150 policies, and a customer satisfaction score of 4.2,
# the model predicts that AIG will make a profit of about $43,021.

In [57]:
# Fitted coefficients, one per column of the encoded feature matrix.
print(regressor.coef_)

[ 2.12e-04 -1.47e-03  4.41e-03 -4.79e-03  0.00e+00  0.00e+00  1.63e-03
  1.00e+00 -9.90e-01  9.90e-02 -2.66e-03]


In [58]:
# Fitted intercept (constant term) of the regression.
print(regressor.intercept_)

-12.853631188019563
