In [91]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import PolynomialFeatures 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import torch.nn as nn
from torch import optim 
import torch
import torch.utils.data as Data
from sklearn.ensemble import AdaBoostRegressor

In [67]:
train_df = pd.read_csv("../input/allstate-claims-severity/train.csv")
print(train_df.shape)
train_df.head()

(188318, 132)


Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,1,A,B,A,B,A,A,A,A,B,...,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843,2213.18
1,2,A,B,A,A,A,A,A,A,B,...,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,1283.6
2,5,A,B,A,A,B,A,A,A,B,...,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,3005.09
3,10,B,B,A,B,A,A,A,A,B,...,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642,939.85
4,11,A,B,A,B,A,A,A,A,B,...,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606,2763.85


In [68]:
loss = train_df["loss"]
features = train_df.drop("loss", axis = 1)
print(loss.shape)
print(features.shape)

(188318,)
(188318, 131)


### Check for Skewness in the data. 

In [69]:
#Look at cont variables
features.describe()

Unnamed: 0,id,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14
count,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0
mean,294135.982561,0.493861,0.507188,0.498918,0.491812,0.487428,0.490945,0.48497,0.486437,0.485506,0.498066,0.493511,0.49315,0.493138,0.495717
std,169336.084867,0.18764,0.207202,0.202105,0.211292,0.209027,0.205273,0.17845,0.19937,0.18166,0.185877,0.209737,0.209427,0.212777,0.222488
min,1.0,1.6e-05,0.001149,0.002634,0.176921,0.281143,0.012683,0.069503,0.23688,8e-05,0.0,0.035321,0.036232,0.000228,0.179722
25%,147748.25,0.34609,0.358319,0.336963,0.327354,0.281143,0.336105,0.350175,0.3128,0.35897,0.36458,0.310961,0.311661,0.315758,0.29461
50%,294539.5,0.475784,0.555782,0.527991,0.452887,0.422268,0.440945,0.438285,0.44106,0.44145,0.46119,0.457203,0.462286,0.363547,0.407403
75%,440680.5,0.623912,0.681761,0.634224,0.652072,0.643315,0.655021,0.591045,0.62358,0.56682,0.61459,0.678924,0.675759,0.689974,0.724623
max,587633.0,0.984975,0.862654,0.944251,0.954297,0.983674,0.997162,1.0,0.9802,0.9954,0.99498,0.998742,0.998484,0.988494,0.844848


In [70]:
#Only one variable is over 1 skewness which is pretty good. Feature Data does not need skewness
cont_skewness_dict = dict()
for columns in features.columns:
    if columns[:4] == "cont":
        column_skewness = eval('features["{}"].skew()'.format(columns))
        cont_skewness_dict[columns] = column_skewness
print(cont_skewness_dict)
for value in cont_skewness_dict:
    if cont_skewness_dict[value] > 1 or cont_skewness_dict[value] < -1:
        print("Data skewness over at one at", str(value))
        break

#Loss needs to be logged
print(loss.skew())

{'cont1': 0.5164240212162501, 'cont2': -0.3109412513683013, 'cont3': -0.010002283912088425, 'cont4': 0.41609602949567703, 'cont5': 0.6816224364137877, 'cont6': 0.4612142679626868, 'cont7': 0.8260528331279865, 'cont8': 0.6766340713246528, 'cont9': 1.0724287198115823, 'cont10': 0.35500094742512944, 'cont11': 0.28082142843754204, 'cont12': 0.29199208040362884, 'cont13': 0.38074220048057467, 'cont14': 0.24867408719289721}
Data skewness over at one at cont9
3.7949583775378604


In [71]:
#Lets remove outliers in loss
average_loss = np.mean((loss))
maximum_loss = np.max((loss))
print("Average_loss {}".format(average_loss))
print("Maximum_loss {}".format(maximum_loss))



Average_loss 3037.3376856699792
Maximum_loss 121012.25


In [72]:
features_train = pd.get_dummies(features)
loss_train = np.log(loss)

X_train, X_test, y_train, y_test = train_test_split(features_train, loss_train, test_size = 0.33)

#Lets run a baseline model
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)
base_predictions = linear_regression.predict(X_test)


print("Mean squared error {}".format(mean_squared_error(np.exp(y_test), np.exp(base_predictions))))
print("Mean absolute error {}".format(mean_absolute_error(np.exp(y_test), np.exp(base_predictions))))



Mean squared error 4990952.6796163805
Mean absolute error 1249.7197229205242


In [73]:
loss_train = np.log(loss)

### Here we are going to use PCA or Principal Component Analysis

In [74]:
#Lets get the columns for cont variables and category variables 
skimmed_features = features.drop("id", axis = 1)
category_features = skimmed_features.iloc[:, :116]
cont_features = skimmed_features.iloc[:, 116:]

In [75]:
print(category_features.shape)
print(cont_features.shape)

(188318, 116)
(188318, 14)


In [76]:
pca = PCA(n_components = 9)
cont_reduced = pca.fit_transform(cont_features)
print(cont_reduced.shape)

cont_explained = 0
for value in pca.explained_variance_ratio_:
    cont_explained += value
cont_explained
    

(188318, 9)


0.9697930991790539

In [77]:
sparse_category_features = pd.get_dummies(category_features)
sparse_category_features.shape

(188318, 1139)

In [78]:
pca = PCA(n_components = 150)
cat_reduced = pca.fit_transform(sparse_category_features)
print(cat_reduced.shape)

cat_explained = 0
for value in pca.explained_variance_ratio_:
    cat_explained += value
cat_explained

(188318, 150)


0.9166537727534473

In [79]:
#Now we are going to combine the data into one dataframe.
reduced_feature = np.hstack((cat_reduced,cont_reduced))
print("Reduced PCA Dataset: {}".format(reduced_feature.shape))

Reduced PCA Dataset: (188318, 159)


### Now we are going to try to run some supervised Models on the data

In [80]:
print(type(reduced_feature))
print("Did not use fit_transform with PCA")
print(type(loss_train))

<class 'numpy.ndarray'>
Did not use fit_transform with PCA
<class 'pandas.core.series.Series'>


In [81]:
X_train, X_test, y_train, y_test = train_test_split(reduced_feature, loss_train, test_size = 0.33)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(126173, 159)
(126173,)
(62145, 159)
(62145,)


In [82]:
def find_scores(pred, y_true):
    print("The Means absolute error: {}".format(mean_absolute_error(np.exp(y_true), np.exp(pred))))
    print("The Mean Squared error: {}".format(mean_squared_error(np.exp(y_true), np.exp(pred))))

In [83]:
print(type(X_train))
print(type(y_train))

<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


In [84]:
#Lets run a polynomial regression

# polynomial_object = PolynomialFeatures(2)
# poly_data = polynomial_object.fit_transform(X_train)

# linear_regression = LinearRegression()
# linear_regression.fit(X_train, y_train)
# polynomial_pred = linear_regression.predict(X_test)

# find_scores(polynomial_pred, y_test)
#Uncomment to use Polynomial Regression. Note: Takes a lot of RAM. 

## Decision Tree Regressor Model

In [85]:
#Now lets run a Decision Tree Regressor except now we will use GridSearchCV to validate the best model
parameters = {"max_depth": [5, 10]}
tree_scoring = make_scorer(mean_squared_error)
tree_regressor = DecisionTreeRegressor()
grid = GridSearchCV(tree_regressor, parameters, scoring = tree_scoring)
grid_fit = grid.fit(X_train, y_train)
best_tree = grid_fit.best_estimator_
print("Best estimator: {}".format(best_tree))

best_tree_fit = best_tree.fit(X_train, y_train)
tree_predictions = best_tree_fit.predict(X_test)

find_scores(tree_predictions, y_test)

Best estimator: DecisionTreeRegressor(max_depth=5)
The Means absolute error: 1423.8180418369036
The Mean Squared error: 6033486.5566932885


## Random Forest Model (Bagging Model)

In [86]:
# parameters = {"n_estimators": [5, 10]}
# random_forest = RandomForestRegressor(max_depth = 5)
# grid = GridSearchCV(random_forest, parameters, scoring = absolute_scoring)
# grid_fit = grid.fit(X_train, y_train)
# best_forest = grid_fit.best_estimator_
# print("Best random Forest{}".format(best_forest))
# print("The run time{}".format(grid_fit.refit_time_))

# best_forest_fit = best_forest.fit(X_train, y_train)
# forest_predictions = best_forest_fit.predict(X_test)

random_forest = RandomForestRegressor(n_estimators = 10, max_depth = 5)
random_forest_fit = random_forest.fit(X_train, y_train)
forest_predictions = random_forest_fit.predict(X_test)

find_scores(forest_predictions, y_test)

The Means absolute error: 1393.0462689440915
The Mean Squared error: 5888803.927847286


## Lasso Model (Regularized Linear Regression)

In [88]:
parameters = {"alpha": [0.5, 1, 2, 5]}
absolute_scoring = make_scorer(mean_absolute_error)
lasso_model = Lasso()
grid = GridSearchCV(lasso_model, parameters, scoring = absolute_scoring)
grid_fit = grid.fit(X_train, y_train)
best_lasso = grid_fit.best_estimator_
print("Best Lasso Model L1 Regularization: {}".format(best_lasso))
print("The run time: {}".format(grid_fit.refit_time_))

best_lasso_fit = best_lasso.fit(X_train, y_train)
lasso_predictions = best_lasso_fit.predict(X_test)

find_scores(lasso_predictions, y_test)

Best Lasso Model L1 Regularization: Lasso(alpha=0.5)
The run time: 0.2651042938232422
The Means absolute error: 1800.2204337642486
The Mean Squared error: 9333794.605318602


In [89]:
X_train.shape

(126173, 159)

## AdaBoostRegressor Model

In [92]:
ada_boost = AdaBoostRegressor(n_estimators = 10, learning_rate = 0.5)
ada_boost_fit = ada_boost.fit(X_train, y_train)
ada_boost_predictions = ada_boost.predict(X_test)

find_scores(ada_boost_predictions, y_test)

The Means absolute error: 1493.484300079309
The Mean Squared error: 6822108.09384632


## Linear Neural Network Pytorch

In [93]:
class Net(nn.Module):
    
    def __init__(self):
        super(Net, self).__init__()
        
        self.fc1 = nn.Linear(159, 30)
        self.fc2 = nn.Linear(30, 1)
        
        self.tanh = nn.Tanh()
        
        
    def forward(self, x):
        
        #x shape (1, 159)
        x = self.tanh(self.fc1(x))
        x = self.fc2(x)
        
        return x
        
    

In [94]:
net = Net()
optimizer = optim.Adam(net.parameters(), lr = 0.01)
criterion = nn.MSELoss()
for epoch in range(151):
    
    epoch_loss = 0
    
    optimizer.zero_grad()
    X_train = torch.FloatTensor(X_train)
    y_train = (torch.FloatTensor(y_train)).view(-1, 1)
    output = net(X_train)
    loss = criterion(output, y_train)
    loss.backward()
    optimizer.step()
    epoch_loss += loss.item()
    
    if epoch % 10 == 0:
        
        print("Epoch: {}".format(epoch))
        print("Loss: {:.3f}".format(np.exp(epoch_loss)))
    

Epoch: 0
Loss: 9855272550329797417369600.000
Epoch: 10
Loss: 2819477742692118036480.000
Epoch: 20
Loss: 2004717669562569.250
Epoch: 30
Loss: 20682669.206
Epoch: 40
Loss: 33.357
Epoch: 50
Loss: 1.693
Epoch: 60
Loss: 2.015
Epoch: 70
Loss: 1.672
Epoch: 80
Loss: 1.555
Epoch: 90
Loss: 1.480
Epoch: 100
Loss: 1.452
Epoch: 110
Loss: 1.432
Epoch: 120
Loss: 1.417
Epoch: 130
Loss: 1.406
Epoch: 140
Loss: 1.398
Epoch: 150
Loss: 1.391


In [95]:
X_test = torch.FloatTensor(X_test)
y_test = (torch.Tensor(y_test.values)).view(-1, 1)
test_output = net(X_test)
loss = criterion(test_output, y_test)
print("Loss: {}".format(torch.exp(loss)))

Loss: 1.427153468132019


In [96]:
print(y_test[0:10])

tensor([[8.3665],
        [8.5858],
        [7.6625],
        [6.6366],
        [7.3773],
        [7.7619],
        [8.5707],
        [8.0258],
        [7.5327],
        [6.8029]])


In [97]:
net(X_test[0:10])

tensor([[8.2666],
        [8.5761],
        [7.9743],
        [6.7963],
        [6.7228],
        [7.6892],
        [8.5619],
        [8.4828],
        [7.3714],
        [7.2460]], grad_fn=<AddmmBackward>)

## How would you batch data

pandas series --> numpy_array --> torch tensor

``` for (batch_i), images, labels in enumerate(loader): ```

In [98]:
torch_dataset = Data.TensorDataset(X_train, y_train)

loader = Data.DataLoader(
    dataset=torch_dataset, 
    batch_size=32)

In [99]:
iterator = iter(loader)
data, label = iterator.next()

In [100]:
print(data)
print(data.shape)

tensor([[ 9.0991e-03, -1.0198e+00,  2.0920e+00,  ..., -5.8618e-02,
         -1.0392e-01,  2.1025e-01],
        [-2.4088e+00,  4.2594e-01,  1.8614e+00,  ...,  7.7747e-02,
         -1.0327e-01,  1.3554e-02],
        [ 2.5770e+00,  1.2572e+00, -3.8530e-01,  ...,  1.0183e-01,
         -1.7675e-03,  1.6873e-01],
        ...,
        [-6.6574e-01, -1.7135e+00, -3.8765e-01,  ...,  7.0681e-02,
          2.6254e-01,  1.0193e-01],
        [-2.7694e+00, -5.5856e-01,  2.7902e-02,  ..., -5.1199e-02,
         -3.7130e-01,  1.7464e-01],
        [ 8.8886e-01, -1.8188e+00,  1.7996e+00,  ...,  1.7245e-01,
         -6.7208e-02,  7.4193e-02]])
torch.Size([32, 159])


In [101]:
print(label)
print(label.shape)

tensor([[7.7498],
        [8.2134],
        [9.7236],
        [7.8824],
        [8.7793],
        [8.8634],
        [7.9271],
        [8.7014],
        [7.7014],
        [8.2393],
        [7.0449],
        [6.1065],
        [7.3090],
        [8.0274],
        [7.5626],
        [7.1166],
        [8.4187],
        [7.7164],
        [8.3868],
        [7.3674],
        [8.5139],
        [7.0270],
        [8.8776],
        [8.8213],
        [6.8677],
        [7.9324],
        [7.0636],
        [8.1018],
        [7.4904],
        [7.8749],
        [8.1581],
        [7.8240]])
torch.Size([32, 1])
