# Neural Network Regression - Stratified Train-Test Split

# Project 3 - Predicting the Age of Abalone

In [1]:
# Dependencies

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
# NOTE(review): the layers come from the standalone `keras` package while the
# model class comes from `tensorflow.keras` - confirm both resolve to the same
# Keras implementation in the target environment.
from keras.layers import Dense, Activation, Flatten
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

## Data Preprocessing

In [2]:
# Load the raw (unscaled) abalone measurements and preview the first rows
unscaled_csv_path = './Dataset/abalone_unscaled_data.csv'
unscaled_df = pd.read_csv(unscaled_csv_path)
unscaled_df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings,Age
0,M,91,73,19,102.8,44.9,20.2,30.0,15,16.5
1,M,70,53,18,45.1,19.9,9.7,14.0,7,8.5
2,F,106,84,27,135.4,51.3,28.3,42.0,9,10.5
3,M,88,73,25,103.2,43.1,22.8,31.0,10,11.5
4,I,66,51,16,41.0,17.9,7.9,11.0,7,8.5


In [3]:
# Label-encode the categorical "Sex" column into a new integer column.
# LabelEncoder numbers the classes in sorted order, which gives:
#       Female (F) => 0
#       Infant (I) => 1
#       Male   (M) => 2
# `preprocessing` is imported with the other dependencies at the top of the notebook.
le = preprocessing.LabelEncoder()
unscaled_df['Sex_LabelEncoded'] = le.fit_transform(unscaled_df['Sex'])
unscaled_df.head()


Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings,Age,Sex_LabelEncoded
0,M,91,73,19,102.8,44.9,20.2,30.0,15,16.5,2
1,M,70,53,18,45.1,19.9,9.7,14.0,7,8.5,2
2,F,106,84,27,135.4,51.3,28.3,42.0,9,10.5,0
3,M,88,73,25,103.2,43.1,22.8,31.0,10,11.5,2
4,I,66,51,16,41.0,17.9,7.9,11.0,7,8.5,1


In [4]:
# One-hot encode "Sex": the original column is replaced by the
# Sex_F / Sex_I / Sex_M indicator columns
unscaled_df = pd.get_dummies(
    unscaled_df,
    columns=['Sex'],
    prefix=['Sex'],
)
unscaled_df.head()

Unnamed: 0,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings,Age,Sex_LabelEncoded,Sex_F,Sex_I,Sex_M
0,91,73,19,102.8,44.9,20.2,30.0,15,16.5,2,0,0,1
1,70,53,18,45.1,19.9,9.7,14.0,7,8.5,2,0,0,1
2,106,84,27,135.4,51.3,28.3,42.0,9,10.5,0,1,0,0
3,88,73,25,103.2,43.1,22.8,31.0,10,11.5,2,0,0,1
4,66,51,16,41.0,17.9,7.9,11.0,7,8.5,1,0,1,0


In [5]:
# Reorder the columns: encoded Sex variants first, then the physical
# measurements, and the two target candidates (Rings, Age) last
column_order = [
    "Sex_LabelEncoded",
    "Sex_M",
    "Sex_F",
    "Sex_I",
    "Length",
    "Diameter",
    "Height",
    "Whole_weight",
    "Shucked_weight",
    "Viscera_weight",
    "Shell_weight",
    "Rings",
    "Age",
]
unscaled_df = unscaled_df.reindex(columns=column_order)
unscaled_df.head()

Unnamed: 0,Sex_LabelEncoded,Sex_M,Sex_F,Sex_I,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings,Age
0,2,1,0,0,91,73,19,102.8,44.9,20.2,30.0,15,16.5
1,2,1,0,0,70,53,18,45.1,19.9,9.7,14.0,7,8.5
2,0,0,1,0,106,84,27,135.4,51.3,28.3,42.0,9,10.5
3,2,1,0,0,88,73,25,103.2,43.1,22.8,31.0,10,11.5
4,1,0,0,1,66,51,16,41.0,17.9,7.9,11.0,7,8.5


## Determine X and y (using the one-hot encoded Sex columns and y=Rings)

In [6]:
# Feature matrix for the neural network regression.
# Sex enters only through its one-hot indicator columns; the integer
# Sex_LabelEncoded column is deliberately left out, as are Rings and Age.
feature_columns = [
    'Sex_M', 'Sex_F', 'Sex_I',
    'Length', 'Diameter', 'Height',
    'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight',
]
X = unscaled_df[feature_columns]
X

Unnamed: 0,Sex_M,Sex_F,Sex_I,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight
0,1,0,0,91,73,19,102.8,44.9,20.2,30.0
1,1,0,0,70,53,18,45.1,19.9,9.7,14.0
2,0,1,0,106,84,27,135.4,51.3,28.3,42.0
3,1,0,0,88,73,25,103.2,43.1,22.8,31.0
4,0,0,1,66,51,16,41.0,17.9,7.9,11.0
...,...,...,...,...,...,...,...,...,...,...
4172,0,1,0,113,90,33,177.4,74.0,47.8,49.8
4173,1,0,0,118,88,27,193.2,87.8,42.9,52.1
4174,1,0,0,120,95,41,235.2,105.1,57.5,61.6
4175,0,1,0,125,97,30,218.9,106.2,52.2,59.2


## Prepare Data for Stratification

In [7]:
# Target: the ring count, kept as a plain numeric column for regression.
# The following cells find ring counts that occur only once and remove those
# rows, so that the stratified train-test split on this column will work.
y = unscaled_df.loc[:, "Rings"]
y

0       15
1        7
2        9
3       10
4        7
        ..
4172    11
4173    10
4174     9
4175    10
4176    12
Name: Rings, Length: 4177, dtype: int64

In [8]:
# Largest ring count present in the target
y.max()

29

In [9]:
# Smallest ring count present in the target
y.min()

1

In [10]:
# Number of distinct ring counts (i.e. candidate strata)
len(y.unique())

28

In [11]:
# Count how many rows fall into each ring count: {ring_count: n_rows}
ring_values, ring_counts = np.unique(y.to_numpy(), return_counts=True)
tally = dict(zip(ring_values, ring_counts))
tally

{1: 1,
 2: 1,
 3: 15,
 4: 57,
 5: 115,
 6: 259,
 7: 391,
 8: 568,
 9: 689,
 10: 634,
 11: 487,
 12: 267,
 13: 203,
 14: 126,
 15: 103,
 16: 67,
 17: 58,
 18: 42,
 19: 32,
 20: 26,
 21: 14,
 22: 6,
 23: 9,
 24: 2,
 25: 1,
 26: 1,
 27: 2,
 29: 1}

In [12]:
# Collect the ring counts that occur exactly once.
# These single-member classes are removed before the stratified split.
solo_classes = [ring_class for ring_class, n_rows in tally.items() if n_rows == 1]
solo_classes

[1, 2, 25, 26, 29]

In [13]:
# Look up the row label(s) belonging to each single-member ring class.
# Nothing is removed here; the collected labels are used to drop the rows
# from X and y in later cells.
solo_indices = []
for i, ring_class in enumerate(solo_classes):
    print(i)
    matching_rows = y[y == ring_class].index
    print(matching_rows)
    solo_indices.append(matching_rows)

0
Int64Index([236], dtype='int64')
1
Int64Index([719], dtype='int64')
2
Int64Index([2201], dtype='int64')
3
Int64Index([294], dtype='int64')
4
Int64Index([480], dtype='int64')


In [14]:
# Feature matrix shape before the single-member ring classes are dropped
X.shape

(4177, 10)

In [15]:
# Drop the single-member ring-class rows from the features, one label group at a time
for i, row_labels in enumerate(solo_indices):
    print(i)
    X = X.drop(row_labels)

0
1
2
3
4


In [16]:
# Feature matrix shape after the single-member ring classes were dropped
X.shape

(4172, 10)

In [17]:
# Target shape before the single-member ring classes are dropped
y.shape

(4177,)

In [18]:
# Drop the same single-member ring-class rows from the target so X and y stay aligned
for i, row_labels in enumerate(solo_indices):
    print(i)
    y = y.drop(row_labels)

0
1
2
3
4


In [19]:
# Target shape after the single-member ring classes were dropped
y.shape

(4172,)

## Perform Train-Test-Split

In [20]:
# Convert the target Series into a two-dimensional, single-column NumPy array
# (n_samples, 1); the scaling and modelling steps later on are given this form.
y = y.to_numpy().reshape(-1, 1)

In [21]:
# Target shape after reshaping into a single-column 2-D array
y.shape

(4172, 1)

In [22]:
# Split into training and test partitions.
# `stratify=y` keeps the ring-count distribution the same in both partitions,
# and `random_state` fixes the shuffle so the split is repeatable.
# `train_test_split` is imported with the other dependencies at the top of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

## Scale Data and Evaluate


In [23]:
# Standardize features and target. Each scaler is fitted on the training
# partition only and then applied to the test partition.
# The evaluation further down is done on the scaled values; predictions would
# have to be inverse-transformed with y_scaler to be read in ring units.
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [24]:
# Deep Neural Network layout adapted (and simplified) from
# https://towardsdatascience.com/deep-neural-networks-for-regression-problems-81321897ca33

# Seed NumPy and TensorFlow before any weights are created so that weight
# initialisation (and, as far as the backend allows, training) is repeatable
# on a fresh Restart & Run All. The value matches the random_state used for
# the train-test split.
np.random.seed(1)
tf.random.set_seed(1)

# Model set up
NN_model = Sequential()

# Every Dense layer carries an L1+L2 kernel regularizer (l1=0.01, l2=0.01)
# to help against overfitting.

# First hidden layer: 128 ReLU units. `input_dim` declares the input width,
# taken from the number of scaled feature columns.
NN_model.add(Dense(
    128,
    input_dim=X_train_scaled.shape[1],
    activation='relu',
    kernel_regularizer=keras.regularizers.l1_l2(l1=0.01, l2=0.01),
))

# Second hidden layer: 256 ReLU units.
NN_model.add(Dense(
    256,
    activation='relu',
    kernel_regularizer=keras.regularizers.l1_l2(l1=0.01, l2=0.01),
))

# Output layer: a single linear unit producing the (scaled) ring prediction.
NN_model.add(Dense(
    1,
    activation='linear',
    kernel_regularizer=keras.regularizers.l1_l2(l1=0.01, l2=0.01),
))

# Compile the network: mean absolute error loss, Adam optimizer.
NN_model.compile(loss='mean_absolute_error', optimizer='adam')
NN_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               1408      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 34,689
Trainable params: 34,689
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Train the model on the scaled training data.
# validation_split=0.2 makes Keras hold out 20% of the training rows for
# per-epoch validation. The returned History object is kept in `history` so the
# per-epoch loss / val_loss values remain available after training.
history = NN_model.fit(
    X_train_scaled,
    y_train_scaled,
    epochs=1000,
    batch_size=32,
    validation_split=0.2,
)
history

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<tensorflow.python.keras.callbacks.History at 0x1f838b8a250>

In [26]:
# Predict on the scaled test features; the outputs are in the scaled target
# space because the model was trained on y_train_scaled
predictions = NN_model.predict(x=X_test_scaled)
predictions.shape

(1043, 1)

In [27]:
# Test-set loss (mean absolute error, as compiled) on the scaled target
model_loss = NN_model.evaluate(x=X_test_scaled, y=y_test_scaled, verbose=2)

33/33 - 0s - loss: 0.6047


In [28]:
# Coefficient of determination of the test predictions against the scaled targets
r2 = r2_score(y_true=y_test_scaled, y_pred=predictions)

In [29]:
# Mean squared error of the test predictions, measured on the scaled target
mse = mean_squared_error(y_true=y_test_scaled, y_pred=predictions)

In [30]:
# Report the test metrics. Loss (MAE) and MSE are computed on the scaled target.
print(
    f"Loss: {model_loss}",
    f"R-squared: {r2}",
    f"Mean Squared Error: {mse}",
    sep="\n",
)

Loss: 0.604674756526947
R-squared: 0.46434683211806715
Mean Squared Error: 0.5358007125098536
