In [74]:
import pandas as pd
import numpy as np

np.random.seed(1212)

import keras
from keras.models import Model
from keras.layers import *
from keras import optimizers

In [75]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [76]:
df_train.head() # 784 features, 1 label

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [77]:
df_features = df_train.iloc[:, 1:785]
df_label = df_train.iloc[:, 0]

X_test = df_test.iloc[:, 0:784]

print(X_test.shape)

(28000, 784)


In [78]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and cross-validation sets
X_train, X_cv, y_train, y_cv = train_test_split(
    df_features, df_label,
    test_size=0.2,
    random_state=1212
)

# Convert DataFrames to NumPy arrays and reshape
X_train = X_train.values.reshape(-1, 784)  # Reshape automatically to (33600, 784)
X_cv = X_cv.values.reshape(-1, 784)        # Reshape to (8400, 784)
X_test = X_test.values.reshape(-1, 784)    # Reshape to (28000, 784)


In [79]:
print((min(X_train[1]), max(X_train[1])))

(np.int64(0), np.int64(255))


In [80]:
# Feature Normalization
X_train = X_train.astype('float32'); X_cv= X_cv.astype('float32'); X_test = X_test.astype('float32')
X_train /= 255; X_cv /= 255; X_test /= 255

# Convert labels to One Hot Encoded
num_digits = 10
y_train = keras.utils.to_categorical(y_train, num_digits)
y_cv = keras.utils.to_categorical(y_cv, num_digits)

In [81]:
print((min(X_train[1]), max(X_train[1])))

(np.float32(0.0), np.float32(1.0))


In [82]:
# Input Parameters
n_input = 784 # number of features
n_hidden_1 = 300
n_hidden_2 = 100
n_hidden_3 = 100
n_hidden_4 = 200
num_digits = 10

In [83]:
Inp = Input(shape=(784,))
x = Dense(n_hidden_1, activation='relu', name = "Hidden_Layer_1")(Inp)
x = Dense(n_hidden_2, activation='relu', name = "Hidden_Layer_2")(x)
x = Dense(n_hidden_3, activation='relu', name = "Hidden_Layer_3")(x)
x = Dense(n_hidden_4, activation='relu', name = "Hidden_Layer_4")(x)
output = Dense(num_digits, activation='softmax', name = "Output_Layer")(x)


In [84]:
# Our model would have '6' layers - input layer, 4 hidden layer and 1 output layer
model = Model(Inp, output)
model.summary() # We have 297,910 parameters to estimate

In [85]:
from tensorflow.keras import optimizers

# Insert Hyperparameters
learning_rate = 0.1
training_epochs = 20
batch_size = 100

# Initialize SGD optimizer
sgd = optimizers.SGD(learning_rate=learning_rate)


In [86]:
# We rely on the plain vanilla Stochastic Gradient Descent as our optimizing methodology
model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])

In [87]:
history1 = model.fit(X_train, y_train,
                     batch_size = batch_size,
                     epochs = training_epochs,
                     verbose = 2,
                     validation_data=(X_cv, y_cv))

Epoch 1/20
336/336 - 4s - 11ms/step - accuracy: 0.4305 - loss: 1.9632 - val_accuracy: 0.7264 - val_loss: 1.1678
Epoch 2/20
336/336 - 2s - 7ms/step - accuracy: 0.8228 - loss: 0.7012 - val_accuracy: 0.8719 - val_loss: 0.4776
Epoch 3/20
336/336 - 2s - 7ms/step - accuracy: 0.8822 - loss: 0.4194 - val_accuracy: 0.8973 - val_loss: 0.3633
Epoch 4/20
336/336 - 3s - 9ms/step - accuracy: 0.9021 - loss: 0.3427 - val_accuracy: 0.9095 - val_loss: 0.3170
Epoch 5/20
336/336 - 3s - 10ms/step - accuracy: 0.9124 - loss: 0.3038 - val_accuracy: 0.9154 - val_loss: 0.2923
Epoch 6/20
336/336 - 4s - 13ms/step - accuracy: 0.9198 - loss: 0.2768 - val_accuracy: 0.9202 - val_loss: 0.2716
Epoch 7/20
336/336 - 2s - 7ms/step - accuracy: 0.9265 - loss: 0.2557 - val_accuracy: 0.9263 - val_loss: 0.2618
Epoch 8/20
336/336 - 2s - 7ms/step - accuracy: 0.9314 - loss: 0.2378 - val_accuracy: 0.9282 - val_loss: 0.2434
Epoch 9/20
336/336 - 3s - 9ms/step - accuracy: 0.9373 - loss: 0.2215 - val_accuracy: 0.9354 - val_loss: 0.229

In [88]:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam

# Hyperparameters
learning_rate = 0.001  # Adam usually works better with smaller learning rates
n_hidden_1 = 128
n_hidden_2 = 64
n_hidden_3 = 64
n_hidden_4 = 32
num_digits = 10  # For MNIST

# Build model
Inp = Input(shape=(784,))
x = Dense(n_hidden_1, activation='relu', name="Hidden_Layer_1")(Inp)
x = Dense(n_hidden_2, activation='relu', name="Hidden_Layer_2")(x)
x = Dense(n_hidden_3, activation='relu', name="Hidden_Layer_3")(x)
x = Dense(n_hidden_4, activation='relu', name="Hidden_Layer_4")(x)
output = Dense(num_digits, activation='softmax', name="Output_Layer")(x)

# Create and compile model
adam = Adam(learning_rate=learning_rate)
model2 = Model(Inp, output)

model2.compile(
    loss='categorical_crossentropy',
    optimizer=adam,
    metrics=['accuracy']
)


In [89]:
history2 = model2.fit(X_train, y_train,
                      batch_size = batch_size,
                      epochs = training_epochs,
                      verbose = 2,
                      validation_data=(X_cv, y_cv))

Epoch 1/20
336/336 - 5s - 14ms/step - accuracy: 0.8708 - loss: 0.4481 - val_accuracy: 0.9389 - val_loss: 0.2158
Epoch 2/20
336/336 - 4s - 12ms/step - accuracy: 0.9518 - loss: 0.1613 - val_accuracy: 0.9593 - val_loss: 0.1387
Epoch 3/20
336/336 - 3s - 8ms/step - accuracy: 0.9665 - loss: 0.1101 - val_accuracy: 0.9620 - val_loss: 0.1272
Epoch 4/20
336/336 - 2s - 6ms/step - accuracy: 0.9746 - loss: 0.0815 - val_accuracy: 0.9680 - val_loss: 0.1109
Epoch 5/20
336/336 - 2s - 7ms/step - accuracy: 0.9799 - loss: 0.0642 - val_accuracy: 0.9677 - val_loss: 0.1072
Epoch 6/20
336/336 - 3s - 8ms/step - accuracy: 0.9848 - loss: 0.0496 - val_accuracy: 0.9689 - val_loss: 0.1048
Epoch 7/20
336/336 - 4s - 13ms/step - accuracy: 0.9873 - loss: 0.0412 - val_accuracy: 0.9707 - val_loss: 0.1098
Epoch 8/20
336/336 - 3s - 8ms/step - accuracy: 0.9885 - loss: 0.0341 - val_accuracy: 0.9656 - val_loss: 0.1300
Epoch 9/20
336/336 - 2s - 6ms/step - accuracy: 0.9921 - loss: 0.0253 - val_accuracy: 0.9692 - val_loss: 0.113

In [90]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam


# Define the model architecture
Inp = Input(shape=(784,))
x = Dense(n_hidden_1, activation='relu', name="Hidden_Layer_1")(Inp)
x = Dense(n_hidden_2, activation='relu', name="Hidden_Layer_2")(x)
x = Dense(n_hidden_3, activation='relu', name="Hidden_Layer_3")(x)
x = Dense(n_hidden_4, activation='relu', name="Hidden_Layer_4")(x)
output = Dense(num_digits, activation='softmax', name="Output_Layer")(x)

# Instantiate optimizer
adam = Adam(learning_rate=learning_rate)

# Build and compile the model
model2a = Model(Inp, output)
model2a.compile(loss='categorical_crossentropy',
                optimizer=adam,
                metrics=['accuracy'])


In [91]:
history2a = model2a.fit(X_train, y_train,
                        batch_size = batch_size,
                        epochs = training_epochs,
                        verbose = 2,
                        validation_data=(X_cv, y_cv))

Epoch 1/20
336/336 - 5s - 16ms/step - accuracy: 0.8744 - loss: 0.4256 - val_accuracy: 0.9399 - val_loss: 0.2013
Epoch 2/20
336/336 - 2s - 7ms/step - accuracy: 0.9537 - loss: 0.1542 - val_accuracy: 0.9595 - val_loss: 0.1433
Epoch 3/20
336/336 - 4s - 11ms/step - accuracy: 0.9660 - loss: 0.1105 - val_accuracy: 0.9581 - val_loss: 0.1385
Epoch 4/20
336/336 - 2s - 6ms/step - accuracy: 0.9736 - loss: 0.0827 - val_accuracy: 0.9652 - val_loss: 0.1170
Epoch 5/20
336/336 - 2s - 7ms/step - accuracy: 0.9806 - loss: 0.0620 - val_accuracy: 0.9645 - val_loss: 0.1168
Epoch 6/20
336/336 - 2s - 6ms/step - accuracy: 0.9825 - loss: 0.0551 - val_accuracy: 0.9655 - val_loss: 0.1268
Epoch 7/20
336/336 - 2s - 6ms/step - accuracy: 0.9856 - loss: 0.0452 - val_accuracy: 0.9689 - val_loss: 0.1104
Epoch 8/20
336/336 - 2s - 6ms/step - accuracy: 0.9910 - loss: 0.0304 - val_accuracy: 0.9724 - val_loss: 0.1112
Epoch 9/20
336/336 - 3s - 9ms/step - accuracy: 0.9899 - loss: 0.0299 - val_accuracy: 0.9655 - val_loss: 0.1356

In [92]:

# Build model
Inp = Input(shape=(784,))
x = Dense(n_hidden_1, activation='relu', name="Hidden_Layer_1")(Inp)
x = Dense(n_hidden_2, activation='relu', name="Hidden_Layer_2")(x)
x = Dense(n_hidden_3, activation='relu', name="Hidden_Layer_3")(x)
x = Dense(n_hidden_4, activation='relu', name="Hidden_Layer_4")(x)
output = Dense(num_digits, activation='softmax', name="Output_Layer")(x)

# Optimizer
adam = Adam(learning_rate=learning_rate)

# Model
model2b = Model(Inp, output)
model2b.compile(
    loss='categorical_crossentropy',
    optimizer=adam,  # use the Adam instance
    metrics=['accuracy']
)


In [93]:
history2b = model2b.fit(X_train, y_train,
                        batch_size = batch_size,
                        epochs = training_epochs,
                            validation_data=(X_cv, y_cv))

Epoch 1/20
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.7342 - loss: 0.8552 - val_accuracy: 0.9437 - val_loss: 0.1931
Epoch 2/20
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.9519 - loss: 0.1594 - val_accuracy: 0.9567 - val_loss: 0.1431
Epoch 3/20
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.9666 - loss: 0.1071 - val_accuracy: 0.9619 - val_loss: 0.1316
Epoch 4/20
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.9749 - loss: 0.0804 - val_accuracy: 0.9690 - val_loss: 0.1041
Epoch 5/20
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.9828 - loss: 0.0578 - val_accuracy: 0.9702 - val_loss: 0.1018
Epoch 6/20
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.9857 - loss: 0.0443 - val_accuracy: 0.9675 - val_loss: 0.1116
Epoch 7/20
[1m336/336[0m 

In [94]:
# Input Parameters
n_input = 784 # number of features
n_hidden_1 = 300
n_hidden_2 = 100
n_hidden_3 = 100
n_hidden_4 = 100
n_hidden_5 = 200
num_digits = 10

In [95]:
Inp = Input(shape=(784,))
x = Dense(n_hidden_1, activation='relu', name = "Hidden_Layer_1")(Inp)
x = Dense(n_hidden_2, activation='relu', name = "Hidden_Layer_2")(x)
x = Dense(n_hidden_3, activation='relu', name = "Hidden_Layer_3")(x)
x = Dense(n_hidden_4, activation='relu', name = "Hidden_Layer_4")(x)
x = Dense(n_hidden_5, activation='relu', name = "Hidden_Layer_5")(x)
output = Dense(num_digits, activation='softmax', name = "Output_Layer")(x)

In [96]:
# Our model would have '7' layers - input layer, 5 hidden layer and 1 output layer
model3 = Model(Inp, output)
model3.summary() # We have 308,010 parameters to estimate

In [97]:
from tensorflow.keras.optimizers import Adam

# Define Adam optimizer with a specific learning rate
adam = Adam(learning_rate=0.01)

# Compile the model using the Adam instance
model3.compile(
    loss='categorical_crossentropy',
    optimizer=adam,
    metrics=['accuracy']
)


In [98]:
history3 = model3.fit(X_train, y_train,
                      batch_size = batch_size,
                      epochs = training_epochs,
                      validation_data=(X_cv, y_cv))

Epoch 1/20
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 13ms/step - accuracy: 0.7746 - loss: 0.6748 - val_accuracy: 0.9385 - val_loss: 0.2146
Epoch 2/20
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.9490 - loss: 0.1946 - val_accuracy: 0.9474 - val_loss: 0.2245
Epoch 3/20
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.9563 - loss: 0.1616 - val_accuracy: 0.9512 - val_loss: 0.1929
Epoch 4/20
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.9645 - loss: 0.1337 - val_accuracy: 0.9571 - val_loss: 0.1851
Epoch 5/20
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.9667 - loss: 0.1352 - val_accuracy: 0.9637 - val_loss: 0.1597
Epoch 6/20
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.9742 - loss: 0.1008 - val_accuracy: 0.9656 - val_loss: 0.1540
Epoch 7/20
[1m336/336

In [99]:
# Input Parameters
n_input = 784 # number of features
n_hidden_1 = 300
n_hidden_2 = 100
n_hidden_3 = 100
n_hidden_4 = 200
num_digits = 10

In [100]:
Inp = Input(shape=(784,))
x = Dense(n_hidden_1, activation='relu', name = "Hidden_Layer_1")(Inp)
x = Dropout(0.3)(x)
x = Dense(n_hidden_2, activation='relu', name = "Hidden_Layer_2")(x)
x = Dropout(0.3)(x)
x = Dense(n_hidden_3, activation='relu', name = "Hidden_Layer_3")(x)
x = Dropout(0.3)(x)
x = Dense(n_hidden_4, activation='relu', name = "Hidden_Layer_4")(x)
output = Dense(num_digits, activation='softmax', name = "Output_Layer")(x)

In [101]:
# Our model would have '6' layers - input layer, 4 hidden layer and 1 output layer
model4 = Model(Inp, output)
model4.summary() # We have 297,910 parameters to estimate

In [102]:
model4.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [103]:
test_pred = pd.DataFrame(model4.predict(X_test, batch_size=200))
test_pred = pd.DataFrame(test_pred.idxmax(axis = 1))
test_pred.index.name = 'ImageId'
test_pred = test_pred.rename(columns = {0: 'Label'}).reset_index()
test_pred['ImageId'] = test_pred['ImageId'] + 1

test_pred.head()

[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


Unnamed: 0,ImageId,Label
0,1,7
1,2,9
2,3,9
3,4,9
4,5,9


In [104]:
test_pred.to_csv('mnist_submission.csv', index = False)