# Import Packages
Let's load all the packages needed for this notebook:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

In [None]:
import tensorflow as tf
tf.__version__

# The Dataset
For this notebook we will use the Tabular Playground Series - Feb 2022 competition dataset.

Let's load the training set from its Kaggle input path and inspect its structure:

In [None]:
# Read the competition training set from its Kaggle input path and print
# the column / dtype summary of the resulting frame.
train_csv_path = '/kaggle/input/tabular-playground-series-feb-2022/train.csv'
data: pd.DataFrame = pd.read_csv(train_csv_path)
data.info()

# Quick Look at the Data
Let’s take a look at the top five rows:

In [None]:
data.head()

# Remove any columns that aren't needed from the dataset.

In [None]:
# `row_id` is dropped here so it is not part of the feature matrix built later.
# errors='ignore' keeps the cell re-runnable: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
data = data.drop(columns=['row_id'], errors='ignore')
data.head()

In [None]:
# Bar chart of how many training rows belong to each target class.
plt.figure(figsize = (10, 6))
plt.title('Target distribution')
ax = sns.countplot(x = data['target'])
# Configure the tick labels once the category ticks exist on the axes.
plt.xticks(rotation = 30, ha='right')
# Annotate every bar with its count; a single call per container is enough.
ax.bar_label(ax.containers[0])
plt.show()

In [None]:
len(data['target'].unique())

In [None]:
classes = data['target'].unique()
classes

In [None]:
# Species name -> integer class id used to encode the training target.
target_dic = {'Streptococcus_pyogenes': 0,
              'Salmonella_enterica' : 1,
              'Enterococcus_hirae' : 2,
              'Escherichia_coli' : 3,
              'Campylobacter_jejuni' : 4,
              'Streptococcus_pneumoniae' : 5,
              'Staphylococcus_aureus' : 6,
              'Escherichia_fergusonii' : 7,
              'Bacteroides_fragilis' : 8,
              'Klebsiella_pneumoniae' : 9}

def convert_into_num(target, mapping=target_dic):
    """Return the integer class id for a species name.

    Parameters
    ----------
    target : hashable
        Species name to encode.
    mapping : dict, optional
        Name -> id lookup table. It defaults to the `target_dic` defined
        directly above and is captured when this function is defined, so a
        later cell that rebinds the global name `target_dic` (this notebook
        reuses that name for the id -> name mapping) cannot change the
        result of re-running the encoding step.

    Returns
    -------
    int or None
        The class id, or None when `target` is not a key of `mapping`.
    """
    return mapping.get(target)

In [None]:
data.head()

# Checking null values

In [None]:
data.isnull().sum()

# Split Data

In [None]:
# Labels stay a pandas Series of species names; every other column becomes
# one column of the NumPy feature matrix.
y = data['target']
X = data.drop(columns='target').to_numpy()

In [None]:
X.shape, y.shape

In [None]:
# Encode every species name as its integer class id and preview the first five.
new_y = list(map(convert_into_num, y))
new_y[:5]

In [None]:
new_y = np.array(new_y)

# Splitting the training set

In [None]:
from sklearn.model_selection import train_test_split

tf.random.set_seed(42)

# Hold out 1% of the rows as the test set; the rest is split again later
# into training and validation data.
holdout_parts = train_test_split(X, new_y, test_size=0.01, random_state=42)
X_train_valid, X_test, y_train_valid, y_test = holdout_parts

tuple(part.shape for part in holdout_parts)

In [None]:
from sklearn.model_selection import train_test_split

tf.random.set_seed(42)

# Carve 5% of the remaining rows off as the validation set passed to fit().
train_valid_parts = train_test_split(X_train_valid, y_train_valid, test_size=0.05, random_state=42)
X_train, X_valid, y_train, y_valid = train_valid_parts

tuple(part.shape for part in train_valid_parts)

# Building and Training our model

In [None]:
# Build, compile and train a small fully connected classifier on the training
# split, validating on the held-out validation split after every epoch.

# Seed TensorFlow's global random generator before any layer is created.
tf.random.set_seed(42)

# 1. Create a model
model_1 = tf.keras.Sequential([
           tf.keras.layers.Dense(120, activation='relu'),
           tf.keras.layers.Dense(100, activation='selu'),
           tf.keras.layers.Dense(50, activation='selu'),
           # One softmax output per target class (the label mapping has 10 entries).
           tf.keras.layers.Dense(10, activation='softmax')
])

# 2. Compile the model
# CategoricalCrossentropy is given one-hot targets, which is why the integer
# labels are passed through tf.one_hot(..., depth=10) in the fit call below.
model_1.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
                 optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                 metrics=['accuracy'])

# 3. Fit the model
# `history` keeps the per-epoch loss/accuracy values that the following cells plot.
history = model_1.fit(X_train, 
                      tf.one_hot(y_train, depth=10), 
                      epochs=25,
                      verbose = 1,
                      validation_data=(X_valid, tf.one_hot(y_valid, depth=10)))

In [None]:
# Training vs. validation accuracy per epoch, on a fixed 0-1 axis.
for curve_name in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[curve_name], label=curve_name)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.ylim(0.0, 1.0)
plt.legend(loc='lower right');

In [None]:
# Training vs. validation loss per epoch; the y-axis is clipped to [0, 1].
for curve_name in ('loss', 'val_loss'):
    plt.plot(history.history[curve_name], label=curve_name)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.ylim(0.0, 1)
plt.legend(loc='upper right');

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Predicted class id = position of the largest softmax output in each row.
test_pred_ids = model_1.predict(X_test).argmax(axis=1)
test_confusion = confusion_matrix(y_true=y_test, y_pred=test_pred_ids)

# Heatmap of true class (rows) against predicted class (columns), with integer counts.
plt.figure(figsize=(15, 10))
sns.heatmap(test_confusion, annot=True, fmt="d");

In [None]:
# The model was compiled with a single metric ('accuracy'), so evaluate()
# yields [loss, accuracy]; index 1 is the test accuracy, shown here as a percentage.
model_1.evaluate(X_test, tf.one_hot(y_test, depth=10))[1] * 100

In [None]:
model_1.summary()

In [None]:
# Let's check out a way of viewing our deep learning models
from tensorflow.keras.utils import plot_model

# See the inputs and outputs of each layer
plot_model(model_1, show_shapes=True)

# Test Data

In [None]:
test_dataset = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2022/test.csv')

In [None]:
# Set the row identifiers aside for the submission file, then keep only the
# remaining columns as model input.
test_passengerIds = test_dataset['row_id'].values
test_dataset = test_dataset.drop(columns=['row_id'])

In [None]:
test_dataset.head()

In [None]:
test_dataset.isna().sum()

In [None]:
test_dataset.info()

In [None]:
# One row of class scores per test sample; argmax along axis 1 picks the
# highest-scoring class id for each row.
test_probabilities = model_1.predict(test_dataset)
y_pred = test_probabilities.argmax(axis=1)
y_pred.shape

In [None]:
y_pred[:5]

In [None]:
# Integer class id -> species name, used to decode the model's predictions.
target_dic = {0: 'Streptococcus_pyogenes',
              1: 'Salmonella_enterica',
              2: 'Enterococcus_hirae',
              3: 'Escherichia_coli',
              4: 'Campylobacter_jejuni',
              5: 'Streptococcus_pneumoniae',
              6: 'Staphylococcus_aureus',
              7: 'Escherichia_fergusonii',
              8: 'Bacteroides_fragilis',
              9: 'Klebsiella_pneumoniae'}

def convert_into_category(target, mapping=target_dic):
    """Return the species name for an integer class id.

    Parameters
    ----------
    target : hashable
        Class id to decode (plain ints and NumPy integer scalars both work
        as dictionary keys).
    mapping : dict, optional
        Id -> name lookup table. It defaults to the `target_dic` defined
        directly above and is captured when this function is defined, so
        rebinding the global name `target_dic` afterwards (the same name is
        used earlier in the notebook for the name -> id mapping) cannot turn
        every decoded prediction into None.

    Returns
    -------
    str or None
        The species name, or None when `target` is not a key of `mapping`.
    """
    return mapping.get(target)

In [None]:
# Decode every predicted class id back to its species name and preview the first five.
y_pred_new = list(map(convert_into_category, y_pred))
y_pred_new[:5]

In [None]:
test_passengerIds.shape

In [None]:
# Two-column submission frame: the saved row ids next to the decoded
# predictions, written to CSV without the index column.
submission_columns = {'row_id': test_passengerIds, 'target': y_pred_new}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)

In [None]:
output