In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

In [114]:
raw_train_df = pd.read_csv("../data/raw/train.csv")
raw_test_df = pd.read_csv("../data/raw/test.csv")


In [115]:
print(raw_train_df.describe())
print(raw_test_df.describe())
display(raw_train_df.head(10))
display(raw_test_df.head(10))

                  id      feature_0      feature_1      feature_2  \
count  200000.000000  200000.000000  200000.000000  200000.000000   
mean    99999.500000       0.972710       1.168365       2.219325   
std     57735.171256       3.941836       3.993407       6.476570   
min         0.000000       0.000000       0.000000       0.000000   
25%     49999.750000       0.000000       0.000000       0.000000   
50%     99999.500000       0.000000       0.000000       0.000000   
75%    149999.250000       1.000000       1.000000       1.000000   
max    199999.000000      61.000000      51.000000      64.000000   

           feature_3      feature_4      feature_5      feature_6  \
count  200000.000000  200000.000000  200000.000000  200000.000000   
mean        2.296735       0.793530       1.431105       1.010695   
std         7.551858       2.935785       5.162746       3.949231   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.000000       0.0000

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,target
0,0,0,0,6,1,0,0,0,0,7,...,0,0,0,0,0,0,2,0,0,Class_6
1,1,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,1,0,Class_6
2,2,0,0,0,0,0,1,0,3,0,...,0,0,0,0,1,0,0,0,0,Class_2
3,3,0,0,7,0,1,5,2,2,0,...,0,4,0,2,2,0,4,3,0,Class_8
4,4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Class_2
5,5,0,15,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,Class_8
6,6,0,1,2,1,0,2,0,0,0,...,1,2,0,2,0,1,0,0,0,Class_6
7,7,2,3,5,0,0,1,1,2,10,...,8,0,0,0,0,2,3,60,0,Class_3
8,8,1,0,0,35,6,2,2,0,3,...,0,37,0,5,4,1,0,0,0,Class_2
9,9,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,10,0,Class_8


Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74
0,200000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,200001,1,2,0,0,0,0,0,0,0,...,3,1,3,0,0,0,0,3,0,0
2,200002,0,1,7,1,0,0,0,0,6,...,3,0,0,0,0,3,0,2,0,0
3,200003,0,0,0,4,3,1,0,0,0,...,0,0,0,1,0,0,0,4,0,0
4,200004,0,0,5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,200005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
6,200006,0,0,0,0,0,0,0,0,0,...,0,0,0,0,4,2,2,1,0,0
7,200007,2,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8,200008,0,0,1,2,0,0,0,0,0,...,0,3,0,0,5,0,0,1,3,0
9,200009,0,1,0,3,0,1,1,18,2,...,2,1,62,1,0,2,1,0,22,0


### 1. Data Cleaning

In [None]:
# Ignoring this - assuming data is clean.

In [101]:
print(list(set(raw_train_df['target'].values)))

['Class_5', 'Class_8', 'Class_2', 'Class_6', 'Class_7', 'Class_3', 'Class_4', 'Class_1', 'Class_9']


### 2. Feature Generation

In [None]:
# Writing a super basic standardization function to preprocess tf.Datasets
def preprocess_standardization()

##### 2.1. Approach 1

In [116]:
# Dropping id
train_df = raw_train_df.drop(columns = ['id'])
test_df = raw_test_df.drop(columns = ['id'])

# Updating Target column values
target_list = list(set(train_df['target'].values))
target_dict = {index: value for index, value in enumerate(target_list)}
# train_df = train_df.replace({'target': target_dict}, inplace = True)
train_df.target = pd.Categorical(train_df.target)
train_df['target'] = train_df.target.cat.codes

# Breaking Training into train and validation sets
validation_df = train_df.sample(frac=0.2, random_state=1337)
train_df = train_df.drop(validation_df.index)

In [164]:
# Converting dataframe to tf.data.Dataset format using tf.data.Dataset.from_tensor_slices
def create_tf_datasets(df, test = False, n_repeat = 1, shuffle_buffer_size = 500, 
                       n_parse_threads = 2, batch_size = 32):
#     if test == False:
#         label = df.pop("target")
#         dataset = tf.data.Dataset.from_tensor_slices((dict(df), label))
#     else:
#         dataset = tf.data.Dataset.from_tensor_slices((dict(df)))
    
    if test == False:
        label = df.pop("target")
        data = tf.constant(np.array(df), dtype = tf.float32)
        label = tf.constant(np.array(label), 'float32')
        dataset = tf.data.Dataset.from_tensors((data, label))
    else:
        dataset = tf.data.Dataset.from_tensors((dict(df)))
    #dataset = tf.data.Dataset.from_tensor_slices((dict(df)))
    X_mean = df.mean().values
    X_std = df.std().values
    n_inputs = len(X_mean)
    def preprocess_standardization(line, label):
#         defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
#         fields = tf.io.decode_csv(line, record_defaults=defs)
#         x = tf.stack(fields)
        x = tf.stack(line)
        y = label
        return (x - X_mean) / X_std, y
    dataset = dataset.map(preprocess_standardization, num_parallel_calls = n_parse_threads)
    dataset = dataset.shuffle(shuffle_buffer_size).repeat(n_repeat)
    return dataset.batch(batch_size).prefetch(1)
#     return dataset.batch(batch_size)
    

In [165]:
# Get the train_df, validation_df, and test_df
train_df_1 = train_df.copy()
validation_df_1 = validation_df.copy()
test_df_1 = test_df.copy()
train_dataset = create_tf_datasets(train_df_1)
validation_dataset = create_tf_datasets(validation_df_1)
test_dataset = create_tf_datasets(test_df_1, test = True)

TypeError: in user code:


    TypeError: tf__preprocess_standardization() missing 1 required positional argument: 'label'


In [147]:
dict(train_df_1)

{'feature_0': 0         0
 1         0
 2         0
 3         0
 5         0
          ..
 199994    0
 199996    0
 199997    1
 199998    0
 199999    5
 Name: feature_0, Length: 160000, dtype: int64,
 'feature_1': 0          0
 1          0
 2          0
 3          0
 5         15
           ..
 199994     2
 199996     2
 199997     2
 199998     0
 199999     4
 Name: feature_1, Length: 160000, dtype: int64,
 'feature_2': 0         6
 1         0
 2         0
 3         7
 5         0
          ..
 199994    1
 199996    0
 199997    0
 199998    2
 199999    0
 Name: feature_2, Length: 160000, dtype: int64,
 'feature_3': 0          1
 1          0
 2          0
 3          0
 5          0
           ..
 199994     1
 199996     0
 199997     0
 199998     0
 199999    10
 Name: feature_3, Length: 160000, dtype: int64,
 'feature_4': 0         0
 1         0
 2         0
 3         1
 5         0
          ..
 199994    2
 199996    1
 199997    0
 199998    2
 199999    0
 Name:

##### 2.2. Approach 2

In [175]:
# Using the tf.data.Dataset.list_files(filenames) & tf.data.Dataset.interleave(lambda, skip = 1, num_parallel_calls)
def create_tf_datasets_directly(filepaths, test = False, n_repeat = 1, suffle_buffer_size = 500, 
                       n_parse_threads = 2, batch_size = 32, n_readers = 2, n_read_threads = 2):
    dataset = tf.data.Dataset.list_files(filepaths)
    dataset = dataset.interleave(lambda filepath: tf.data.TextLineDataset(filepath).skip(1),cycle_length=n_readers, num_parallel_calls=n_read_threads)
    df = pd.DataFrame(dataset)
    X_mean = df.mean().values
    X_std = df.std().values
    n_inputs = len(X_mean)
    def preprocess_standardization(line):
        defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
        fields = tf.io.decode_csv(line, record_defaults=defs)
        x = tf.stack(fields[:-1])
        y = tf.stack(fields[-1:])
        return (x - X_mean) / X_std, y
    dataset = dataset.map(preprocess_standardization, num_parallel_calls = n_parse_threads)
    dataset = dataset.shuffle(shuffle_buffer_size).repeat(n_repeat)
    return dataset.batch(batch_size).prefetch(1)
    

In [176]:
train_dataset = create_tf_datasets_directly(filepaths = ["../data/raw/train.csv"])

TypeError: Argument 'objects' has incorrect type (expected numpy.ndarray, got tensorflow.python.framework.ops.EagerTensor)

In [141]:
tf.data.experimental.cardinality(train_dataset)

<tf.Tensor: shape=(), dtype=int64, numpy=1>

##### 2.3. Approach 3

In [118]:
train_df_2 = train_df.copy()
validation_df_2 = validation_df.copy()
test_df_2 = test_df.copy()

train_labels = train_df_2.pop("target")


In [119]:
train_labels = np.array(train_labels)

In [120]:
train_df_2 = train_df_2.loc[1:, :]


In [121]:
train_labels = train_labels[1:]

In [122]:
train_df_3 = np.array(train_df_2)
print(train_df_3)
print(train_df_3.shape)

[[0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 [0 0 7 ... 4 3 0]
 ...
 [1 2 0 ... 1 0 0]
 [0 0 2 ... 0 1 0]
 [5 4 0 ... 2 3 1]]
(159999, 75)


In [123]:
train_labels

array([5, 1, 7, ..., 7, 6, 7], dtype=int8)

### 3. Model Creation

In [50]:
input_dim = 76

In [25]:
model = Sequential()
model.add(Dense(12, input_dim=(1, 76), activation='relu'))
model.add(Dense(8, input_dim = input_dim, activation='relu'))
model.add(Dense(1, input_dim = input_dim, activation='softmax'))
# model.add(Dense(12, activation='relu'))
# model.add(Dense(8,  activation='relu'))
# model.add(Dense(1, activation='softmax'))

TypeError: Dimension value must be integer or None or have an __index__ method, got value '(1, 76)' with type '<class 'tuple'>'

In [144]:
model = keras.Sequential([
    layers.Dense(20, activation='relu', input_shape=(None, 75)),
    layers.Dense(5, activation='relu', input_shape=(None, 75)),
    layers.Dense(1, activation = 'softmax', input_shape=(None, 75))
  ])

In [145]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [146]:
# model.fit(train_dataset, epochs=10, batch_size = 32, validation_data=validation_dataset)
model.fit(train_dataset, epochs=10, batch_size = 32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7feabc177ca0>

In [124]:
model = keras.Sequential([
    tf.keras.layers.Flatten(),
    layers.Dense(20, activation='relu'),
    layers.Dense(5, activation='relu'),
    layers.Dense(1, activation = 'softmax')
  ])

In [125]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [126]:
# model.fit(train_dataset, epochs=10, batch_size = 32, validation_data=validation_dataset)
model.fit(train_df_3, train_labels, epochs=10)

Epoch 1/10
Epoch 2/10

_NotOkStatusException: InvalidArgumentError: Error while reading CompositeTensor._type_spec.

### 4. Model Inference

In [None]:
# Run basic test 
model.evaluate(test_df)

In [None]:
# Run inference flow


In [62]:
train, test = tf.keras.datasets.fashion_mnist.load_data()

images, labels = train
images = images/255.0
labels = labels.astype(np.int32)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [63]:
fmnist_train_ds = tf.data.Dataset.from_tensor_slices((images, labels))
fmnist_train_ds = fmnist_train_ds.shuffle(5000).batch(32)

model = tf.keras.Sequential([
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(10)
])

model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=['accuracy'])

2022-02-02 19:18:05.411263: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 376320000 exceeds 10% of free system memory.


In [64]:
model.fit(fmnist_train_ds, epochs=2)

2022-02-02 19:18:14.181177: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 376320000 exceeds 10% of free system memory.
2022-02-02 19:18:14.293119: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 376320000 exceeds 10% of free system memory.


Epoch 1/2
  32/1875 [..............................] - ETA: 3s - loss: 1.7457 - accuracy: 0.3838  

2022-02-02 19:18:14.538786: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/2
  40/1875 [..............................] - ETA: 2s - loss: 0.4807 - accuracy: 0.8391 

2022-02-02 19:18:17.347465: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 376320000 exceeds 10% of free system memory.




<keras.callbacks.History at 0x7feabc0b73a0>