In [1]:
import numpy as np 
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras
from keras import layers
from keras.models import Sequential

from tensorflow.keras.layers import IntegerLookup

2023-04-21 22:00:09.708454: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'click_time': 'str',
    'attributed_time': 'str',
    'is_attributed': 'uint8'
}

raw_df = pd.read_csv('../dataSets/final_training_sample.csv', dtype=dtypes)
raw_df.head()



Unnamed: 0.1,Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,0,36183,12,1,3,245,2017-11-06 23:32:20,,0
1,1,77792,15,1,17,265,2017-11-06 16:22:50,,0
2,2,147957,9,1,19,232,2017-11-06 22:22:34,,0
3,3,28082,15,1,19,245,2017-11-06 21:00:04,,0
4,4,105128,12,1,13,245,2017-11-06 23:04:56,,0


### Feature Engineering

In [3]:
## Isolating day of week and hour of day to see if they're good indicators of downloads
d = pd.to_datetime(raw_df['click_time'])
day = d.dt.dayofweek
hour = d.dt.hour
raw_df['day_clicked'] = day
raw_df['hour_clicked'] = hour
print(raw_df.shape)
raw_df.head()

(942229, 11)


Unnamed: 0.1,Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day_clicked,hour_clicked
0,0,36183,12,1,3,245,2017-11-06 23:32:20,,0,0,23
1,1,77792,15,1,17,265,2017-11-06 16:22:50,,0,0,16
2,2,147957,9,1,19,232,2017-11-06 22:22:34,,0,0,22
3,3,28082,15,1,19,245,2017-11-06 21:00:04,,0,0,21
4,4,105128,12,1,13,245,2017-11-06 23:04:56,,0,0,23


### Splitting data

In [4]:
## Selecting features
features = raw_df.iloc[:, 2:6]
features = features.join(raw_df.iloc[:, 9:])
print(features.shape)
features.head()

(942229, 6)


Unnamed: 0,app,device,os,channel,day_clicked,hour_clicked
0,12,1,3,245,0,23
1,15,1,17,265,0,16
2,9,1,19,232,0,22
3,15,1,19,245,0,21
4,12,1,13,245,0,23


In [5]:
## Selecting labels
labels = raw_df.iloc[:, 8]
print(labels.shape)
labels.head()

(942229,)


0    0
1    0
2    0
3    0
4    0
Name: is_attributed, dtype: uint8

In [6]:
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.01, random_state=42)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(932806, 6) (932806,)
(9423, 6) (9423,)


## Creating NN Model

In [7]:
## Building base model without categorical embeddings
input_layer = keras.Input(shape=(6,), name="input")
core_layer = layers.Dense(units=320, activation="relu", name="Dense-1")(input_layer)
core_layer = layers.Dense(units=160, activation="relu", name="Dense-2")(core_layer)
core_layer = layers.Dense(units=80, activation="relu", name="Dense-3")(core_layer)
core_layer = layers.Dense(units=40, activation="relu", name="Dense-4")(core_layer)
output_layer = layers.Dense(1, activation="sigmoid")(core_layer)

model = keras.Model(inputs=input_layer, outputs=output_layer)


2023-04-21 22:00:16.160494: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 6)]               0         
                                                                 
 Dense-1 (Dense)             (None, 320)               2240      
                                                                 
 Dense-2 (Dense)             (None, 160)               51360     
                                                                 
 Dense-3 (Dense)             (None, 80)                12880     
                                                                 
 Dense-4 (Dense)             (None, 40)                3240      
                                                                 
 dense (Dense)               (None, 1)                 41        
                                                                 
Total params: 69,761
Trainable params: 69,761
Non-trainable p

In [9]:
## Training the model
callback = tf.keras.callbacks.EarlyStopping(monitor='acc', patience=2)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss="binary_crossentropy", metrics=['acc'])
history = model.fit(x_train, y_train, batch_size=254, epochs=10, validation_split=0.1, callbacks=callback)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
## Evaluating model

model.evaluate(x_test, y_test)




[0.25421372056007385, 0.9048073887825012]

### Embedded NN Model
Creating model with embedding layers for the categorial features. Guidance based on information found in following resources: 
- https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/structured_data/ipynb/structured_data_classification_from_scratch.ipynb#scrollTo=yC-2OqU5vo6D
- https://medium.com/@satnalikamayank12/on-learning-embeddings-for-categorical-data-using-keras-165ff2773fc9
- https://github.com/knathanieltucker/deep-learning-building-blocks/blob/master/notebooks/Categorical%20Data.ipynb

#### Creating TFdatasets

In [11]:
# Checking cardinality of features
features.nunique().reset_index(name='cardinality')

# for feature in features.columns:
#     print(len(features[feature].unique()))

Unnamed: 0,index,cardinality
0,app,337
1,device,1890
2,os,191
3,channel,181
4,day_clicked,4
5,hour_clicked,24


In [12]:
final_df = features.join(labels)
final_df

Unnamed: 0,app,device,os,channel,day_clicked,hour_clicked,is_attributed
0,12,1,3,245,0,23,0
1,15,1,17,265,0,16,0
2,9,1,19,232,0,22,0
3,15,1,19,245,0,21,0
4,12,1,13,245,0,23,0
...,...,...,...,...,...,...,...
942224,10,1,27,113,3,15,1
942225,5,1,47,113,3,15,1
942226,45,1,32,419,3,15,1
942227,10,1,11,113,3,15,1


In [13]:
## Creatting train/dev/test sets

val_df = final_df.sample(frac=0.01, random_state=42)
# print(val_df.shape)
train_df = final_df.drop(val_df.index)
# print(train_df.shape)
test_df = train_df.sample(frac=0.01, random_state=42)
# print(test_df.shape)
train_df = train_df.drop(test_df.index)
# print(train_df.shape)


In [14]:
## turning df into ds
def dataframe_to_dataset(df):
    dataframe = df.copy()
    labels = dataframe.pop('is_attributed')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    ds = ds.shuffle(buffer_size=len(dataframe))
    return ds


train_ds = dataframe_to_dataset(train_df)
val_ds = dataframe_to_dataset(val_df)
test_ds = dataframe_to_dataset(test_df)

In [15]:
for x, y in train_ds.take(1):
    print("Inputs: ", x)
    print("Targets: ", y)

Inputs:  {'app': <tf.Tensor: shape=(), dtype=uint16, numpy=19>, 'device': <tf.Tensor: shape=(), dtype=uint16, numpy=0>, 'os': <tf.Tensor: shape=(), dtype=uint16, numpy=748>, 'channel': <tf.Tensor: shape=(), dtype=uint16, numpy=347>, 'day_clicked': <tf.Tensor: shape=(), dtype=int64, numpy=2>, 'hour_clicked': <tf.Tensor: shape=(), dtype=int64, numpy=11>}
Targets:  tf.Tensor(0, shape=(), dtype=uint8)


In [16]:
## creating batches from ds
train_ds = train_ds.batch(32)
val_ds = val_ds.batch(32)
test_ds = test_ds.batch(32)


#### Encoding features and building model

In [17]:
# Function  to one-hot encode integer categorical features

def encode_categorical_feature(feature, name, dataset):
    lookup_class = IntegerLookup
    # Create a lookup layer which will turn strings into integer indices
    lookup = lookup_class(output_mode="binary")

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    # Learn the set of possible string values and assign them a fixed integer index
    lookup.adapt(feature_ds)

    # Turn the string input into integer indices
    encoded_feature = lookup(feature)
    return encoded_feature



In [18]:
# Creating input layers
app = keras.Input(shape=(1,), name="app", dtype="int64")
device = keras.Input(shape=(1,), name="device", dtype="int64")
os = keras.Input(shape=(1,), name="os", dtype="int64")
channel = keras.Input(shape=(1,), name="channel", dtype="int64")
day_clicked = keras.Input(shape=(1,), name="day_clicked", dtype="int64")
hour_clicked = keras.Input(shape=(1,), name="hour_clicked", dtype="int64")

all_inputs = [
    app,
    device,
    os,
    channel,
    day_clicked,
    hour_clicked

]

In [19]:
# Categorical features encoded as integers
app_encoded = encode_categorical_feature(app, 'app', train_ds)
device_encoded = encode_categorical_feature(device, 'device', train_ds)
os_encoded = encode_categorical_feature(os, 'os', train_ds)
channel_encoded = encode_categorical_feature(channel, 'channel', train_ds)
day_clicked_encoded = encode_categorical_feature(day_clicked, 'day_clicked', train_ds)
hour_clicked_encoded = encode_categorical_feature(hour_clicked, 'hour_clicked', train_ds)

all_features = layers.concatenate(
    [
        app_encoded,
        device_encoded,
        os_encoded,
        channel_encoded,
        day_clicked_encoded,
        hour_clicked_encoded

    ]
)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [20]:
x = layers.Dense(320, activation="relu")(all_features)
x = layers.Dropout(0.5)(x)
output = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(all_inputs, output)
model.compile("adam", "binary_crossentropy", metrics=["accuracy"])



In [21]:
callback = tf.keras.callbacks.EarlyStopping(monitor='acc', patience=2)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss="binary_crossentropy", metrics=['acc'])
history = model.fit(train_ds, epochs=5, validation_data=val_ds, callbacks=callback)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
model.evaluate(test_ds)



[0.2142351120710373, 0.9195969104766846]

## Hypertuning Parameters

In [23]:
# Adding layers and batch normalization
x2 = layers.Dropout(0.25)(all_features)
x2 = layers.Dense(320, activation='relu')(x2)

x2 = layers.BatchNormalization()(x2)
x2 = layers.Dropout(0.25)(x2)
x2 = layers.Dense(160, activation='relu')(x2)

x2 = layers.BatchNormalization()(x2)
x2 = layers.Dropout(0.25)(x2)
x2 = layers.Dense(80, activation='relu')(x2)

x2 = layers.BatchNormalization()(x2)
x2 = layers.Dropout(0.25)(x2)
x2 = layers.Dense(10, activation='relu')(x2)

x2 = layers.BatchNormalization()(x2)
x2 = layers.Dropout(0.25)(x2)
output = layers.Dense(1, activation="relu")(x2)

four_layer_model = keras.Model(all_inputs, output)
# four_layer_model.compile("adam", "binary_crossentropy", metrics=["accuracy"])


In [24]:
callback = tf.keras.callbacks.EarlyStopping(monitor='acc', patience=2)
four_layer_model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss="binary_crossentropy", metrics=['acc'])
history = four_layer_model.fit(train_ds, epochs=5, validation_data=val_ds, callbacks=callback)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:
four_layer_model.evaluate(test_ds)



[0.31269875168800354, 0.9134863018989563]