# Machine Learning

In [1]:
# Imports
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding, TextVectorization
from tensorflow.keras.models import Sequential

In [2]:
df = pd.read_json("final_data_for_machine_learning.json")

In [3]:
X = df["processed_text"]
y = df[df.columns[9:]].values

In [4]:
y

array([[1],
       [2],
       [1],
       ...,
       [1],
       [2],
       [2]])

In [5]:
vectorizer = TextVectorization(output_mode="int")

2023-08-17 23:25:42.970405: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2023-08-17 23:25:42.970424: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2023-08-17 23:25:42.970428: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2023-08-17 23:25:42.970457: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-08-17 23:25:42.970471: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [6]:
vectorizer.adapt(X.values)

2023-08-17 23:25:43.053817: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


In [7]:
vectorizer("Hello World")

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([1115,  129])>

In [8]:
vectorized_text = vectorizer(X.values)

In [9]:
vectorized_text

<tf.Tensor: shape=(189489, 472), dtype=int64, numpy=
array([[ 452,  547,    4, ...,    0,    0,    0],
       [2702,   82, 4782, ...,    0,    0,    0],
       [  46,  396,   31, ...,    0,    0,    0],
       ...,
       [   9,   16,  575, ...,    0,    0,    0],
       [ 348,   20,  253, ...,    0,    0,    0],
       [   8,    4,  575, ...,    0,    0,    0]])>

In [10]:
len(X)

189489

In [11]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

In [12]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [13]:
model = Sequential()
# Create the embedding layer 
model.add(Embedding(200001, 32))
# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))
# Feature extractor Fully connected layers
model.add(Dense(128, activation='tanh'))
model.add(Dense(256, activation='tanh'))
model.add(Dense(128, activation='tanh'))
model.add(Dense(128, activation='tanh'))
# Final layer 
model.add(Dense(1, activation='tanh'))

In [14]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam', metrics=['accuracy'])

In [15]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 128)               16512     
                                                        

In [16]:
history = model.fit(train, epochs=5, validation_data=val)

Epoch 1/5


2023-08-17 23:25:54.979648: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-08-17 23:25:55.306954: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-08-17 23:25:55.320193: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


   1/8290 [..............................] - ETA: 3:41:11 - loss: 6.1687 - accuracy: 0.1250

2023-08-17 23:25:55.556942: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-08-17 23:25:55.578592: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-08-17 23:34:38.996188: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-08-17 23:34:39.295586: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-08-17 23:34:39.306350: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
 263/8290 [..............................] - ETA: 8:21 - loss: -1.7721 - accuracy: 0.3923

KeyboardInterrupt: 