In [1]:
import tensorflow as tf
import numpy as np
# Single source of truth for every random seed used in this notebook
SEED = 42
# Seed Python's `random`, NumPy and TensorFlow in one call
tf.keras.utils.set_random_seed(SEED)
# Explicit per-library seeding, kept so each seeded library is visible to the reader
tf.random.set_seed(SEED)
np.random.seed(SEED)
# Finally, ask TensorFlow to run its ops deterministically
tf.config.experimental.enable_op_determinism()

In [2]:
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras import layers
from tensorflow.keras import optimizers
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from sklearn import set_config
# Show up to 250 characters per column so whole sentences stay visible in previews
pd.set_option('display.max_colwidth', 250)
# Ask scikit-learn transformers to return pandas output
set_config(transform_output='pandas')

In [3]:
# Location of the labelled sentences (adjust to your own file path)
csv_path = "Data/programming-or-data-science.csv"
df_demo = pd.read_csv(csv_path)
# NOTE(review): the preview shows an 'Unnamed: 0' column - presumably an index
# that was saved into the CSV; consider read_csv(..., index_col=0) - confirm.
# Preview the first five rows
df_demo.head(5)

Unnamed: 0,text,label
0,Python's simplicity makes it a go-to language for rapid development.,programming/python
1,"Data Science is like a treasure hunt, and Python is my map!",both
2,Python frameworks like Django make web development a piece of cake.,programming/python
3,Python's ecosystem is rich with libraries for both Data Science and Web Development.,both
4,"With NLP, sentiment analysis is easier and more accurate.",data science


In [4]:
# Features are the raw sentences; targets start out as string category labels
X = df_demo['text']
y_string = df_demo['label']
# Encode each string label as an integer class id
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(y_string)
y = pd.Series(encoded_labels)
# Number of samples per encoded class
y.value_counts()

1    31
2    13
0     6
dtype: int64

In [5]:
# Pair every sentence with its encoded label inside a tf.data Dataset
ds = tf.data.Dataset.from_tensor_slices(
    (X, y)
)
# Display the dataset's element spec
ds

<_TensorSliceDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int32, name=None))>

In [6]:
# Shuffle the full dataset a single time, using a buffer as large as the dataset.
# reshuffle_each_iteration=False is passed so the order is not reshuffled on
# later iterations.
ds = ds.shuffle(
    reshuffle_each_iteration=False,
    buffer_size=len(ds),
)

In [7]:
# Determine how many samples go into each split
n_total = len(ds)
# Calculate the number of samples for training
split_train = 0.7
n_train_samples = int(n_total * split_train)
print(f"Use {n_train_samples} samples as training data")
# Calculate the number of samples for validation
split_val = 0.2
n_val_samples = int(n_total * split_val)
print(f"Use {n_val_samples} samples as validation data")
# Test size is the remainder; the count is derived from the integer sample
# counts (not from split_test) so the three splits always add up to n_total
split_test = 1 - (split_train + split_val)
n_test_samples = n_total - (n_train_samples + n_val_samples)
print(f"The remaining {n_test_samples} samples will be used as test data.")

Use 35 samples as training data
Use 10 samples as validation data
The remaining 5 samples will be used as test data.


In [8]:
# Training split: the first n_train_samples elements
train_ds = ds.take(n_train_samples)
# Validation split: skip past the training samples, then take n_val_samples
val_ds = ds.skip(n_train_samples).take(n_val_samples)
# Test split: everything that follows the training + validation samples
test_ds = ds.skip(n_train_samples + n_val_samples)

In [9]:
# Only the training split gets shuffled; the buffer spans the entire split
train_ds = train_ds.shuffle(
    buffer_size=len(train_ds),
)

In [10]:
# One batch size shared by all three splits
BATCH_SIZE = 1
# Batch the training, validation and test splits in a single pass
train_ds, val_ds, test_ds = (
    split.batch(BATCH_SIZE) for split in (train_ds, val_ds, test_ds)
)
# Report how many batches each split now contains
print (f' There are {len(train_ds)} training batches.')
print (f' There are {len(val_ds)} validation batches.')
print (f' There are {len(test_ds)} testing batches.')

 There are 35 training batches.
 There are 10 validation batches.
 There are 5 testing batches.


In [11]:
# Pull a single batch from the training data and unpack it into text and label
first_batch = train_ds.take(1)
example_X, example_y = first_batch.get_single_element()
print(example_X)
print(example_y)

tf.Tensor([b'I love programming, I would give it an A+!'], shape=(1,), dtype=string)
tf.Tensor([2], shape=(1,), dtype=int32)


In [12]:
# Keep only the text from train_ds (labels are dropped)
ds_texts = train_ds.map(lambda text, label: text)
# Preview one batch of text
ds_texts.take(1).get_single_element()

<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b'The power of Data Science lies in turning raw data into meaningful insights.'],
      dtype=object)>

In [13]:
# Count vectorizer: lowercases, strips punctuation and outputs per-token counts
count_vectorizer = TextVectorization(
    output_mode="count",
    standardize="lower_and_strip_punctuation",
)

In [14]:
# Before adapt() is called, the vocabulary holds only the out-of-vocabulary token ([UNK])
count_vectorizer.get_vocabulary()

['[UNK]']

In [15]:
# Fit the layer on the training texts only (ds_texts is derived from train_ds)
count_vectorizer.adapt(ds_texts)

In [16]:
# Get the learned vocabulary from the adapted layer
vocab = count_vectorizer.get_vocabulary()
# Inspect the vocabulary: its type, its size, and its first six entries
type(vocab), len(vocab), vocab[:6]

(list, 180, ['[UNK]', 'is', 'data', 'the', 'science', 'python'])

In [17]:
# The first value is the count of all words that are not in the vocabulary ([UNK])
sample_text = 'python python python python is the most amazing thing in the world for data science!'
counts = count_vectorizer([sample_text])
counts

<tf.Tensor: shape=(1, 180), dtype=float32, numpy=
array([[3., 1., 1., 2., 1., 4., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0.]], dtype=float32)>

In [18]:
# TF-IDF vectorizer: lowercases, strips punctuation and outputs tf-idf scores
tfidf_vectorizer = TextVectorization(
    output_mode="tf_idf",
    standardize="lower_and_strip_punctuation",
)
# Build the vectorizer vocabulary from the training texts
tfidf_vectorizer.adapt(ds_texts)
# Confirm the vocabulary size
tfidf_vectorizer.vocabulary_size()

180

In [19]:
# The first value is the score of all words that are not in the vocabulary ([UNK])
sample_text = 'python python python python is the most amazing thing in the world for data science!'
tfidf = tfidf_vectorizer([sample_text])
tfidf

<tf.Tensor: shape=(1, 180), dtype=float32, numpy=
array([[8.083353  , 0.95200884, 1.0799202 , 2.3184738 , 1.2039728 ,
        5.2250066 , 1.3062516 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 1.5869651 , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        

In [20]:
# Fixed length of every output sequence of token ids
SEQUENCE_LENGTH = 30
# Sequence vectorizer: lowercases, strips punctuation and outputs integer token ids
sequence_vectorizer = tf.keras.layers.TextVectorization(
    standardize="lower_and_strip_punctuation",
    output_mode="int",
    output_sequence_length=SEQUENCE_LENGTH,
)
# Build the vocabulary from the training texts
sequence_vectorizer.adapt(ds_texts)
# Vocabulary size; in "int" mode the vocabulary also contains the empty
# padding token '' at index 0, ahead of [UNK]
sequence_vectorizer.vocabulary_size()

181

In [21]:
# Convert the sample text into its sequence of token ids with the sequence_vectorizer
sample_text = 'python python python python is the most amazing thing in the world for data science!'
sequence = sequence_vectorizer([sample_text])
sequence

<tf.Tensor: shape=(1, 30), dtype=int64, numpy=
array([[  6,   6,   6,   6,   2,   4,   1, 178,   1,   7,   4,   1,  13,
          3,   5,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0]], dtype=int64)>

In [22]:
# Vocabulary of the sequence vectorizer, ordered by token id
# (this rebinds `vocab`, which previously held the count vectorizer's vocabulary)
vocab = sequence_vectorizer.get_vocabulary()
# Lookup table from integer token id back to its word
int_to_str = dict(enumerate(vocab))
int_to_str

{0: '',
 1: '[UNK]',
 2: 'is',
 3: 'data',
 4: 'the',
 5: 'science',
 6: 'python',
 7: 'in',
 8: 'of',
 9: 'a',
 10: 'to',
 11: 'language',
 12: 'and',
 13: 'for',
 14: 'like',
 15: 'programming',
 16: 'natural',
 17: 'development',
 18: 'processing',
 19: 'nlp',
 20: 'web',
 21: 'it',
 22: 'field',
 23: 'can',
 24: 'analysis',
 25: 'with',
 26: 'used',
 27: 'my',
 28: 'making',
 29: 'libraries',
 30: 'its',
 31: 'from',
 32: 'also',
 33: 'two',
 34: 'text',
 35: 'pythons',
 36: 'pandas',
 37: 'more',
 38: 'make',
 39: 'machines',
 40: 'love',
 41: 'key',
 42: 'i',
 43: 'favorite',
 44: 'both',
 45: 'be',
 46: 'are',
 47: 'an',
 48: 'would',
 49: 'widely',
 50: 'when',
 51: 'what',
 52: 'we',
 53: 'way',
 54: 'waves',
 55: 'voiceactivated',
 56: 'voice',
 57: 'visualization',
 58: 'versatility',
 59: 'versatile',
 60: 'vehicles',
 61: 'use',
 62: 'unlocks',
 63: 'turning',
 64: 'treasure',
 65: 'transforming',
 66: 'things',
 67: 'that',
 68: 'thanks',
 69: 'telescopes',
 70: 'techniqu

In [23]:
# Look up the term stored at token id 94 in the id-to-word table
int_to_str[94]

'pod'