# Binary Classification: UCI Sentiment Labelled Sentences

In [1]:
# TensorFlow and tf.keras
import tensorflow as tf

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

# Record the TensorFlow version and the devices it can see, so the hardware
# behind a run is visible in the notebook output.
print(f"Tensor Flow Version: {tf.__version__}")
gpu_devices = tf.config.list_physical_devices('GPU')
gpu = len(gpu_devices) > 0
print("GPU is", "available." if gpu else "NOT AVAILABLE!!!")
print(tf.config.list_physical_devices())

Tensor Flow Version: 2.8.0
GPU is available.
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


## Import & Explore the Data (EDA)

In [2]:
colnames = ['sentence', 'positive']

# The separator is two characters (a space followed by a tab). pandas treats
# any separator longer than one character as a regular expression, and regex
# separators are only handled by the Python parsing engine, hence
# engine='python'.
# `lineterminator` is intentionally not passed: pandas documents it as valid
# only with the C parser.
raw = pd.read_csv(
    '../data/uci_sentiment_lablelled_sentences/imdb_labelled.txt',
    sep=" \t",
    names=colnames,
    header=None,  # the file has no header row; column names come from `names`
    engine='python',
)
raw

Unnamed: 0,sentence,positive
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
...,...,...
995,I just got bored watching Jessice Lange take h...,0
996,"Unfortunately, any virtue in this film's produ...",0
997,"In a word, it is embarrassing.",0
998,Exceptionally bad!,0


In [None]:
# Show the dtype pandas inferred for each column of `raw`.
raw.dtypes

In [None]:
# Build a separate frame carrying the character length of each sentence.
# `assign` returns a new DataFrame, so `raw` is left untouched (a plain
# `wrangler = raw` would only alias it, and adding the column would modify
# `raw` as well).
wrangler = raw.assign(length=raw.sentence.str.len())

# The 20 longest sentences, longest first.
wrangler_sorted = wrangler.sort_values(by=['length'], ascending=False).head(20)
wrangler_sorted

In [None]:
# Reading through the longest reviews, checking for HTML or other characters
# that need cleaning.
# Iterate the `sentence` column directly: this avoids positional integer
# lookups on a label-indexed row and works for any number of rows in
# `wrangler_sorted`.
for sentence in wrangler_sorted['sentence']:
    print(sentence)
    print()

## Load the Data

In [6]:
# Turn the sentence column into a TensorFlow tensor and display it.
features = tf.convert_to_tensor(raw["sentence"])
features

<tf.Tensor: shape=(1000,), dtype=string, numpy=
array([b'A very, very, very slow-moving, aimless movie about a distressed, drifting young man. ',
       b'Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out. ',
       b'Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent. ',
       b'Very little music or anything to speak of. ',
       b'The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head. ',
       b"The rest of the movie lacks art, charm, meaning... If it's about emptiness, it works I guess because it's empty. ",
       b'Wasted two hours. ',
       b'Saw the movie today and thought it was a good effort, good messages for kids. ',
       b'A bit predictable. ',
       b'Loved the casting of Jimmy Buffet as the science teacher. ',
       b'And those baby o

In [7]:
# Turn the label column into a TensorFlow tensor and display it.
target = tf.convert_to_tensor(raw["positive"])
target

<tf.Tensor: shape=(1000,), dtype=int64, numpy=
array([0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
    

In [None]:
# Pair every sentence (string) with its label (int32) as one dataset element.
training_dataset = tf.data.Dataset.from_tensor_slices(
    (
        tf.cast(raw.sentence.values, tf.string),
        tf.cast(raw.positive.values, tf.int32),
    )
)

# Preview only the first few pairs; printing every element floods the cell
# output without adding information.
for features_tensor, target_tensor in training_dataset.take(5):
    print(f'features. {features_tensor} target:{target_tensor}')

In [None]:
# Print the first component (the sentence tensor) of every dataset element.
for sentence_tensor, _label_tensor in training_dataset:
    print(sentence_tensor)

In [10]:
# Iterate the `features` tensor and print each scalar string tensor in turn.
for sentence_tensor in features:
    print(sentence_tensor)

tf.Tensor(b'A very, very, very slow-moving, aimless movie about a distressed, drifting young man. ', shape=(), dtype=string)
tf.Tensor(b'Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out. ', shape=(), dtype=string)
tf.Tensor(b'Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent. ', shape=(), dtype=string)
tf.Tensor(b'Very little music or anything to speak of. ', shape=(), dtype=string)
tf.Tensor(b'The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head. ', shape=(), dtype=string)
tf.Tensor(b"The rest of the movie lacks art, charm, meaning... If it's about emptiness, it works I guess because it's empty. ", shape=(), dtype=string)
tf.Tensor(b'Wasted two hours. ', shape=(), dtype=string)
tf.Tensor(b'Saw the movie today and thought it was a good effort, good mess

## Prepare Data for Training (Text Vectorization)

In [None]:
# Upper bound on the vocabulary size handed to the vectorization layer.
VOCAB_SIZE = 10000

# Create the text vectorizer, then fit its vocabulary on the sentences only
# (the map drops the label from each dataset element).
# NOTE(review): the dataset is adapted unbatched; batching it first may be
# faster — confirm against the TextVectorization.adapt documentation.
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(training_dataset.map(lambda sentence, _label: sentence))

In [None]:
# Vocabulary learned by `encoder`, held as a NumPy array (rather than a list)
# so it supports array-style indexing; show the first 20 entries.
vocab = np.asarray(encoder.get_vocabulary())
vocab[:20]

In [None]:
# Pull a single batch of three examples. Batching matters here: the cells
# below index into `example` (e.g. `example[n]`), which needs a 1-D batch of
# strings rather than one scalar string per iteration.
for example, label in training_dataset.batch(3).take(1):
    print('text:', example.numpy())
    print('label:', label.numpy())

In [None]:
# Vectorize `example` as a batch and keep at most the first three rows.
# Reshaping to 1-D first means a single scalar string is treated as a batch of
# one, so `[:3]` always selects examples (rows) and never individual token ids.
encoded_example = encoder(tf.reshape(example, [-1]))[:3].numpy()
encoded_example

In [None]:
# Round-trip check: encode a few sentences and map the token ids back to
# vocabulary entries to see what the vectorizer keeps.
# The cell draws its own batch of (up to) three sentences, so it does not rely
# on the shape of variables left behind by earlier cells.
sample_text, _sample_labels = next(iter(training_dataset.batch(3)))
sample_ids = encoder(sample_text).numpy()

for original, token_ids in zip(sample_text.numpy(), sample_ids):
    print("Original: ", original)
    # `token_ids` is one row of ids; indexing `vocab` with it yields one
    # vocabulary entry per id, which are then joined with spaces.
    print("Round-trip: ", " ".join(vocab[token_ids]))
    print()

In [None]:
# FIXME: unfinished cell. The statement below is incomplete (no iterable and
# no loop body) and raises a SyntaxError, so it is commented out until it is
# completed, keeping "Restart Kernel -> Run All" working.
# for elem in

## Create the Model

## Train the Model

## Evaluate the Model

## Plots of Accuracy & Loss Over Time

## Export the Model

## Make Predictions