# TF Data Input Pipeline

In [36]:
import tensorflow as tf

# Creating tf dataset from a list

In [37]:
daily_sales_numbers = [21, 22,-108, 31, -1, 32, 34, 31]   #daily sales cant be negative so, these are all the data errors

#### we want to build a tf dataset from the above 

In [38]:
tf_dataset = tf.data.Dataset.from_tensor_slices(daily_sales_numbers)
tf_dataset

<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

### Iterating through elements as numpy elements

In [39]:
for sales in tf_dataset.as_numpy_iterator():
    print(sales)
    
    ##next method that gives same result
# for sales in tf_dataset:
#     print(sales)    
    
## here, individual element is tenser So, to convert 

21
22
-108
31
-1
32
34
31


### in above output, individual element is tenser So, if we wanted to convert in Numpy object  then using a Numpy Function

In [40]:
for sales in tf_dataset:
    print(sales.numpy()) 

21
22
-108
31
-1
32
34
31


## Filtering sales numbers that are less than 0 
##### sales number cant be - so filtering those data points .. By using filter function

In [41]:
tf_dataset = tf_dataset.filter(lambda x: x>0)
for sales in tf_dataset.as_numpy_iterator():
    print(sales) 

21
22
31
32
34
31


### the above code is giving the warning  as :  Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. 
### so it may not be useful in the future so the below mentioned code is the updated version of the above code

In [42]:
# Define a lambda function to filter the dataset
def filter_fn(x):
    return x > 0

# Filter the dataset using the filter() method
tf_dataset = tf_dataset.filter(filter_fn)

# Iterate over the filtered dataset
for sales in tf_dataset.as_numpy_iterator():
    print(sales)

21
22
31
32
34
31


## suppose the above sales are in dollar so we wanna convert it in nepali currenncy
### we use map function => it will take each individual element & it will apply that particular function

In [43]:
tf_dataset = tf_dataset.map(lambda x: x*120)  #x is the individual element so we convert here for all the elements
for sales in tf_dataset:
    print(sales.numpy())


2520
2640
3720
3840
4080
3720


## Shuffling the datas
### we may want to randomly shuffle the datas especially when we are doing image data analysis 

In [44]:
tf_dataset = tf_dataset.shuffle(2) #buffer of size 2
for sales in tf_dataset:
    print(sales.numpy())

2520
2640
3840
4080
3720
3720


<h1>Batching </h1>

<h3>Lots of images are stored on harddisk so we cannot processs all those images at once so we need to process those images in the form of batch 
</h3>

<h4>df.batch(n)   ,  n => no of batches you want to create</h4>



In [45]:
for sales in tf_dataset.batch(2):
    print(sales.numpy())

[2520 2640]
[3840 4080]
[3720 3720]


<h1 style="color:red;"> Forming all of the above code in a single line </h1>

In [46]:
tf.data.Dataset.from_tensor_slices(daily_sales_numbers)
tf_dataset = tf_dataset.filter(lambda x: x>0).map(lambda y: y *120).shuffle(2).batch(2)
for sales in tf_dataset:
    print(sales.numpy())

[446400 460800]
[316800 489600]
[302400 446400]


# reading cat and dog images from the folder

In [47]:
images_ds = tf.data.Dataset.list_files('E:\python/images/*/*', shuffle = False)
for file in images_ds.take(5):
    print(file.numpy())

b'E:\\python\\images\\cat\\image1.jpeg'
b'E:\\python\\images\\cat\\image11.jpg'
b'E:\\python\\images\\cat\\image12.jpg'
b'E:\\python\\images\\cat\\image22.jpg'
b'E:\\python\\images\\cat\\image23.jpg'


In [48]:
# by shuffling the above code
images_ds = images_ds.shuffle(200)
for file in images_ds.take(3):
    print(file.numpy())

b'E:\\python\\images\\dog\\dog77.jpg'
b'E:\\python\\images\\cat\\image11.jpg'
b'E:\\python\\images\\cat\\images9.jpeg'


In [49]:
class_names = ["cat", "dog"] # class names hami sanga cat rw dog xa

In [50]:
image_count = len(images_ds)
image_count

24

In [51]:
train_size = int(image_count*0.8) # lets say trainning size is 0.8, 80% of my sample are of training size 

train_ds = images_ds.take(train_size) ## take function will take first 80% of images as datasets
test_ds = images_ds.skip(train_size)  # skip is opposite of take, skip will skip the first 80% of th images of dataset


In [52]:
len(train_ds)

19

In [53]:
len(test_ds)  ##So total ma 40 ota thyo, take le 32  ota images ligyo and test le remaining 

5

# from images path we need to retrive the lable  (path stringma xa we retrive label)

In [54]:
# for example
s = 'E:\\python\\images\\dog\\images.jfif'
s.split("\\")[-2]

'dog'

In [55]:
def get_label(file_path):
    import os
    return tf.strings.split(file_path, os.path.sep)[-2]


In [56]:
## first lets see what we have in the train_ds :

for t in train_ds.take(5): #take(5) is just like a .head() in the pandas , it takes only 5 items present in the datasets
    print(t.numpy())

b'E:\\python\\images\\cat\\images5.jfif'
b'E:\\python\\images\\dog\\dog77.jpg'
b'E:\\python\\images\\dog\\dog55.jpg'
b'E:\\python\\images\\cat\\images6.jfif'
b'E:\\python\\images\\cat\\images7.jfif'


In [57]:
 #map function will apply get_label function to all the elements in train_ds
for label in train_ds.map(get_label):
    print(label)

tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)


# From the above output we only got cat and dog (i.e our y part)

## X train in the below image is our actual image data .. and y train is our cat and dog(Y)

<img src="image.png" style="height:50%, width:50%">

## Now, We are gonna get the x part i.e our actual image data 

In [58]:
def process_image(file_path):
    label = get_label(file_path) 
    
    img = tf.io.read_file(file_path)  # in tensorflow , this will read the file path
    #our file is a jfif image, we need to decode the image
    img= tf.image.decode_jpeg(img)
    #our images are of different dimension we should resize 
    img = tf.image.resize(img, [128,128])
    
    return img, label

In [67]:
train_ds = train_ds.map(process_image)
for image, label in train_ds.take(3):
    print("Image :", image)
    print("label :", label)

Image : tf.Tensor(
[[[167.91156   142.91156   135.91156  ]
  [172.        147.        140.       ]
  [174.64453   151.64453   143.64453  ]
  ...
  [ 13.805481   11.899231   17.805481 ]
  [ 14.328125   13.328125   18.328125 ]
  [ 18.389221   16.389221   21.389221 ]]

 [[169.51141   144.51141   137.51141  ]
  [173.3858    148.3858    141.3858   ]
  [176.45312   153.45312   145.45312  ]
  ...
  [ 15.338318   13.432068   19.338318 ]
  [ 14.328125   13.328125   18.328125 ]
  [ 20.109375   18.109375   23.109375 ]]

 [[172.76642   147.76642   140.76642  ]
  [176.55707   151.55707   144.55707  ]
  [178.36719   155.36719   147.36719  ]
  ...
  [ 19.62854    15.897949   23.972473 ]
  [ 17.82727    15.870239   20.87024  ]
  [ 20.152344   18.152344   23.152344 ]]

 ...

 [[179.        150.        142.       ]
  [179.        150.        142.       ]
  [179.        150.        142.       ]
  ...
  [ 36.372314   30.614685   32.684937 ]
  [ 73.33594    51.881287   53.50934  ]
  [126.282166   83.84076 

## We got the numpy array in above output so we need to scale it 

 we need to convert those numbers in the range of 0,1 so thats why we are scaling

In [71]:
def scale(image, label):
    return image/255, label   #RGB values are between 0 to 255 so we scaled in 255

In [72]:
train_ds = train_ds.map(scale)
for image, label in train_ds.take(3):
    print("Image :", image.numpy()[0][0]) # we didnt printed the entrire image, we just printed first few elements
    print("label :", label.numpy())

Image : [1.5378702e-05 1.5324988e-05 1.5083753e-05]
label : b'dog'
Image : [8.3225914e-06 1.3509133e-05 1.5016849e-05]
label : b'dog'
Image : [1.5378702e-05 1.5378702e-05 1.5378702e-05]
label : b'dog'


# Questions

Movie reviews are present as individual text file (one file per review) in review folder.


Folder structure looks like this,

                        reviews
                            |__ positive
                                |__pos_1.txt
                                |__pos_2.txt
                                |__pos_3.txt
                            |__ negative
                                |__neg_1.txt
                                |__neg_2.txt
                                |__neg_3.txt
                                
                                
You need to read these reviews using tf.data.Dataset and perform following transformations,

    1. Read text review and generate a label from folder name.Your dataset should have review text and label as a tuple
    2. Filter blank text review. Two files are blank in this dataset
    3. Do all of the above transformations in single line of code. Also shuffle all the review

In [79]:
import tensorflow as tf

## Now we gonna retrive those file paths in a tf dataset

In [86]:
df = tf.data.Dataset.list_files('Exercise/reviews/*/*', shuffle=False)

In [87]:
for items in df:
    print(items.numpy())

b'Exercise\\reviews\\negative\\neg_1.txt'
b'Exercise\\reviews\\negative\\neg_2.txt'
b'Exercise\\reviews\\negative\\neg_3.txt'
b'Exercise\\reviews\\positive\\pos_1.txt'
b'Exercise\\reviews\\positive\\pos_2.txt'
b'Exercise\\reviews\\positive\\pos_3.txt'


### 1. Read text review and generate a label from folder name.Your dataset should have review text and label as a tuple

In [97]:
import os 
def review_label(file_path):
    return tf.io.read_file(file_path), tf.strings.split(file_path, os.path.sep)[-2]   
#tf.io.read_file(file_path) = reads the contents of the file at the given file_path location 
                              # and returns the content as a string tensor. 
# tf.strings.split(file_path,os.path.sep)[-2] => splits the file_path string using the path separator (os.path.sep) and 
                                           # returns the second-to-last element of the resulting split string as a tensor.

In [98]:
mapped_review = df.map(review_label)
#since the question is asking that we should have review text and label so 
for review, label in mapped_review:
    print("REVIEW : " , review.numpy()[:50])
    print("LABEL : ", label.numpy())

REVIEW :  b"Basically there's a family where a little boy (Jak"
LABEL :  b'negative'
REVIEW :  b'This show was an amazing, fresh & innovative idea '
LABEL :  b'negative'
REVIEW :  b''
LABEL :  b'negative'
REVIEW :  b'One of the other reviewers has mentioned that afte'
LABEL :  b'positive'
REVIEW :  b'A wonderful little production. <br /><br />The fil'
LABEL :  b'positive'
REVIEW :  b''
LABEL :  b'positive'


### 2. Filter blank text review. Two files are blank in this dataset

In [99]:
# Define a lambda function to filter the review
def filter_df(label):
    return label
filtered_review = mapped_review.filter(lambda review, label : review!= "") #to remove any reviews with empty text 
for review, label in filtered_review:
    print("REVIEW : " , review.numpy()[:50])
    print("LABEL : ", label.numpy())

REVIEW :  b"Basically there's a family where a little boy (Jak"
LABEL :  b'negative'
REVIEW :  b'This show was an amazing, fresh & innovative idea '
LABEL :  b'negative'
REVIEW :  b'One of the other reviewers has mentioned that afte'
LABEL :  b'positive'
REVIEW :  b'A wonderful little production. <br /><br />The fil'
LABEL :  b'positive'


### 3. Do all of the above transformations in single line of code. Also shuffle all the review

<h3 style ="color: red"> Performing map, filter and shuffle all in single line of code</h3>

In [101]:
final_transformation_ds = df.map(review_label).filter(lambda review, label : review != "").shuffle(3)
for review, label in final_transformation_ds.as_numpy_iterator():
    print("Review:",review[:50])
    print("Label:",label)

Review: b"Basically there's a family where a little boy (Jak"
Label: b'negative'
Review: b'A wonderful little production. <br /><br />The fil'
Label: b'positive'
Review: b'One of the other reviewers has mentioned that afte'
Label: b'positive'
Review: b'This show was an amazing, fresh & innovative idea '
Label: b'negative'
