In [11]:
# from google.colab import drive

# drive.mount('/content/drive')

# main_path = './drive/MyDrive/NewsGeneration-NLP-Teknofest'
main_path = ".."

In [12]:
import sys

sys.path.append('../Preprocess-Data/')

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from category_reducer import category_reducer

from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.image import array_to_img

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import plot_model

from tensorflow import expand_dims

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GlobalAveragePooling2D

from tensorflow.keras.applications import MobileNetV2, mobilenet_v2

from tensorflow.keras.layers import Conv2D, Flatten, Reshape
from tensorflow.keras.layers import Embedding, LSTM, Dense, concatenate

from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping

import warnings
warnings.filterwarnings("ignore")

In [54]:
def load_data():
    '''
    # Load the data from the csv file and reduce the number of categories
    '''
    data_path = main_path + "/Data/news-data-with-imgs.csv"
    data = pd.read_csv(data_path)
    df = data.copy()

    # Reducing the number of categories
    df = category_reducer(df)

    print(df.info())

    df = df[['Content', 'Title', 'img_path']]
    return df

df = load_data()

# df = df.iloc[:50]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Title                100 non-null    object
 1   Content              100 non-null    object
 2   Content_url          100 non-null    object
 3   News_type            100 non-null    object
 4   Day_month_year_hour  100 non-null    object
 5   Img_url              100 non-null    object
 6   img_path             100 non-null    object
dtypes: object(7)
memory usage: 5.6+ KB
None


In [55]:
class PreprocessImg:
    '''
    # Class to preprocess the images and get the size information of the images.
    '''
    def __init__(self, images_path):
        self.images_path = images_path
        self.loaded_imgs = []
        self.img_arrays = []

    def load_img(self, target_size):
        for img in os.listdir(self.images_path):
            img = load_img(os.path.join(self.images_path, img), target_size=target_size)
            self.loaded_imgs.append(img)

        return self.loaded_imgs

    def img_to_array(self):
        '''
        # Load the images from the directory

        Returns:
            - loaded_imgs: List of loaded images
        '''
        for img in self.loaded_imgs:
            img = img_to_array(img)
            self.img_arrays.append(img)
        return np.array(self.img_arrays) / 255.0

    def get_size_info(self, loaded_imgs):
        '''
        # Get the size information of the images

        Args:
            - loaded_imgs: list of loaded images

        Returns:
            - weights_mean: Mean of the weights of the images
            - weights_std: Standard deviation of the weights of the images
            - heights_mean: Mean of the heights of the images
            - heights_std: Standard deviation of the heights of the images
        '''
        img_weights = []
        img_heights = []
        for size in loaded_imgs:
            img_weights.append(size.size[0])
            img_heights.append(size.size[1])

        weights_mean = np.mean(img_weights)
        weights_std = np.std(img_weights)

        heights_mean = np.mean(img_heights)
        heights_std = np.std(img_heights)

        return weights_mean, weights_std, heights_mean, heights_std

In [56]:
img_preprocessor = PreprocessImg(main_path + "/Data/imgs/")

loaded_imgs = img_preprocessor.load_img(target_size=(224, 224))
image_data = img_preprocessor.img_to_array()

widths_mean, widths_std, heights_mean, heights_std = img_preprocessor.get_size_info(loaded_imgs)

print("Mean width:", widths_mean)
print("Standard deviation of widths:", widths_std)
print("Mean height:", heights_mean)
print("Standard deviation of heights:", heights_std)
print("==")
print('Image shapes: ', image_data.shape)

Mean width: 224.0
Standard deviation of widths: 0.0
Mean height: 224.0
Standard deviation of heights: 0.0
==
Image shapes:  (100, 224, 224, 3)


In [57]:
class PreprocessText(Tokenizer):
    '''
    # Class to tokenize and pad the text data

    Args:
        - data: The text data list or series to be tokenized and padded
    '''
    def __init__(self, data):
        super().__init__(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')

        self.start_mark = '<start> '
        self.end_mark = ' <end>'

        data = data.apply(lambda x: self.start_mark + x + self.end_mark)

        self.fit_on_texts(data)
        self.tokens = self.texts_to_sequences(data)

        self.numbers_of_words = [len(token) for token in self.tokens]
        self.max_tokens = max(self.numbers_of_words)

        self.padded_tokens = pad_sequences(self.tokens, padding='post', truncating='post')

        self.data = data

    # re-padding the tokens
    def re_pad(self, max_tokens=None):
        '''
        # Re-pad the tokens to a new maximum length

        Args:
            - max_tokens: The new maximum lenght of the tokens
        '''

        self.padded_tokens = pad_sequences(self.tokens, maxlen=max_tokens, padding='post', truncating='post')
        return self.padded_tokens
    def get_info(self):
        '''
        # Get the information about the tokenized and padded data
        '''
        print("Max tokens: ", self.max_tokens)
        print("Mean tokens: ", int(np.mean(self.numbers_of_words)))
        print("Standard deviation of tokens: ", int(np.std(self.numbers_of_words)))
        print("Vocabulary Size: ", len(self.word_index) +1 )
        print('Shape of padded tokens: ', self.padded_tokens.shape)

In [58]:
tokenizer = PreprocessText(df['Title'])

padded_tokens = tokenizer.padded_tokens
max_tokens = tokenizer.max_tokens
vocab_size = len(tokenizer.word_index) + 1

start_token = tokenizer.word_index[tokenizer.start_mark.strip()]
end_token = tokenizer.word_index[tokenizer.end_mark.strip()]

tokenizer.get_info()

Max tokens:  26
Mean tokens:  8
Standard deviation of tokens:  3
Vocabulary Size:  563
Shape of padded tokens:  (100, 26)


In [59]:
X_images, X_texts, y_texts = [], [], []
for img, seq in zip(image_data, padded_tokens):
    for i in range(1, len(seq)):
        in_seq, out_seq = seq[:i], seq[i]
        in_seq = pad_sequences([in_seq], maxlen=max_tokens)[0]
        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
        X_images.append(img)
        X_texts.append(in_seq)
        y_texts.append(out_seq)

X_images, X_texts, y_texts = np.array(X_images), np.array(X_texts), np.array(y_texts)

In [None]:
# Encoder
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
base_model.trainable = False

image_input = Input(shape=(224, 224, 3))
x = base_model(image_input)
x = GlobalAveragePooling2D()(x)
x = Dense(256, activation='relu')(x)
image_embedding = Dense(32, activation='relu')(x)

# Decoder
text_input = Input(shape=(None,))
text_embedding = Embedding(vocab_size, 300)(text_input)
text_lstm = LSTM(256)(text_embedding)

combined = concatenate([image_embedding, text_lstm])
output = Dense(vocab_size, activation='softmax')(combined)

model = Model(inputs=[image_input, text_input], outputs=output)

model.compile(loss='categorical_crossentropy', optimizer='adam')

model.summary()

In [61]:
plot_model(model, show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [64]:
es = EarlyStopping(monitor='loss', patience=5, verbose=1)

model.fit([X_images, X_texts], y_texts,
          epochs=5, batch_size=64,
          callbacks=[es]
  )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x11ca853f640>

In [68]:
def generate_text(test_image):
  initial_caption = '<start>'
  max_caption_length = 20

  initial_caption_seq = tokenizer.texts_to_sequences([initial_caption])[0]
  initial_caption_seq = pad_sequences([initial_caption_seq], maxlen=max_tokens)

  final_caption = []

  while True:
      predictions = model.predict([np.expand_dims(test_image, axis=0), initial_caption_seq])

      predicted_word_index = np.argmax(predictions)

      predicted_word = tokenizer.index_word[predicted_word_index]

      if predicted_word == '<end>':
          break

      final_caption.append(predicted_word)

      initial_caption_seq = pad_sequences([initial_caption_seq[0].tolist() + [predicted_word_index]], maxlen=max_tokens)

  final_caption = ' '.join(final_caption)

  plt.imshow(test_image)
  plt.axis("off")
  plt.show()
  print()
  print(final_caption)

In [69]:
generate_text(X_images[0])



KeyError: 0