In [1]:
import sys 

sys.path.append('../Preprocess-Data/')

In [2]:
import numpy as np 
import pandas as pd 
import os 

from category_reducer import category_reducer
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import warnings 
warnings.filterwarnings("ignore")

In [3]:
def load_data(data_path="../Data/news-data-with-imgs.csv"):
    '''
    # Load the news data from a csv file and reduce the number of categories

    Args:
        - data_path: Path of the csv file to read. Defaults to the
          news dataset shipped with the project.

    Returns:
        - df: DataFrame holding only the 'Content', 'Title' and
          'img_path' columns of the category-reduced data
    '''
    df = pd.read_csv(data_path)

    # Reducing the number of categories
    df = category_reducer(df)

    # DataFrame.info() writes its summary to stdout and returns None, so
    # wrapping it in print() would append a stray "None" line to the output.
    df.info()

    df = df[['Content', 'Title', 'img_path']]
    return df

In [4]:
df = load_data()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Title                100 non-null    object
 1   Content              100 non-null    object
 2   Content_url          100 non-null    object
 3   News_type            100 non-null    object
 4   Day_month_year_hour  100 non-null    object
 5   Img_url              100 non-null    object
 6   img_path             100 non-null    object
dtypes: object(7)
memory usage: 5.6+ KB
None


In [5]:
class PreprocessImg:
    '''
    # Class to load the images and get the size information of the images.

    Args:
        - images_path: Directory that holds the image files. Defaults to
          the project's image directory.
    '''
    def __init__(self, images_path="../Data/imgs/"):
        self.images_path = images_path
        
    def load_img(self):
        '''
        # Load the images from the directory

        Entries are visited in sorted name order so the result is
        deterministic (os.listdir gives no ordering guarantee), and
        anything that is not a regular file (e.g. a sub-directory) is
        skipped instead of being handed to the image loader.

        Returns:
            - loaded_imgs: List of loaded images
        '''
        loaded_imgs = []
        for file_name in sorted(os.listdir(self.images_path)):
            file_path = os.path.join(self.images_path, file_name)
            if not os.path.isfile(file_path):
                continue
            loaded_imgs.append(image.load_img(file_path))
        return loaded_imgs

    def get_size_info(self, loaded_imgs):
        '''
        # Get the size information of the images 

        Args:
            - loaded_imgs: list of loaded images; each item must expose a
              `size` attribute indexed as (width, height)

        Returns:
            - widths_mean: Mean of the widths of the images
            - widths_std: Standard deviation of the widths of the images
            - heights_mean: Mean of the heights of the images
            - heights_std: Standard deviation of the heights of the images
        '''
        # Read each image's size once, then split it into the two axes.
        sizes = [img.size for img in loaded_imgs]
        img_widths = [size[0] for size in sizes]
        img_heights = [size[1] for size in sizes]

        widths_mean = np.mean(img_widths)
        widths_std = np.std(img_widths)

        heights_mean = np.mean(img_heights)
        heights_std = np.std(img_heights)

        return widths_mean, widths_std, heights_mean, heights_std

In [6]:
# Load every image, then summarise the distribution of their dimensions.
img_preprocessor = PreprocessImg()
loaded_imgs = img_preprocessor.load_img()
widths_mean, widths_std, heights_mean, heights_std = img_preprocessor.get_size_info(loaded_imgs)

# One line per statistic, in the order: width mean/std, height mean/std.
print("\n".join(
    f"{label} {value!s}"
    for label, value in (
        ("Mean width:", widths_mean),
        ("Standard deviation of widths:", widths_std),
        ("Mean height:", heights_mean),
        ("Standard deviation of heights:", heights_std),
    )
))

Mean width: 1029.85
Standard deviation of widths: 355.39463628479257
Mean height: 709.22
Standard deviation of heights: 297.7434660911974


In [7]:
class PreprocessText(Tokenizer):
    '''
    # Tokenizer that is fitted on a collection of texts at construction
    # time and keeps the resulting padded integer sequences.

    Args:
        - data: Iterable of texts (e.g. a DataFrame column)
        - num_std: Number of standard deviations added to the mean
          sequence length when choosing the padded length (default 2)
        - **tokenizer_kwargs: Extra keyword arguments forwarded to
          Tokenizer.__init__

    Attributes:
        - tokens: Integer sequences returned by texts_to_sequences
        - numbers_of_words: Length of every sequence in `tokens`
        - max_tokens: int(mean + num_std * std) of `numbers_of_words`
        - padded_tokens: `tokens` padded/truncated at the end ('post')
          to `max_tokens`

    Raises:
        - ValueError: If `data` contains no texts
    '''
    def __init__(self, data, num_std=2, **tokenizer_kwargs):
        super().__init__(**tokenizer_kwargs)

        # Materialise once: the texts are traversed twice below, which
        # would silently yield nothing the second time for a generator.
        texts = list(data)
        if not texts:
            # Without this guard the mean of an empty list is NaN and
            # int(NaN) fails with an unhelpful ValueError.
            raise ValueError("PreprocessText needs at least one text to fit on.")

        self.fit_on_texts(texts)
        self.tokens = self.texts_to_sequences(texts)

        self.numbers_of_words = [len(token) for token in self.tokens]
        # Padded length = mean length plus `num_std` standard deviations,
        # truncated to an integer.
        self.max_tokens = int(np.mean(self.numbers_of_words) + num_std * np.std(self.numbers_of_words))

        self.padded_tokens = pad_sequences(self.tokens, maxlen=self.max_tokens, padding='post', truncating='post')

In [9]:
# Tokenize and pad the article titles.
title_preprocessor = PreprocessText(df['Title'])

title_max_tokens = title_preprocessor.max_tokens
title_padded_tokens = title_preprocessor.padded_tokens

print(f"Max tokens: {title_max_tokens!s}")
print(f"Shape of padded tokens: {title_padded_tokens.shape!s}")

Max tokens: 13
Shape of padded tokens: (100, 13)


In [10]:
# Tokenize and pad the article bodies.
content_preprocessor = PreprocessText(df['Content'])

content_max_tokens = content_preprocessor.max_tokens
content_padded_tokens = content_preprocessor.padded_tokens

print(f"Max tokens: {content_max_tokens!s}")
print(f"Shape of padded tokens: {content_padded_tokens.shape!s}")

Max tokens: 406
Shape of padded tokens: (100, 406)
