In [1]:
from __future__ import print_function, division
import os
import cv2
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from skimage import io, transform
from torch.utils.data import Dataset, DataLoader, Subset, random_split
from torch.utils.data.dataloader import default_collate
from torchvision import transforms, utils

from bs4 import BeautifulSoup
import requests
import warnings

from keras.preprocessing.image import load_img, img_to_array
from keras.applications.inception_v3 import preprocess_input, decode_predictions, InceptionV3
from keras.models import model_from_json, Model
from sklearn.model_selection import train_test_split

import pickle

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Suppress every warning, of every category, for the rest of the session.
# NOTE(review): this blanket filter also hides any warning raised by the
# libraries used in later cells — consider restricting it to specific
# categories/modules once the noisy sources are identified.
warnings.filterwarnings("ignore")

In [3]:
class RedditDataset(Dataset):
    """Reddit dataset.

    Rows come from four CSV files (happy, creepy, gore, rage) found in one
    folder, concatenated and sorted by the ``ups`` column in descending order.
    Images are not stored locally: ``__getitem__`` downloads the image behind
    the row's URL every time it is called.

    Failure contract: when the image cannot be fetched or resized,
    ``__getitem__`` prints a short message and returns an empty dict ``{}``
    instead of raising, so that iteration can continue.
    """

    # File names read from the data folder by __init__.
    CSV_FILES = (
        'processed_happy.csv',
        'processed_creepy.csv',
        'processed_gore.csv',
        'processed_rage.csv',
    )
    # Positional column indices read by __getitem__.
    URL_COLUMN = 2
    TITLE_COLUMN = 6
    LABEL_COLUMN = 8
    # Every image is resized to this shape.
    IMAGE_SHAPE = (299, 299, 3)
    # An imgur URL containing one of these is used as-is (no page scraping).
    DIRECT_IMAGE_EXTENSIONS = ('.jpeg', '.jpg', '.png')
    # Seconds to wait for an imgur page before giving up on the sample.
    REQUEST_TIMEOUT = 10

    def __init__(self, folder):
        """
        Arg:
            folder(string): Path to the folder that contains the csv files.
        """
        frames = [pd.read_csv(os.path.join(folder, name)) for name in self.CSV_FILES]
        self.reddit_data = pd.concat(frames).sort_values(by='ups', ascending=False)

    def __len__(self):
        """Return the number of rows across all loaded CSV files."""
        return len(self.reddit_data)

    def _resolve_image_url(self, img_url):
        """Return a URL that can be handed to the image reader, or None.

        Non-imgur URLs and imgur URLs that already contain an image extension
        are returned unchanged. For any other imgur URL the page is fetched and
        the ``href`` of its ``<link rel="image_src">`` tag is returned.
        Network errors propagate to the caller.
        """
        # A missing CSV cell is read back as NaN (a float), not a string.
        if not isinstance(img_url, str):
            return None
        if 'imgur' not in img_url:
            return img_url
        if any(ext in img_url for ext in self.DIRECT_IMAGE_EXTENSIONS):
            return img_url
        # Timeout so that one unresponsive host cannot stall the whole run.
        page = requests.get(img_url, timeout=self.REQUEST_TIMEOUT).content
        # Explicit parser: the result no longer depends on which optional
        # parsers happen to be installed, and bs4 emits no parser warning.
        soup = BeautifulSoup(page, 'html.parser')
        link = soup.find('link', rel="image_src", href=True)
        return link['href'] if link else None

    def _load_image(self, img_url):
        """Download the image behind ``img_url``; return None on any failure."""
        # `except Exception` (not a bare except) keeps Ctrl-C / SystemExit
        # working during the long download loop.
        try:
            resolved_url = self._resolve_image_url(img_url)
            if resolved_url is None:
                return None
            return io.imread(resolved_url)
        except Exception:
            return None

    def __getitem__(self, idx):
        """Fetch one sample.

        Args:
            idx: Positional row index into the sorted data.

        Returns:
            dict: ``{'image', 'description', 'label'}`` where ``image`` is the
            downloaded picture resized to ``IMAGE_SHAPE``; or ``{}`` when the
            image could not be downloaded or resized.
        """
        img_url = self.reddit_data.iloc[idx, self.URL_COLUMN]
        img_title = self.reddit_data.iloc[idx, self.TITLE_COLUMN]
        img_label = self.reddit_data.iloc[idx, self.LABEL_COLUMN]

        image = self._load_image(img_url)
        if image is None:
            print("Requested photo not available.")
            return {}

        try:
            image = transform.resize(image, self.IMAGE_SHAPE)
        except Exception:
            print("Image dimensions are too small")
            return {}

        # Always return a dict: an implicit None here would bypass the
        # "empty dict means failure" contract.
        if image.ndim != 3:
            return {}
        return {'image': image, 'description': img_title, 'label': img_label}

In [4]:
def my_collate(batch):
    """Collate a batch while skipping samples that failed to load.

    A sample counts as failed when it is ``None`` or an empty dict; both
    markers are dropped before the remaining samples are passed to
    ``default_collate``.

    Args:
        batch: List of samples produced by the dataset for one batch.

    Returns:
        The output of ``default_collate`` on the usable samples, or an empty
        dict ``{}`` when no sample in the batch is usable (``default_collate``
        cannot handle an empty list).
    """
    usable = [
        sample for sample in batch
        if sample is not None and not (isinstance(sample, dict) and not sample)
    ]
    if not usable:
        return {}
    return default_collate(usable)

In [5]:
# Build the dataset from the CSV folder, then wrap it in a loader that
# yields one shuffled sample per batch and collates it with my_collate.
reddit_dataset = RedditDataset(folder='data/reddit_data')
loader = DataLoader(
    dataset=reddit_dataset,
    batch_size=1,
    shuffle=True,
    collate_fn=my_collate,
)

In [6]:
# Once the model has been saved to disk it can be reloaded every time it is needed.
# Read the JSON model description; the context manager closes the file even
# if reading fails.
with open("models/inception_feature_extractor.json", 'r') as json_file:
    model_json = json_file.read()
# Rebuild the model from its JSON description (kept in a separate name so the
# model variable never temporarily holds a plain string).
inception_feature_extractor = model_from_json(model_json)
# load weights into new model
inception_feature_extractor.load_weights("models/inception_feature_extractor.h5")
print("Loaded image feature extractor")

Loaded image feature extractor


In [7]:
# Push every usable sample through the feature extractor and collect the
# resulting vector together with the sample's label and description.
image_features, text, labels = [], [], []
for sample in loader:
    # An empty batch means the sample could not be loaded: skip it.
    if not sample:
        continue
    pixels = sample['image'].numpy()
    image_features.append(inception_feature_extractor.predict(pixels))
    labels.append(sample['label'])
    text.append(sample['description'])

# Convert the collected lists to arrays; singleton axes of the stacked
# feature array are removed.
image_features = np.squeeze(np.array(image_features))
text = np.array(text)
labels = np.array(labels)
for collected in (image_features, text, labels):
    print(collected.shape)

Requested photo not available.
Requested photo not available.
Requested photo not available.
Requested photo not available.
Requested photo not available.
Image dimensions are too small
Requested photo not available.
Requested photo not available.
Requested photo not available.
Requested photo not available.
Image dimensions are too small
Requested photo not available.
Requested photo not available.
Requested photo not available.
Requested photo not available.
Requested photo not available.
Requested photo not available.
Image dimensions are too small
Requested photo not available.
Requested photo not available.
Requested photo not available.
Requested photo not available.
Requested photo not available.
Requested photo not available.
Requested photo not available.
Requested photo not available.
Requested photo not available.
Requested photo not available.
Requested photo not available.
Requested photo not available.
Requested photo not available.
Requested photo not available.
Requeste

In [12]:
# Split the three aligned arrays in ONE call so that image features, text and
# labels are guaranteed to share the same train/test row assignment, instead
# of relying on two separate calls reproducing the same shuffle.
(image_features_train, image_features_test,
 text_train, text_test,
 labels_train, labels_test) = train_test_split(
    image_features, text, labels, test_size=0.20, random_state=42)

In [13]:
# Persist the test split, one pickle file per array.
# Create the output folder if needed so the first run does not fail.
os.makedirs('processed_data', exist_ok=True)
# `with` closes (and flushes) each file as soon as it has been written.
with open('processed_data/image_testing_features.pkl', 'wb') as pkl_file:
    pickle.dump(image_features_test, pkl_file)
with open('processed_data/text_testing.pkl', 'wb') as pkl_file:
    pickle.dump(text_test, pkl_file)
# Alternative plain-text export (disabled):
# np.savetxt("processed_data/text_testing_features.csv", text_test, header='title', fmt='%s', comments='')
with open('processed_data/testing_labels.pkl', 'wb') as pkl_file:
    pickle.dump(labels_test, pkl_file)

In [10]:
# pickle.dump(valid_image_features, open(f'processed_data/image_validation_features.pkl', 'wb'))
# pickle.dump(valid_text, open(f'processed_data/text_validation.pkl', 'wb'))
# # np.savetxt("processed_data/text_validation_features.csv", valid_text, header='title', fmt='%s', comments='')
# pickle.dump(valid_labels, open(f'processed_data/validation_labels.pkl', 'wb'))

In [14]:
# Persist the training split, one pickle file per array.
# Create the output folder if needed so the first run does not fail.
os.makedirs('processed_data', exist_ok=True)
# `with` closes (and flushes) each file as soon as it has been written.
with open('processed_data/image_training_features.pkl', 'wb') as pkl_file:
    pickle.dump(image_features_train, pkl_file)
with open('processed_data/text_training.pkl', 'wb') as pkl_file:
    pickle.dump(text_train, pkl_file)
# Alternative plain-text export (disabled):
# np.savetxt("processed_data/text_training_features.csv", text_train, header='title', fmt='%s', comments='')
with open('processed_data/training_labels.pkl', 'wb') as pkl_file:
    pickle.dump(labels_train, pkl_file)