In [13]:
from __future__ import print_function, division
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.data.dataloader import default_collate
from torchvision import transforms, utils
import requests
from bs4 import BeautifulSoup

# Ignore warnings
# NOTE(review): "ignore" with no category or module filter suppresses every
# warning for the rest of the kernel session, including deprecation notices
# from the libraries imported above. Consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

In [14]:
class RedditDataset(Dataset):
    """Reddit image posts read from four processed per-category CSV files.

    The CSV rows are concatenated and sorted by their 'ups' column in
    descending order. Images are not stored locally: every item access
    downloads the image behind the row's URL and resizes it.

    Items that cannot be loaded are returned as ``None`` so that a collate
    function can drop them from the batch.
    """

    # Positional columns of the processed CSV files read by __getitem__.
    URL_COLUMN = 2
    TITLE_COLUMN = 6
    LABEL_COLUMN = 8
    # Shape (rows, cols, channels) every downloaded image is resized to.
    IMAGE_SHAPE = (299, 299, 3)
    # Substrings that mark an imgur URL as already pointing at an image file.
    IMAGE_EXTENSIONS = ('.jpeg', '.jpg', '.png')
    # Seconds to wait for an imgur page before giving up on the sample.
    REQUEST_TIMEOUT = 10
    # File names expected inside the data folder.
    CSV_FILES = (
        'processed_happy.csv',
        'processed_creepy.csv',
        'processed_gore.csv',
        'processed_rage.csv',
    )

    def __init__(self, folder):
        """
        Args:
            folder (string): Path to the folder that contains the csv files.
        """
        frames = [pd.read_csv(os.path.join(folder, name)) for name in self.CSV_FILES]
        self.reddit_data = pd.concat(frames).sort_values(by='ups', ascending=False)

    def __len__(self):
        """Return the number of rows across all loaded CSV files."""
        return len(self.reddit_data)

    def _resolve_image_url(self, img_url):
        """Turn a post URL into a direct image URL.

        Non-imgur URLs and imgur URLs that already contain an image extension
        are returned unchanged. Other imgur URLs are treated as HTML pages and
        the ``<link rel="image_src">`` target is extracted from them.

        Returns:
            The URL to download, or None when the cell is not a string, the
            page cannot be fetched, or the page has no image link.
        """
        if not isinstance(img_url, str):
            # A missing CSV cell is read by pandas as float NaN; the substring
            # checks below would raise TypeError on it.
            return None
        if 'imgur' not in img_url:
            return img_url
        if any(extension in img_url for extension in self.IMAGE_EXTENSIONS):
            return img_url

        try:
            response = requests.get(img_url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            return None
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(response.content, 'html.parser')
        link = soup.find('link', rel="image_src", href=True)
        return link['href'] if link else None

    def __getitem__(self, idx):
        """Download, resize and return the sample at position ``idx``.

        Returns:
            dict with keys 'image' (array resized to IMAGE_SHAPE),
            'description' and 'label', or None when the image could not be
            fetched or resized. A message is printed in the failure cases.
        """
        img_url = self._resolve_image_url(self.reddit_data.iloc[idx, self.URL_COLUMN])
        if img_url is None:
            print("Requested photo not available.")
            return None

        img_title = self.reddit_data.iloc[idx, self.TITLE_COLUMN]
        img_label = self.reddit_data.iloc[idx, self.LABEL_COLUMN]

        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still stop a long-running loop.
        try:
            image = io.imread(img_url)
        except Exception:
            print("Requested photo not available.")
            return None

        try:
            image = transform.resize(image, self.IMAGE_SHAPE)
        except Exception:
            print("Image dimensions are too small")
            return None

        if image.ndim != 3:
            return None
        return {'image': image, 'description': img_title, 'label': img_label}

In [15]:
def my_collate(batch):
    """Collate a batch after dropping samples that are None.

    Args:
        batch (list): Samples produced by the dataset; entries may be None
            when a sample could not be loaded.

    Returns:
        The result of ``default_collate`` on the remaining samples, or None
        when no sample is left. Callers should skip None batches.
    """
    loaded = [sample for sample in batch if sample is not None]
    if not loaded:
        # default_collate inspects the first element and fails on an empty
        # list, which would abort iteration over the whole loader.
        return None
    return default_collate(loaded)

In [16]:
# Full dataset; the three splits below are disjoint index ranges over it.
# NOTE(review): rows are sorted by 'ups' in descending order, so these
# contiguous ranges split the data by popularity - confirm this is intended.
reddit_dataset = RedditDataset('reddit_data')
dataset_size = len(reddit_dataset)
split_size = int(dataset_size * 0.1)
# Clamp so a very small dataset cannot yield an out-of-range index.
validation_end = min(2 * split_size + 1, dataset_size)

# Testing dataset and dataloader
test_indices = list(range(0, split_size))
test_dataset = Subset(reddit_dataset, test_indices)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=True, collate_fn=my_collate)

# Validation dataset and dataloader
validation_indices = list(range(split_size, validation_end))
validation_dataset = Subset(reddit_dataset, validation_indices)
validation_loader = DataLoader(validation_dataset, batch_size=1, shuffle=True, collate_fn=my_collate)

# Training dataset and dataloader: everything not held out above, so no
# test or validation sample is seen during training.
train_indices = list(range(validation_end, dataset_size))
train_dataset = Subset(reddit_dataset, train_indices)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=my_collate)

In [17]:
import keras
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.inception_v3 import preprocess_input
from keras.applications.inception_v3 import decode_predictions
from keras.applications.inception_v3 import InceptionV3
from keras.models import model_from_json
from keras.models import Model
import cv2

In [18]:
# Once the model has been saved to disk it can be reloaded every time it is needed.
# Load the JSON architecture description and create the model from it.
# The context manager closes the file even if reading fails.
with open("inception_feature_extractor.json", 'r') as json_file:
    model_json = json_file.read()
inception_feature_extractor = model_from_json(model_json)
# Load the saved weights into the new model
inception_feature_extractor.load_weights("inception_feature_extractor.h5")
print("Loaded model from disk")

Loaded model from disk


In [19]:
# Images of the training dataset to vectors
# Each batch from train_loader is indexed by key; its 'image' entry is
# converted to a NumPy array and passed to the feature extractor. Only the
# shape of the result is printed; the vectors are not stored.
for sample in train_loader :
    np_image = sample['image'].numpy()
    # NOTE(review): the Inception preprocessing call below is disabled, so the
    # extractor receives the pixel values exactly as the dataset produced
    # them - confirm this matches what the saved model expects.
    #np_image = preprocess_input(np_image)
    image_vect = inception_feature_extractor.predict(np_image)
    print(image_vect.shape)

(4, 2048)
(4, 2048)
(4, 2048)
Requested photo not available.
(3, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
Requested photo not available.
(3, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(3, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(3, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
Requested photo not available.
(3, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
Requested photo not available.
(3, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
Requested photo not available.
(3, 2

(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
Requested photo not available.
(3, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
Requested photo not available.
(3, 2048)
(4, 2048)
(4, 2048)
Requested photo not available.
(3, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
Requested photo not available.
(3, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
Requested photo not available.
(3, 2048)
Requested photo not available.
(3, 2048)
(4, 2048)
(4, 2048)
(4, 2048)
(4, 