## Traitement de Données Massives

# Projet Partie n°1 

### Binôme : Sapy Oscar & Berthillon Mickaël

# 1 : Collecte de données

In [14]:
import requests
import os

# Pexels search endpoint used to look up candidate photos.
url = "https://api.pexels.com/v1/search"

# NOTE(security): an API key is hard-coded in this notebook and leaks with it.
# Prefer exporting PEXELS_API_KEY; the literal is kept only as a fallback so
# the cell keeps running unchanged. The exposed key should be rotated.
access_key = os.environ.get(
    "PEXELS_API_KEY",
    "MgxBjp4CnSFtTvWe3SeHtkTGp5oUOBWVG5S0PYZAd37hV3T3h2yY4PHp",
)

# The API expects the key in the Authorization header.
headers = {"Authorization": access_key}

# Destination folder for the downloaded images.
path = "./images"

# Maximum number of images to download.
num_images = 5

# Search parameters sent with the request.
query_params = {
    "query": "random",
    "per_page": num_images,
}

# Seconds to wait on each HTTP call so a stalled connection cannot hang the cell.
request_timeout = 30

# Create the images folder if it doesn't exist.
os.makedirs(path, exist_ok=True)

# Run the search; fail loudly on an HTTP error instead of trying to parse an
# error body as a result page.
response = requests.get(
    url, headers=headers, params=query_params, timeout=request_timeout
)
response.raise_for_status()

# Named `search_results` (not `json`) so the standard `json` module name is
# never shadowed in the notebook namespace.
search_results = response.json()

# Iterate over what the API actually returned: it may hold fewer than
# `num_images` photos, which made index-based access raise IndexError.
for photo in search_results.get("photos", [])[:num_images]:
    image_url = photo["src"]["original"]
    image_id = photo["id"]
    image_extension = ".jpg"  # every file is stored with this extension
    image_filename = f"{image_id}{image_extension}"
    image_path = os.path.join(path, image_filename)

    # Download the image; skip it when the server answers with an error so
    # that an error page is never written to disk as if it were a picture.
    image_response = requests.get(image_url, timeout=request_timeout)
    if not image_response.ok:
        print(f"Download failed for {image_url} (HTTP {image_response.status_code}). Skipping...")
        continue

    with open(image_path, "wb") as f:
        f.write(image_response.content)


# 2 : Étiquetage et annotation

In [15]:
import os
import json
from PIL import Image
from PIL.ExifTags import TAGS
from PIL.TiffImagePlugin import IFDRational

class MyEncoder(json.JSONEncoder):
    """JSON encoder that also accepts raw bytes and Pillow ``IFDRational`` values."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for *obj*.

        Bytes are decoded as UTF-8, with undecodable sequences replaced by
        U+FFFD. ``IFDRational`` values are converted to ``float``. Any other
        type is handed to the base encoder, which raises ``TypeError``.
        """
        if isinstance(obj, bytes):
            # The "replace" handler produces exactly the strict-decode result
            # whenever the bytes are valid UTF-8, so one call covers both cases.
            return obj.decode('utf-8', 'replace')
        if isinstance(obj, IFDRational):
            return float(obj)
        return super().default(obj)

# Path to the folder containing pictures
path_to_folder = './images'

# Create the metadata folder if it doesn't exist
os.makedirs('metadata', exist_ok=True)

# Loop over all pictures in the folder
for filename in os.listdir(path_to_folder):
    # str.endswith accepts a tuple, so one call covers every extension.
    if not filename.endswith(('.jpg', '.jpeg', '.png')):
        continue

    # Open the image through a context manager so its file handle is released
    # as soon as the EXIF data has been copied (it was previously left open
    # for every image processed).
    with Image.open(os.path.join(path_to_folder, filename)) as image:
        exifdata = image.getexif()

        # Map each EXIF tag id to its name when Pillow knows it; unknown ids
        # are kept as-is.
        metadata = {
            TAGS.get(tag_id, tag_id): value
            for tag_id, value in exifdata.items()
        }

    # Write the metadata dictionary to a JSON file in the metadata folder
    json_filename = os.path.splitext(filename)[0] + '.json'
    with open(os.path.join('metadata', json_filename), 'w') as f:
        json.dump(metadata, f, cls=MyEncoder)


# 3 : Analyse de données

In [16]:
import os
import json
import torch
from PIL import Image
from PIL.ExifTags import TAGS
from transformers import ViTFeatureExtractor, ViTModel
import cv2
from colorthief import ColorThief
import pytesseract
from imageai.Detection import ObjectDetection
# The string literal below is an inert expression statement: it holds an
# earlier ImageAI/YOLOv3 + ViT setup that is NOT executed.
# NOTE(review): that disabled snippet downloads a `.h5` file but saves and
# loads it under a `.pt` name; check this before reviving it.
"""
# Download the YOLOv3 PyTorch model
url = "https://github.com/OlafenwaMoses/ImageAI/releases/download/essential-v4/pretrained-yolov3.h5"
response = requests.get(url)

with open("pretrained-yolov3.pt", "wb") as f:  
    f.write(response.content)

image_path = "./images"

# Initialize the ViT feature extractor and model
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
model = ViTModel.from_pretrained("google/vit-base-patch16-224")

# Initialize ImageAI object detection model
detector = ObjectDetection()
detector.setModelTypeAsYOLOv3()
detector.setModelPath("pretrained-yolov3.pt")
detector.loadModel()"""


import torchvision.transforms as T
from torchvision.models.detection import fasterrcnn_resnet50_fpn

# Build the pretrained torchvision Faster R-CNN detector and switch it to
# inference mode in one expression (Module.eval() returns the module itself).
model = fasterrcnn_resnet50_fpn(pretrained=True).eval()

# Folder that holds the images to analyse
image_path = "./images"

# Function to get image metadata
def get_image_metadata(image_path):
    """Analyse one image file and return its descriptive metadata.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A dict with the keys ``orientation`` (value of EXIF tag 274, or
        ``"unknown"`` when absent), ``dominant_color`` and ``color_palette``
        (computed by ColorThief), ``num_faces`` (number of Haar-cascade
        detections), ``text`` (pytesseract OCR output) and ``objects`` (label
        ids returned by the module-level ``model`` with a score above 0.5).

    Raises:
        ValueError: If OpenCV cannot decode the file.
    """
    # Get dominant color and color palette
    color_thief = ColorThief(image_path)
    dominant_color = color_thief.get_color(quality=1)
    color_palette = color_thief.get_palette(color_count=5)

    # Open the file once, inside a context manager, so the handle is released.
    # The public getexif() replaces the private _getexif(), which is not
    # guaranteed to exist on every Pillow image class.
    with Image.open(image_path) as pil_image:
        # 274 is the EXIF tag id read for the orientation
        orientation = pil_image.getexif().get(274, "unknown")
        # convert() returns an independent in-memory copy, usable after close
        pil_image_rgb = pil_image.convert('RGB')

    # Get number of faces
    cv_image = cv2.imread(image_path)
    if cv_image is None:
        # cv2.imread signals failure by returning None instead of raising
        raise ValueError(f"OpenCV could not read image: {image_path}")
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
    gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=3)
    num_faces = len(faces)

    # Get text
    text = pytesseract.image_to_string(pil_image_rgb)

    # Get objects. The RGB copy is fed to the detector so grayscale or RGBA
    # files still produce the 3-channel tensor the model is given for JPEGs.
    transform = T.Compose([T.Resize(256), T.CenterCrop(224), T.ToTensor()])
    batch = transform(pil_image_rgb).unsqueeze(0)
    # Inference only: skip autograd bookkeeping
    with torch.no_grad():
        detections = model(batch)

    objects = [
        label.item()
        for label, score in zip(detections[0]["labels"], detections[0]["scores"])
        if score > 0.5
    ]

    # Build metadata dictionary
    return {
        "orientation": orientation,
        "dominant_color": dominant_color,
        "color_palette": color_palette,
        "num_faces": num_faces,
        "text": text,
        "objects": objects,
    }

# Make sure the output folder exists even when this cell is run on its own.
os.makedirs("metadata", exist_ok=True)

# NOTE(review): the files written here have the same names as the EXIF dumps
# produced by the annotation cell (metadata/<stem>.json), so those dumps are
# overwritten. Confirm this is intended.

# Loop over all pictures in the folder. The folder comes from this cell's own
# `image_path`, so the loop no longer depends on a variable defined in another
# cell.
for filename in os.listdir(image_path):
    if not filename.endswith((".jpg", ".jpeg", ".png")):
        continue

    # A dedicated name for the file path keeps the module-level `image_path`
    # (the folder) from being overwritten on every iteration.
    image_file = os.path.join(image_path, filename)
    metadata = get_image_metadata(image_file)

    # Write the metadata dictionary to a JSON file in the metadata folder
    with open(os.path.join("metadata", os.path.splitext(filename)[0] + ".json"), "w") as f:
        json.dump(metadata, f)






# 4 : Visualisation de données

In [17]:
import json
import os
from collections import defaultdict

# Load the user's preferences from preferences.json
with open("preferences.json", "r") as f:
    preferences = json.load(f)

liked_images = []

# Keep the keys whose value names a file that exists in the images folder.
# NOTE(review): the messages say "metadata file", but the check is made
# against the images folder.
for key, value in preferences.items():
    image_found = bool(value) and os.path.exists(os.path.join("images", value))
    if not image_found:
        print(f"Image {key} with metadata file {value} not found. Skipping...")
        continue

    # Store the key (image identifier), not the filename
    liked_images.append(key)
    print(f"Image {key} with metadata file {value} found and added to liked images...")

print(liked_images)


Image image1 with metadata file 1000366.jpg found and added to liked images...
Image image2 with metadata file  not found. Skipping...
Image image3 with metadata file  not found. Skipping...
Image image4 with metadata file  not found. Skipping...
Image image5 with metadata file  not found. Skipping...
Image image6 with metadata file  not found. Skipping...
Image image7 with metadata file  not found. Skipping...
Image image8 with metadata file  not found. Skipping...
Image image9 with metadata file  not found. Skipping...
Image image10 with metadata file  not found. Skipping...
Image image11 with metadata file  not found. Skipping...
Image image12 with metadata file  not found. Skipping...
Image image13 with metadata file  not found. Skipping...
['image1']


In [36]:
import json
import os

# Load the user's preferences from preferences.json
with open("preferences.json", "r") as f:
    preferences = json.load(f)

# Keep the filenames that have a matching JSON file in the metadata folder
liked_images = [
    value
    for value in preferences.values()
    if value and os.path.exists(os.path.join("metadata", os.path.splitext(value)[0] + ".json"))
]

preferred_tags = []

# Collect every (tag, value) pair from the metadata of the liked images
for image in liked_images:
    stem = os.path.splitext(image)[0]
    with open(os.path.join("metadata", stem + ".json"), "r") as f:
        metadata = json.load(f)

    # dict.items() yields the same (tag, value) tuples, in the same order
    preferred_tags.extend(metadata.items())

print(preferred_tags)


[('orientation', 'unknown'), ('dominant_color', [194, 196, 190]), ('color_palette', [[194, 196, 190], [59, 73, 80], [147, 137, 122], [116, 82, 84], [127, 120, 63]]), ('num_faces', 18), ('text', ' \n\x0c'), ('objects', [])]


In [39]:
import json
import os

# Read the user's preferences from the preferences.json file
with open("preferences.json", "r") as f:
    preferences = json.load(f)

# Keys whose value names a file that exists in the images folder
liked_images = [
    key
    for key, value in preferences.items()
    if value and os.path.exists(os.path.join("images", value))
]

liked_images_filenames = [preferences[key] for key in liked_images]

# Liked images are identified by file stem, so they are excluded below
# whatever their extension (the previous "<stem>.jpg" comparison let liked
# .png/.jpeg images be scored against themselves). A set gives O(1) lookups.
liked_stems = {os.path.splitext(name)[0] for name in liked_images_filenames}

# Collect the (tag, value) pairs of every liked image
preferred_tags = []
for liked_filename in liked_images_filenames:
    # splitext matches how the metadata files are named when they are written;
    # split('.')[0] truncated names that contain more than one dot.
    liked_metadata_path = os.path.join(
        "metadata", os.path.splitext(liked_filename)[0] + ".json"
    )
    # The image existing does not guarantee its metadata file does
    if not os.path.exists(liked_metadata_path):
        print(f"Metadata file {liked_metadata_path} not found. Skipping...")
        continue
    with open(liked_metadata_path, "r") as f:
        metadata = json.load(f)
    preferred_tags.extend(metadata.items())

similarity_scores = {}

# Loop through all metadata files in the metadata folder
for metadata_file in os.listdir("metadata"):
    stem, extension = os.path.splitext(metadata_file)

    # Ignore anything that is not a JSON file, and the liked images themselves
    if extension != ".json" or stem in liked_stems:
        continue

    # NOTE(review): result keys keep the "<stem>.jpg" naming used so far,
    # even though the source image may have another extension.
    image_filename = stem + ".jpg"

    with open(os.path.join("metadata", metadata_file), "r") as f:
        metadata = json.load(f)

    # One point per preferred (tag, value) pair found unchanged in this image
    similarity_score = sum(
        1
        for key, value in preferred_tags
        if key in metadata and metadata[key] == value
    )

    # Only keep images that share at least one tag value
    if similarity_score > 0:
        similarity_scores[image_filename] = similarity_score

print(similarity_scores)


{'4194850': 2, '4226881': 2, '1000366': 6, '3737018': 2, '3844788': 1}
