# Eksploracja Danych - Projekt
Tomasz Kiljańczyk (136257)

Wojciech Lulek (136280)

In [None]:
import pandas as pd
from tqdm.notebook import tqdm_notebook
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Work on the first 1000 rows of the stage-1 export only.
us_trending_df = pd.read_csv(filepath_or_buffer='./data/us_trending_stage_1.csv').iloc[:1000]
us_trending_df

In [None]:
# Derive the higher-resolution thumbnail URL by swapping the file name.
# regex=False makes this a literal substring replacement on every pandas
# version (before pandas 2.0 the pattern was treated as a regular expression
# by default, so the '.' would match any character).
us_trending_df['thumbnail_link_hires'] = us_trending_df['thumbnail_link'].str.replace(
    'default.jpg', '0.jpg', regex=False
)

# Per-row URLs (aligned with us_trending_df) and the de-duplicated set that
# the download / feature-extraction cells iterate over.
urls = us_trending_df['thumbnail_link_hires']
unique_urls = urls.unique()
us_trending_df['thumbnail_link_hires']

In [None]:
from notebooks.workers import download_and_save
from multiprocessing import Pool
import os

# Create the download directory; exist_ok avoids the check-then-create race
# and makes the cell safe to re-run.
os.makedirs('./images/', exist_ok=True)

# Download every distinct thumbnail in parallel. The loop body is empty on
# purpose: iterating drains the lazy imap iterator and drives the progress bar.
with Pool(processes=os.cpu_count()) as pool:
    for _ in tqdm_notebook(pool.imap(download_and_save, unique_urls), total=unique_urls.size):
        pass


In [None]:
from notebooks.workers import extract_color_features
from PIL import Image

# Columns of the per-thumbnail colour feature table, in output order.
color_feature_columns = [
    'thumbnail_med_hue',
    'thumbnail_med_saturation',
    'thumbnail_med_value',
    'thumbnail_avg_hue',
    'thumbnail_avg_saturation',
    'thumbnail_avg_value',
    'thumbnail_colorfulness'
]

# Extract colour features once per distinct thumbnail, in parallel.
with Pool(processes=os.cpu_count()) as pool:
    results = list(tqdm_notebook(pool.imap(extract_color_features, unique_urls), total=unique_urls.size))

# Each result is unpacked by dict() as a (url, features) pair.
color_features_dict = dict(results)

# Expand back to one feature record per data row (rows may share a thumbnail).
color_features = [color_features_dict[url] for url in urls]

# DataFrame.append was removed in pandas 2.0, so build the frame in one step.
# Assumes each feature record is a mapping (or Series) keyed by the column
# names above, as the original append-based code required -- TODO confirm
# against notebooks.workers.extract_color_features.
image_dataframe = pd.DataFrame(color_features, columns=color_feature_columns)
image_dataframe

In [None]:
def get_concat_h(im1, im2):
    """Return a new RGB image with ``im2`` pasted directly to the right of ``im1``.

    The result is as wide as both inputs together and as tall as ``im1``.
    """
    left_width = im1.width
    canvas_size = (left_width + im2.width, im1.height)

    canvas = Image.new('RGB', canvas_size)
    for picture, x_offset in ((im1, 0), (im2, left_width)):
        canvas.paste(picture, (x_offset, 0))
    return canvas


def _rounded_hsv(feature_row, statistic):
    """Return the rounded (hue, saturation, value) triple for ``statistic`` ('med' or 'avg')."""
    return tuple(
        round(feature_row[f'thumbnail_{statistic}_{channel}'])
        for channel in ('hue', 'saturation', 'value')
    )


# Visual sanity check: show the first five thumbnails next to swatches of
# their median and average colours.
for i, url in enumerate(urls.head(5)):
    file_path = f'./images/{url.split("/")[-2]}.jpg'

    # image_dataframe has one row per entry of `urls`, in the same order.
    row = image_dataframe.iloc[i]
    median_hsv_color = _rounded_hsv(row, 'med')
    avg_hsv_color = _rounded_hsv(row, 'avg')

    # Open the thumbnail in a context manager so the file handle is closed
    # once the composite image has been built.
    with Image.open(file_path) as image:
        # Swatches are 10% of the thumbnail width and the full height.
        dims = (round(image.width * 0.1), image.height)
        # NOTE(review): assumes the features are already on the scale that
        # Pillow's 'HSV' mode expects -- confirm in extract_color_features.
        median_color_image = Image.new('HSV', dims, color=median_hsv_color).convert('RGB')
        average_color_image = Image.new('HSV', dims, color=avg_hsv_color).convert('RGB')

        image_result_1 = get_concat_h(image, median_color_image)
        image_result_1 = get_concat_h(image_result_1, average_color_image)

    display(image_result_1)

In [None]:
from notebooks.workers import detect_text

# Run text detection once per distinct thumbnail, in parallel, collecting the
# worker outputs in submission order.
with Pool(processes=os.cpu_count()) as pool:
    progress = tqdm_notebook(pool.imap(detect_text, unique_urls), total=unique_urls.size)
    results = [detection for detection in progress]

# Each worker output is unpacked by dict() as a (key, value) pair.
text_features_dict = dict(results)

In [None]:
# Fallback used for URLs without a detection result: no text, zero regions,
# zero area ratio (the same per-position defaults as separate lookups would use).
_no_text_features = (False, 0, 0)

# Look every row's URL up once, then split the triples into per-feature lists.
_text_features_per_row = [text_features_dict.get(url, _no_text_features) for url in urls]

has_text = [features[0] for features in _text_features_per_row]
text_count = [features[1] for features in _text_features_per_row]
largest_text_to_image_area_ratio = [features[2] for features in _text_features_per_row]

In [None]:
# Single-column frame: whether text was detected on each row's thumbnail.
has_text_df = pd.DataFrame(data=has_text, columns=['thumbnail_has_text'])
has_text_df

In [None]:
# Single-column frame: the text count reported for each row's thumbnail.
text_count_df = pd.DataFrame(data=text_count, columns=['thumbnail_text_count'])
text_count_df

In [None]:
# Single-column frame: the largest-text-to-image area ratio per row.
largest_text_to_image_area_ratio_df = pd.DataFrame(
    data=largest_text_to_image_area_ratio,
    columns=['thumbnail_largest_text_to_image_area_ratio'],
)
largest_text_to_image_area_ratio_df

In [None]:
# Put the original data and all thumbnail feature frames side by side;
# the frames are aligned on their index.
final_df = pd.concat(
    [
        us_trending_df,
        image_dataframe,
        has_text_df,
        text_count_df,
        largest_text_to_image_area_ratio_df,
    ],
    axis=1,
)
final_df

In [None]:
# Spearman rank correlation between all numeric/boolean columns.
# The columns are selected explicitly because pandas >= 2.0 no longer drops
# non-numeric columns silently in DataFrame.corr and raises on string columns.
numeric_columns_df = final_df.select_dtypes(include=['number', 'bool'])
corrMatrix = numeric_columns_df.corr(method='spearman').round(4)

# Hide the strictly upper triangle (k=1 keeps the diagonal visible); the
# matrix is symmetric, so that half only repeats the lower one.
mask = np.triu(np.ones_like(corrMatrix, dtype=bool), k=1)

fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(corrMatrix, mask=mask, annot=True, fmt='g', ax=ax)
ax.set_title('Spearman correlation of numeric features')
plt.show()

In [None]:
# Persist the enriched table without the index column.
final_df.to_csv(path_or_buf="./us_trending_stage_2_tk.csv", index=False)

# Using ImageAI for object detection

A trained model file is required! Download it and put it into the `notebooks/models` directory. Download link: https://github.com/OlafenwaMoses/ImageAI/releases/download/essentials-v5/resnet50_coco_best_v2.1.0.h5

Loading pre-trained model

In [None]:
import tensorflow as tf
# Ask TensorFlow once for the visible GPUs, then report the count and the list.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))
print(gpu_devices)

In [None]:
from imageai.Detection import ObjectDetection

# Build the ImageAI object detector: choose the RetinaNet model type, point it
# at the pre-trained weights file, then load the model.
detector = ObjectDetection()
detector.setModelTypeAsRetinaNet()
# NOTE(review): this relative path is resolved against the kernel's working
# directory, while the note above mentions notebooks/models -- confirm that the
# two agree for the directory the notebook is started from.
detector.setModelPath("models/resnet50_coco_best_v2.1.0.h5")
detector.loadModel()

Detecting objects from thumbnails

In [None]:
import pickle
import tqdm

dir_name = "images"
image_detections = {}

# Run the detector on every distinct thumbnail. With output_type="array" the
# call returns a sequence whose element [1] is what gets stored per URL.
for url in tqdm.tqdm(unique_urls):
    image_path = f'./{dir_name}/{url.split("/")[-2]}.jpg'
    image_detections[url] = detector.detectObjectsFromImage(
        input_image=image_path,
        output_type="array",
    )[1]

# Cache the detections on disk so later cells can reload them.
with open("data/thumbnails_objects_dict.pkl", "wb") as pickle_file:
    pickle.dump(image_detections, pickle_file)

Loading detections from pickle file

In [None]:
import pickle

# Reload the cached detections written by the detection cell.
# Only unpickle files produced by this notebook: pickle.load can execute
# arbitrary code from an untrusted file.
with open("data/thumbnails_objects_dict.pkl", "rb") as pickle_file:
    loaded_detections = pickle.load(pickle_file)

image_detections = dict(loaded_detections)

Counting detected objects

In [None]:
import operator
from collections import defaultdict

# image_objects_detected: image key -> {object name -> occurrences in that image}
# counts:                 object name -> occurrences across all images
image_objects_detected = defaultdict(dict)
counts = dict()

for key, detections in image_detections.items():
    # Touch the entry up front so images with no detections still get an
    # (empty) record instead of being silently dropped from the output.
    per_image_counts = image_objects_detected[key]
    for obj in detections:
        name = obj['name']
        per_image_counts[name] = per_image_counts.get(name, 0) + 1
        counts[name] = counts.get(name, 0) + 1

# Ten most frequent object names as (name, count) pairs, most common first.
top_popular_objects = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)[:10]

print("counts", counts)
print()
print("10 most common objects:", top_popular_objects)

Creating vector representation and saving it to data/ in CSV format

In [None]:
import pandas as pd

# One row per image, one column per object name; objects absent from an image
# count as 0 and the counts are stored as 32-bit integers.
df = (
    pd.DataFrame.from_dict(image_objects_detected)
    .transpose()
    .fillna(0)
    .astype('int32')
)

df.to_csv("data/thumbnails_objects_vectors.csv")

Notatka: Może wagi wielkością bboxów

Może rozpoznawanie emocji na twarzach