# Eksploracja Danych - Projekt
Tomasz Kiljańczyk (136257)

Wojciech Lulek (136280)

In [None]:
import json

import numpy as np
import pandas as pd
import tqdm

# Path of the CSV with non-trending video details that this notebook loads.
NON_TRENDING_FILE = "./data/stage_5_non_trending_details.csv"

In [None]:
# Load the non-trending video details. Rows are split on '\n' only, so a file
# saved with CRLF line endings leaves a trailing '\r' on the last column
# (both on its header name and on every value).
us_non_trending_df = pd.read_csv(NON_TRENDING_FILE, lineterminator='\n', parse_dates=["publishedAt"])
us_non_trending_df.rename(columns={'description\r': 'description'}, inplace=True)
us_non_trending_df['description'] = us_non_trending_df['description'].str.rstrip('\r')

# Missing tags become empty strings. The result is assigned back to the column
# instead of calling an in-place method on `df['tags']`: that chained form
# modifies a temporary under pandas copy-on-write, and the `np.NaN` alias it
# relied on no longer exists in NumPy 2.x.
us_non_trending_df['tags'] = us_non_trending_df['tags'].fillna("")

us_non_trending_df["categoryId"] = us_non_trending_df["categoryId"].astype(int)

us_non_trending_df.head()

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts).
us_non_trending_df.info()

In [None]:
# Discard videos that have no views.
us_non_trending_df.drop(us_non_trending_df[us_non_trending_df['view_count'] == 0].index, inplace=True)
# Rebuild a contiguous 0..n-1 index. The thumbnail/face/object feature frames
# created further down are built from plain lists (so they are indexed 0..n-1)
# and are attached with pd.concat(axis=1), which aligns rows by index label.
# With the gaps left by the dropped rows, features would be attached to the
# wrong videos and extra all-NaN rows would appear.
us_non_trending_df.reset_index(drop=True, inplace=True)

## Text features

### Title and description

In [None]:
# Read the list of top words and add one boolean column per word telling
# whether the word occurs, as a plain substring, in the video's title or
# description (joined with a single space).
with open("./data/top_words.json", mode="r") as words_file:
    top_words: list = json.load(words_file)

titles_with_descriptions = us_non_trending_df['title'].str.cat(us_non_trending_df['description'], sep=' ')
for top_word in top_words:
    word_column = f'word_{top_word}'
    us_non_trending_df[word_column] = titles_with_descriptions.str.contains(top_word, regex=False)

us_non_trending_df.head()

### Tags

In [None]:
# Read the list of top tags and add one boolean column per tag telling whether
# the tag text occurs, as a plain substring, in the video's `tags` string.
with open("./data/top_tags.json", mode="r") as tags_file:
    top_tags: list = json.load(tags_file)

video_tags = us_non_trending_df['tags']
for top_tag in top_tags:
    tag_column = f'tag_{top_tag}'
    us_non_trending_df[tag_column] = video_tags.str.contains(top_tag, regex=False)

us_non_trending_df.head()

### Other text features

In [None]:
# Character class matching a single ASCII punctuation character.
PUNCTUATION_REGEX = r'[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]'


def _count_tags(tags):
    """Return the number of '|'-separated entries in *tags* (0 for an empty string)."""
    if tags == '':
        return 0
    return tags.count('|') + 1


titles = us_non_trending_df['title']
descriptions = us_non_trending_df['description']

# Title: length plus the share of capital letters and punctuation characters.
title_lengths = titles.str.len()
us_non_trending_df['title_length'] = title_lengths
us_non_trending_df['title_capital_letters'] = titles.str.count(r'[A-Z]') / title_lengths
us_non_trending_df['title_punctuation'] = titles.str.count(PUNCTUATION_REGEX) / title_lengths

# Description: number of '\r'/'\n' characters, total length and number of
# '://' occurrences (used as the link count).
us_non_trending_df['description_height'] = descriptions.str.count(r'[\r\n]')
us_non_trending_df['description_length'] = descriptions.str.len()
us_non_trending_df['description_link_count'] = descriptions.str.count(r'://')

us_non_trending_df['tag_count'] = us_non_trending_df['tags'].apply(_count_tags)

### Cleanup

In [None]:
# The raw text columns are no longer needed once the features above exist.
raw_text_columns = ["title", "tags", "description"]
us_non_trending_df.drop(columns=raw_text_columns, inplace=True)

In [None]:
# Per-row thumbnail URLs, and the distinct URLs among them so that each
# thumbnail is processed only once by the cells below.
urls = us_non_trending_df['thumbnail_link']
unique_urls = urls.unique()

## Date features

In [None]:
# Split the publication timestamp into hour, day of week and month.
# Note that `publishedAtDay` holds the weekday number (pandas: Monday=0),
# not the day of the month.
published_at = us_non_trending_df['publishedAt'].dt
us_non_trending_df['publishedAtHour'] = published_at.hour
us_non_trending_df['publishedAtDay'] = published_at.weekday
us_non_trending_df['publishedAtMonth'] = published_at.month

### Holidays

In [None]:
from pandas.tseries.holiday import USFederalHolidayCalendar

cal = USFederalHolidayCalendar()
min_date = us_non_trending_df['publishedAt'].min()
max_date = us_non_trending_df['publishedAt'].max()

# The calendar yields holidays as midnight timestamps, while `publishedAt`
# carries a time of day. Testing `timestamp in holidays` therefore only matched
# videos published exactly at 00:00:00. Compare calendar dates instead, and ask
# the calendar for whole days so a holiday on the first day is not cut off by
# the time-of-day part of `min_date`.
# NOTE(review): the date is taken in the timestamp's own clock (UTC if the
# column is UTC-aware), not converted to a US time zone - confirm this is the
# intended definition.
holidays = cal.holidays(start=min_date.date(), end=max_date.date())
published_dates = us_non_trending_df['publishedAt'].dt.date
us_non_trending_df['publishedAtHoliday'] = published_dates.isin(holidays.date)

us_non_trending_df[us_non_trending_df['publishedAtHoliday']].head()

## Thumbnail features

In [None]:
from workers import download_and_save
from multiprocessing import Pool
import os

# Make sure the local image directory exists.
# NOTE(review): presumably this is where download_and_save stores the files -
# confirm in workers.
images_dir = './images/'
if not os.path.exists(images_dir):
    os.makedirs(images_dir)

# Call download_and_save once per distinct thumbnail URL in a pool with one
# process per CPU. The loop only drives the iterator and the progress bar;
# the returned values are not used.
with Pool(processes=os.cpu_count()) as pool:
    downloads = pool.imap(download_and_save, unique_urls)
    for _ in tqdm.tqdm(downloads, total=unique_urls.size):
        pass


### Colors and text

#### Colors

In [None]:
from workers import extract_color_features

# Run extract_color_features on every distinct thumbnail URL in a process
# pool. Each result is used as a (url, features) pair below.
with Pool(processes=os.cpu_count()) as pool:
    color_jobs = pool.imap(extract_color_features, unique_urls)
    results = list(tqdm.tqdm(color_jobs, total=unique_urls.size))

color_features_dict = dict(results)

# Expand the per-URL features back to one entry per video row.
color_features = [color_features_dict[url] for url in urls]

image_dataframe = pd.DataFrame(color_features)
image_dataframe.head()

#### Text

In [None]:
from workers import detect_text

# Run detect_text on every distinct thumbnail URL in a process pool and key
# the results by URL (each result is consumed as a (url, features) pair).
with Pool(processes=os.cpu_count()) as pool:
    text_jobs = pool.imap(detect_text, unique_urls)
    results = list(tqdm.tqdm(text_jobs, total=unique_urls.size))

text_features_dict = dict(results)

In [None]:
# Per-row text features. Each entry of text_features_dict is indexed as
# (has_text, text_count, largest_text_to_image_area_ratio); URLs without an
# entry fall back to (False, 0, 0).
_no_text_features = (False, 0, 0)
_text_features_per_row = [text_features_dict.get(url, _no_text_features) for url in urls]

has_text = [features[0] for features in _text_features_per_row]
text_count = [features[1] for features in _text_features_per_row]
largest_text_to_image_area_ratio = [features[2] for features in _text_features_per_row]

In [None]:
# One-column frame holding the has-text flag of each row's thumbnail.
has_text_columns = ['thumbnail_has_text']
has_text_df = pd.DataFrame(data=has_text, columns=has_text_columns)
has_text_df.head()

In [None]:
# One-column frame holding the text count of each row's thumbnail.
text_count_columns = ['thumbnail_text_count']
text_count_df = pd.DataFrame(data=text_count, columns=text_count_columns)
text_count_df.head()

In [None]:
# One-column frame holding, per row, the largest-text-to-image area ratio of
# the thumbnail.
largest_text_ratio_columns = ['thumbnail_largest_text_to_image_area_ratio']
largest_text_to_image_area_ratio_df = pd.DataFrame(
    data=largest_text_to_image_area_ratio,
    columns=largest_text_ratio_columns,
)
largest_text_to_image_area_ratio_df.head()

#### Merge

In [None]:
# Attach the colour and text features as new columns, then drop the URL column.
# NOTE(review): pd.concat(axis=1) aligns rows by index label, and the feature
# frames are built from plain lists (index 0..n-1), so us_non_trending_df must
# carry the same 0..n-1 index for the rows to line up.
thumbnail_feature_frames = [
    image_dataframe,
    has_text_df,
    text_count_df,
    largest_text_to_image_area_ratio_df,
]
us_non_trending_df = pd.concat([us_non_trending_df, *thumbnail_feature_frames], axis=1)
us_non_trending_df.drop(columns=['thumbnail_link'], inplace=True)
us_non_trending_df.head()

### Faces

In [None]:
from workers import extract_face_features

# Run extract_face_features on every distinct thumbnail URL, sequentially in
# this process, with a progress bar. Each result is used as a (url, features)
# pair below.
face_jobs = (extract_face_features(url) for url in unique_urls)
results = list(tqdm.tqdm(face_jobs, total=unique_urls.size))

face_features_dict = dict(results)

# Expand the per-URL features back to one entry per video row.
face_features = [face_features_dict[url] for url in urls]
face_features_df = pd.DataFrame(face_features)
face_features_df

In [None]:
# Attach the face features as new columns (rows are matched by index label).
us_non_trending_df = pd.concat((us_non_trending_df, face_features_df), axis="columns")

### Objects

In [None]:
from imageai.Detection import ObjectDetection

# Configure an ImageAI object detector as RetinaNet and load its weights from
# the local model file; `detector` is used by the detection loop below.
detector = ObjectDetection()
detector.setModelTypeAsRetinaNet()
detector.setModelPath("models/resnet50_coco_best_v2.1.0.h5")
detector.loadModel()

In [None]:
from workers import file_path_from_url
import tqdm

# Detect objects on every distinct thumbnail and keep, per URL, element [1] of
# the detector's output (consumed later as a list of dicts with a 'name' key).
image_detections = {}

for url in tqdm.tqdm(unique_urls):
    detection_output = detector.detectObjectsFromImage(
        input_image=file_path_from_url(url),
        output_type="array",
    )
    image_detections[url] = detection_output[1]

In [None]:
from collections import defaultdict, Counter

# image_objects_detected: url -> {object name -> occurrences on that image}
# cnt: object name -> occurrences over all images
image_objects_detected = defaultdict(dict)
cnt = Counter()

for url, detections in image_detections.items():
    for detection in detections:
        label = detection['name']
        per_image_counts = image_objects_detected[url]
        per_image_counts[label] = per_image_counts.get(label, 0) + 1
        cnt[label] += 1

In [None]:
import pandas as pd

# One dict of object counts per video row; URLs without any detection yield an
# empty dict through the defaultdict.
image_objects_detected_full = [image_objects_detected[url] for url in urls]

# One column per object name; objects absent from an image count as 0.
objects_df = pd.DataFrame(image_objects_detected_full).fillna(0).astype('int32')
objects_df.head()

In [None]:
# Attach the object counts as new columns (rows are matched by index label).
us_non_trending_df = pd.concat((us_non_trending_df, objects_df), axis="columns")
us_non_trending_df.head()

## Save results

In [None]:
# Write the final feature table without the index column, then show its shape.
output_path = "./data/stage_5_us_non_trending.csv"
us_non_trending_df.to_csv(output_path, index=False)
us_non_trending_df.shape