<span style="color: #2486c7; font-family: Segoe UI; font-size: 1.5em; font-weight: 300; font-size: 32px">PetFinder - EDA + Resized Training Images 🛠️</span>

<div style="font-family: Segoe UI; font-size: 1.5em; font-weight: 300; font-size: 16px">
Currently, PetFinder.my uses a basic Cuteness Meter to rank pet photos. It analyzes picture composition and other factors compared to the performance of thousands of pet profiles. While this basic tool is helpful, it's still in an experimental stage and the algorithm could be improved.
<br><br>
In this competition, we’ll analyze raw images and metadata to <strong>predict the “Pawpularity” of pet photos</strong>. We'll train and test our model on PetFinder.my's thousands of pet profiles. Winning versions will offer accurate recommendations that will improve animal welfare.
</div>

In [None]:
%%sh
pip install -q rich dabl

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import cv2
import dabl
import shutil
import glob
from tqdm.notebook import tqdm
from rich import print as _pprint
from PIL import Image, ImageChops
from joblib import Parallel, delayed

import warnings
warnings.simplefilter('ignore')

In [None]:
def cprint(string:str, end="\n"):
    """
    Print `string` via rich's print, wrapped in [black]...[/black] markup.

    `string` may itself contain rich markup tags; `end` is forwarded
    unchanged to the underlying print call.
    """
    wrapped = "[black]{}[/black]".format(string)
    _pprint(wrapped, end=end)

In [None]:
# Load the three competition tables in one go: train, test and the sample submission.
train_file, test_file, sample_sub = map(
    pd.read_csv,
    (
        "../input/petfinder-pawpularity-score/train.csv",
        "../input/petfinder-pawpularity-score/test.csv",
        "../input/petfinder-pawpularity-score/sample_submission.csv",
    ),
)

In [None]:
# Preview the first rows of the training table.
train_file.head()

In [None]:
# Summary statistics for the training table's columns.
train_file.describe()

In [None]:
# Preview the first rows of the test table.
test_file.head()

In [None]:
# Summary statistics for the test table's columns.
test_file.describe()

<span style="color: #2130b8; font-family: Segoe UI; font-size: 1.5em; font-weight: 300; font-size: 28px">EDA + Data Preprocessing</span>

In [None]:
# Distribution of the target: density histogram plus KDE, with the mean marked
# by a dashed vertical line.
pawpularity = train_file['Pawpularity']
mean_score = pawpularity.mean()

plt.figure(figsize=(7, 7))
plt.title("Pawpularity Distribution")
sns.histplot(pawpularity, stat='density')
sns.kdeplot(pawpularity, color='blue')
plt.axvline(mean_score, color='red', linestyle='--', linewidth=0.8)

# Read the y-limits only after everything is drawn so the label lands just
# below the top of the axes, slightly to the right of the mean line.
min_ylim, max_ylim = plt.ylim()
plt.text(mean_score*1.05, max_ylim*0.96, 'Mean (μ): {:.2f}'.format(mean_score))
plt.xlabel("Pawpularity Score")
plt.ylabel("Density")
plt.show()

In [None]:
# Headline statistics of the target column: max, min and mean.
scores = train_file['Pawpularity']
cprint(f"[bold]Max Pawpularity Score:[/bold] [green]{scores.max()}[/green]")
cprint(f"[bold]Min Pawpularity Score:[/bold] [red]{scores.min()}[/red]")
cprint(f"[bold]Average Pawpularity Score:[/bold] [blue]{scores.mean():.2f}[/blue]")

In [None]:
# Automatic dabl plots of the features against the Pawpularity target.
# The Id column is dropped first (presumably it is only a row identifier).
plt.style.use('ggplot')
tr = train_file.drop(columns=['Id'])
ret = dabl.plot(tr, target_col='Pawpularity')

In [None]:
# Collect the paths of every JPEG in the train and test image folders.
train_file_names = glob.glob("../input/petfinder-pawpularity-score/train/*.jpg")
test_file_names = glob.glob("../input/petfinder-pawpularity-score/test/*.jpg")

# Report how many images each split contains.
train_count, test_count = len(train_file_names), len(test_file_names)
cprint(f"Train Images Count: [green]{train_count}[/green]")
cprint(f"Test Images Count: [green]{test_count}[/green]")

<span style="color: #2130b8; font-family: Segoe UI; font-size: 1.5em; font-weight: 300; font-size: 28px">Fast Image Resizing</span>

In [None]:
%%sh
# Output folders for the resized images.
# -p keeps the cell idempotent: re-running it does not fail when the
# folders already exist.
mkdir -p "/kaggle/working/train_224/"
mkdir -p "/kaggle/working/test_224/"

mkdir -p "/kaggle/working/train_512/"
mkdir -p "/kaggle/working/test_512/"

In [None]:
def resizeImage(imagePath, outputFolder, resize=224):
    """
    Resize one image to a square with cv2 and save it under the same file name.

    Parameters
    ----------
    imagePath : str
        Path of the image to read.
    outputFolder : str
        Directory the resized copy is written to; the original base name is kept.
    resize : int, default 224
        Side length in pixels. The output is `resize` x `resize`, so the
        aspect ratio of the input is not preserved.

    Raises
    ------
    OSError
        If the image cannot be read, or the resized image cannot be written.
    """
    img = cv2.imread(imagePath)
    if img is None:
        # cv2.imread signals a missing/unreadable file by returning None.
        raise OSError(f"Could not read image: {imagePath}")

    # The channels are deliberately NOT reversed here: cv2.imread yields BGR
    # and cv2.imwrite expects BGR, so flipping to RGB before writing would
    # save every image with its red and blue channels swapped.
    img = cv2.resize(img, (resize, resize))

    imgPath = os.path.join(outputFolder, os.path.basename(imagePath))
    if not cv2.imwrite(imgPath, img):
        # cv2.imwrite reports failure through its return value.
        raise OSError(f"Could not write image: {imgPath}")

In [None]:
# Resize every train and test image to 224 x 224 px, spreading the work over
# 16 joblib workers; tqdm shows one progress bar per split.
for image_paths, out_dir in (
    (train_file_names, "/kaggle/working/train_224"),
    (test_file_names, "/kaggle/working/test_224"),
):
    _ = Parallel(n_jobs=16, verbose=0)(
        delayed(resizeImage)(path, out_dir) for path in tqdm(image_paths)
    )

In [None]:
# Same resize pass at 512 x 512 px, again over 16 joblib workers; progress is
# reported by joblib itself (verbose=1) rather than by tqdm.
for image_paths, out_dir in (
    (train_file_names, "/kaggle/working/train_512"),
    (test_file_names, "/kaggle/working/test_512"),
):
    _ = Parallel(n_jobs=16, verbose=1)(
        delayed(resizeImage)(path, out_dir, 512) for path in image_paths
    )

In [None]:
# Zip each resized-image folder into an archive of the same name next to it.
archive_paths = [
    shutil.make_archive(folder, 'zip', folder)
    for folder in (
        "/kaggle/working/train_512",
        "/kaggle/working/test_512",
        "/kaggle/working/train_224",
        "/kaggle/working/test_224",
    )
]
# Display the path of the last archive created as the cell's output.
archive_paths[-1]

In [None]:
%%sh
# Delete the uncompressed image folders now that they have been archived.
for dir in train_224 test_224 train_512 test_512; do
    rm -rf "/kaggle/working/${dir}/"
done