In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import os
# Preview the competition data: print up to five file paths from every input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    preview = [os.path.join(dirname, filename) for filename in filenames[:5]]
    for path in preview:
        print(path)

# Folder with the training images; it is joined with '<Id>.jpg' when images are plotted.
TRAIN_IMG_PASS = '../input/petfinder-pawpularity-score/train'

### Problem statement
Predict the pawpularity score of pet images.

In [None]:
# Load the competition tables: train.csv becomes df_train, test.csv becomes df_test.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/petfinder-pawpularity-score/train.csv',
        '/kaggle/input/petfinder-pawpularity-score/test.csv',
    )
)

# Check distribution of pawpularity scores

In [None]:
# Histogram of the target, with its mean and median marked by dashed vertical lines.
fig, ax = plt.subplots(figsize=(15, 5), dpi=150)
sns.histplot(data=df_train, x='Pawpularity', bins=100, ax=ax)

pawpularity = df_train['Pawpularity']
for value, colour, name in (
    (pawpularity.mean(), 'red', 'Mean'),
    (pawpularity.median(), 'blue', 'Median'),
):
    ax.axvline(value, ls='--', c=colour, lw=3, label=name)

ax.set_title('Pawpularity Scores', fontsize=20, fontweight='bold')
ax.legend()
plt.show()

In [None]:
def plot_img(min_score, max_score):
    """Show up to five random training images with min_score <= Pawpularity < max_score.

    NOTE: a later cell of this notebook defines ``plot_img`` again; once that
    cell has run, its definition replaces this one.

    Parameters
    ----------
    min_score : int or float
        Inclusive lower bound of the Pawpularity range.
    max_score : int or float
        Exclusive upper bound of the Pawpularity range.

    Returns
    -------
    None
        The images are drawn with matplotlib; nothing is returned.
    """
    # The training table is loaded as ``df_train``; the name ``train`` used
    # here before was never defined, so every call raised NameError.
    in_range = df_train.query(f'{min_score} <= Pawpularity < {max_score}')
    if in_range.empty:
        print(f'No training images with {min_score} <= Pawpularity < {max_score}')
        return

    # DataFrame.sample raises ValueError when asked for more rows than exist.
    n_shown = min(5, len(in_range))
    picked = in_range.sample(n=n_shown).reset_index(drop=True)

    plt.figure(figsize=(30, 30))
    for position, row in picked.iterrows():
        image = plt.imread(os.path.join(TRAIN_IMG_PASS, row['Id'] + '.jpg'))
        # Always use a five-column grid so images keep the same size.
        plt.subplot(1, 5, position + 1)
        plt.title(f"Pawpularity:{row['Pawpularity']}", fontsize=18)
        plt.imshow(image)
    plt.show()

# Check images 🐶🐱
Let's have a look at what type of data we are actually working on.

# ANNND THIS is a lovely PUGG

In [None]:
# Read one specific training image from disk and display it.
pug_path = '/kaggle/input/petfinder-pawpularity-score/train/1ade125ab98dcbe2963b5a92c0dd0416.jpg'
img = plt.imread(pug_path)
fig, ax = plt.subplots()
ax.imshow(img)
plt.show()

In [None]:
def plot_img(min_score, max_score, n_samples=5, random_state=None):
    """Show random training images with min_score <= Pawpularity < max_score.

    Parameters
    ----------
    min_score : int or float
        Inclusive lower bound of the Pawpularity range.
    max_score : int or float
        Exclusive upper bound of the Pawpularity range.
    n_samples : int, default 5
        Maximum number of images to draw; fewer are shown when the range
        contains fewer rows.
    random_state : int or None, default None
        Forwarded to ``DataFrame.sample``; pass an integer to draw the same
        images on every run.

    Returns
    -------
    None
        The images are drawn with matplotlib; nothing is returned.
    """
    in_range = df_train.query(f'{min_score} <= Pawpularity < {max_score}')
    if in_range.empty:
        print(f'No training images with {min_score} <= Pawpularity < {max_score}')
        return

    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # so never request more than the range actually holds.
    n_shown = min(n_samples, len(in_range))
    picked = in_range.sample(n=n_shown, random_state=random_state).reset_index(drop=True)

    figure = plt.figure(figsize=(30, 30))
    for position, row in picked.iterrows():
        image = plt.imread(os.path.join(TRAIN_IMG_PASS, row['Id'] + '.jpg'))
        # Keep an n_samples-wide grid so image size does not depend on how
        # many rows were found.
        axis = figure.add_subplot(1, n_samples, position + 1)
        axis.set_title(f"Pawpularity:{row['Pawpularity']}", fontsize=18)
        axis.imshow(image)
    plt.show()

# Understanding different ranges of `Pawpularity` representatives

## Low Pawpularity (1-10)

In [None]:
# plot_img excludes its upper bound, so pass 11 to cover scores 1 through 10.
plot_img(1, 11)

## Intermediate Pawpularity (20 - 40)

In [None]:
# Sample images whose score satisfies 20 <= Pawpularity < 40.
plot_img(min_score=20, max_score=40)

## Upper Intermediate Pawpularity (40 - 60)

In [None]:
# Sample images whose score satisfies 40 <= Pawpularity < 60.
plot_img(min_score=40, max_score=60)

## High  Pawpularity (60 - 90)

In [None]:
# Sample images whose score satisfies 60 <= Pawpularity < 90.
plot_img(min_score=60, max_score=90)

## Maximum Pawpularity (90 - 100)

In [None]:
# plot_img excludes its upper bound, so pass 101 to include the top score of 100.
plot_img(90, 101)

### Conclusion

From my perspective, some of the pets with low popularity look quite nice, while some pets with high popularity are not especially attractive.
So, from a human perspective, it is hard to tell just by looking at a picture whether the pet is going to be popular. One more observation: popularity might be driven mostly by how unusual the picture is. Once users see something unusual in a picture, the click rate may grow regardless of the pet's appearance.

# Making first predictions based on metadata & AutoML

In [None]:
!pip install flaml --quiet
import flaml
from flaml import AutoML
import warnings
warnings.filterwarnings('ignore')

### Bin the pawpularity score into ten equal-width bins covering 0 to 100
Labels are the bin indices, ranging from 0 to 9.

In [None]:
# Feature columns are taken by position: columns 1 through 12 of the training table.
feature_cols = df_train.columns[1:13]
train_set, test_set = (
    np.array(frame[feature_cols]) for frame in (df_train, df_test)
)

# Eleven edges 0, 10, ..., 100 define ten score bins, labelled 0..9.
score_bin_edges = list(range(0, 110, 10))
score_bin_labels = list(range(0, 10))
df_train['label'] = pd.cut(
    np.array(df_train['Pawpularity']),
    bins=score_bin_edges,
    labels=score_bin_labels,
)

In [None]:
# Preview the first five rows, now including the new 'label' column.
df_train.head(n=5)

### Here we will try not to predict the target variable itself, but to classify it into the bins created previously

In [None]:
# paw_values[i] is the score reported for predicted bin i: the mean Pawpularity
# of the training rows labelled i.
paw_values = []
for bin_id in range(0, 10):
    bin_scores = df_train.loc[df_train['label'] == bin_id, 'Pawpularity']
    if len(bin_scores):
        paw_values.append(bin_scores.mean())
    else:
        # An empty bin has no mean (it would be NaN and end up in the
        # submission); fall back to the midpoint of the bin's integer scores
        # 10*i + 1 .. 10*i + 10.
        paw_values.append(10 * bin_id + 5.5)

# Classification target: the bin label of every training row.
ytrain = np.array(df_train['label'])

### Training Classification Model (classify bins 0 .. 9)

In [None]:
# Search configuration; the whole dict is forwarded to AutoML.fit as keyword arguments.
settings = dict(
    estimator_list=['xgboost', 'rf', 'lgbm', 'catboost'],
    log_file_name='pp.log',
    task='classification',
    metric='accuracy',
    time_budget=360,
    seed=1000,
)

# Fit the bin classifier on the metadata features and bin labels.
clf = AutoML()
clf.fit(train_set, ytrain, **settings)

In [None]:
# Predict a bin label for every row of the test feature matrix.
pred = clf.predict(test_set)

### Iterate through the predictions and look up each final pawpularity score in the paw values list
Converting categories (bins) back to scores

In [None]:
# Translate each predicted bin label into that bin's entry of paw_values.
pred_set = []
for bin_label in pred:
    pred_set.append(paw_values[bin_label])

### Making Submission

In [None]:
# Assemble the submission: one row per test Id with its predicted score.
submission_columns = {
    "Id": list(df_test["Id"]),
    "Pawpularity": pred_set,
}
pred_df = pd.DataFrame(submission_columns)

# Write the file without the index column, then preview the first rows.
pred_df.to_csv('./submission.csv', index=False)
pred_df.head(n=5)