# Let's use SBER AutoML lib 
> For this alone you could give me an upvote 🌟

> And feel free to comment with your opinion or anything else you want :)

> This will motivate me to do more experiments :)

# ChangeLog
* v1 - install, import, quick EDA, train, predict, submit, Error with offline lib
* v2 - changed offline lib 
* v3 - forgot to pip install the offline packages
* v4 - look for duplicates in image and pawpularity
* v5 - added meta features from [here](https://www.kaggle.com/nexus6roy/extract-extra-data-from-image) and removed duplicates using the approach from [here](https://www.kaggle.com/yingpengchen/find-duplicate-images)
* v6 - hide large output, remove meta features, change the print from previous notebook

In [None]:
# if we have internet connection
#!pip install lightautoml -q

# else
!tar xvfz ../input/lightautoml-tar/lightautoml.tgz > /dev/null

In [None]:
#Install the offline packages into our kernel
!pip install lightautoml --no-index --find-links=file:./lightautoml/  -q

# Standard libs

In [None]:
# Standard python libraries
import os
import time
import re

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Imports from LightAutoML package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from pandas_profiling import ProfileReport

import cv2
import datetime
import gc
import glob
import imagehash
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import sys
import tqdm
import PIL
from keras.preprocessing import image

# Data

In [None]:
# Competition input directory; the trailing slash is kept because file names
# are appended to it directly.
DATA_DIR = '../input/petfinder-pawpularity-score/'

# Tabular metadata for the training and test images.
train_data = pd.read_csv(f'{DATA_DIR}train.csv')
test_data = pd.read_csv(f'{DATA_DIR}test.csv')

# Two independent reads of the sample submission; `submission` is the frame
# that receives the predictions later in the notebook.
sample_submission = pd.read_csv(f'{DATA_DIR}sample_submission.csv')
submission = pd.read_csv(f'{DATA_DIR}sample_submission.csv')

# Test table as a plain NumPy array.
X_test = test_data.to_numpy()

# Look for Duplicates and remove
> Thanks to [this notebook](https://www.kaggle.com/yingpengchen/find-duplicate-images)

In [None]:
def images_find_duplicates(image_files, threshold=0.9):
    """
    Find near-duplicate images by comparing their image hashes.

    Every image is hashed with four imagehash functions (average_hash, phash,
    dhash, whash). The resulting bit arrays are flattened and concatenated
    into one boolean vector per image, and the similarity of two images is the
    fraction of bit positions on which their vectors agree.

    References: https://www.kaggle.com/appian/let-s-find-out-duplicate-images-with-imagehash

    Args:
        image_files: Iterable of image file paths. The paths also serve as
            the image identifiers in the returned objects.
        threshold: Two images are reported as duplicates when their
            similarity is strictly greater than this value.

    Returns:
        A tuple ``(df_pairs, group_list)``:
            df_pairs: DataFrame with columns ``image1``, ``image2`` and
                ``similarity``, one row per duplicate pair. ``image1`` always
                comes before ``image2`` in ``image_files``.
            group_list: List of lists; each inner list holds the paths of
                images that are linked to each other through duplicate pairs.
    """
    image_ids = list(image_files)
    if not image_ids:
        # Nothing to hash or compare; return the same empty structures the
        # regular path would produce.
        return pd.DataFrame({'image1': [], 'image2': [], 'similarity': []}), []

    funcs = [imagehash.average_hash, imagehash.phash, imagehash.dhash, imagehash.whash]
    hashes = []
    for file in tqdm.tqdm(image_ids):
        # The context manager releases the file handle as soon as the image
        # has been hashed instead of leaving it to garbage collection.
        with PIL.Image.open(file) as image:
            hashes.append(np.array([f(image).hash for f in funcs]).reshape(-1))
    hashes_all = np.array(hashes)
    # Number of hash bits per image, derived from the data rather than
    # hard-coded (4 hashes x 8 x 8 bits = 256 with the default hash size).
    n_bits = hashes_all.shape[1]

    # Comparisons without Pytorch: one row of similarities per image.
    sim_list = [
        np.sum(row == hashes_all, axis=1) / n_bits
        for row in tqdm.tqdm(hashes_all)
    ]

    # nxn-matrix of similarities (n = # of images), upper triangular matrix,
    # so every pair is reported once and self-comparisons are excluded.
    similarities = np.triu(np.array(sim_list), 1)

    first_idx, second_idx = np.where(similarities > threshold)
    df_pairs = pd.DataFrame({
        'image1': [image_ids[i] for i in first_idx],
        'image2': [image_ids[i] for i in second_idx],
        'similarity': similarities[first_idx, second_idx].tolist(),
    })

    # Assign a group id to every image that takes part in a duplicate pair;
    # 0 means "not in any group yet".
    idx_group = np.zeros(len(image_ids), dtype=int)
    group_id = 1
    for i1, i2 in zip(first_idx, second_idx):
        g1, g2 = idx_group[i1], idx_group[i2]
        if g1 == 0 and g2 == 0:
            # Neither image is grouped yet: open a new group.
            idx_group[i1] = idx_group[i2] = group_id
            group_id += 1
        elif g2 == 0:
            idx_group[i2] = g1
        elif g1 == 0:
            idx_group[i1] = g2
        elif g1 != g2:
            # Both images already belong to different groups: merge the two
            # groups under the smaller id.
            idx_group[idx_group == max(g1, g2)] = min(g1, g2)

    # Ids run from 1 to group_id - 1; ids emptied by a merge are skipped.
    group_list = []
    for gid in range(1, group_id):
        members = np.flatnonzero(idx_group == gid)
        if members.size:
            group_list.append([image_ids[j] for j in members])

    return df_pairs, group_list

In [None]:
# Path of every training image, built from its Id. The loop variable is named
# `image_id` so it does not overwrite the imported `image` module, and
# os.path.join avoids the doubled slash caused by DATA_DIR's trailing '/'.
train_files = [
    os.path.join(DATA_DIR, 'train', f'{image_id}.jpg')
    for image_id in train_data['Id']
]
print(f'Number of Petfinder training files: {len(train_files)}')

total_files = list(train_files)

df_pairs, group_list = images_find_duplicates(total_files, threshold=0.90)

print(f'\nNumber of duplicate pairs: {len(df_pairs)}')

# For each duplicate pair the first image is dropped and the second is kept.
# The Id is the file name without its extension; the set removes repeats.
ids_to_delete = list({
    os.path.splitext(os.path.basename(path))[0]
    for path in df_pairs['image1']
})
print("Size of DF before deleting duplicates",len(train_data))
print("Duplicates count = ", len(ids_to_delete))

# One vectorised filter instead of rebuilding the frame once per Id.
train_data = train_data[~train_data['Id'].isin(ids_to_delete)]
print("Size of DF after removing Duplicates",len(train_data))

# Quick EDA

In [None]:
%%time
# Build a profiling report over the training table (duplicates already removed).
profile = ProfileReport(train_data, title="Pandas Profiling Report")
# Bare expression: as the last statement of the cell it becomes the cell's output.
profile

In [None]:
# Also save the profiling report to an HTML file in the working directory.
profile.to_file("PetFinder Meta features.html")

In [None]:
# Show five randomly chosen rows of the training table.
train_data.sample(5)

In [None]:
# Target vector: the Pawpularity score of every training row.
y = train_data['Pawpularity'].to_numpy()
# Feature matrix: every remaining column (the Id column is still included).
X = train_data.drop(columns=['Pawpularity']).to_numpy()

# Split

In [None]:
%%time

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
tr_data, valid_data = train_test_split(
    train_data,
    test_size=0.2,
    random_state=42,
)

# Create Task object
> From this point on we are ready to build the model that predicts the Pawpularity target variable. First of all, we set up the type of model we need using the LightAutoML Task class object, where the valid values are:

* ‘binary’ for binary classification
* ‘reg’ for regression and
* ‘multiclass’ for multiclass classification

In [None]:
# Regression task that uses mean squared error both as the training loss and
# as the evaluation metric (lower is better).
task = Task(
    'reg',
    loss='mse',
    metric='mse',
    greater_is_better=False,
)

In [None]:
%%time

# Column roles passed to LightAutoML. The identifier column of this dataset is
# named 'Id' (capital I), so it has to be spelled that way to be dropped.
# NOTE(review): 'format' looks like a leftover from the meta features that were
# removed in v6 - confirm that such a column still exists.
roles = {'target': 'Pawpularity', 'drop': 'Id', 'category': 'format'}

## Train on the dataset
> Now we know which model to use to get good results on the dataset

In [None]:
%%time
# AutoML preset restricted to a single level made of a linear model, a tuned
# LightGBM and a tuned CatBoost.
automl = TabularUtilizedAutoML(
    task=task,
    timeout=3 * 60 * 60,  # 3 hours
    cpu_limit=4,  # Optimal for Kaggle kernels
    general_params={'use_algos': [['linear_l2', 'lgb_tuned', 'cb_tuned']]},
)

In [None]:
%%time
# Fit the AutoML pipeline on the training split with the roles defined above.
# The returned predictions are scored further down as the out-of-fold (OOF) estimates.
oof_pred = automl.fit_predict(tr_data, roles = roles)

# Check Valid

In [None]:
# Predict on the held-out validation split with the fitted pipeline.
valid_pred = automl.predict(valid_data)

# OOF score

In [None]:
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error is deprecated in recent scikit-learn releases.
oof_rmse = np.sqrt(mean_squared_error(tr_data['Pawpularity'].values, oof_pred.data[:, 0]))
val_rmse = np.sqrt(mean_squared_error(valid_data['Pawpularity'].values, valid_pred.data[:, 0]))
print(f'OOF RMSE: {oof_rmse}')
print(f'VAL RMSE: {val_rmse}')

# Test prediction

In [None]:
# Predict on the test table with the fitted pipeline.
test_pred = automl.predict(test_data)

# Submission

In [None]:
# Write the predictions as floats. The previous `.astype(int)` cast truncated
# every value toward zero, which throws away precision and shifts the whole
# submission downward for an error-based score.
submission['Pawpularity'] = test_pred.data[:, 0]
submission.to_csv('submission.csv', index=False)

# EOF