# **About the author: I'm a beginner in this field, learning and discovering the enjoyment of data science.**
### Note 1: This notebook is a copy of existing work, with some edits and experiments added for my own understanding and learning.
### Note 2: If this notebook is useful to you in any way, please upvote or leave a comment of thanks on the notebooks listed in the Reference section.

# Import

In [None]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
from timm import create_model
import pandas as pd
from torch import tensor
import numpy as np

In [None]:
from fastai.vision.all import *

# Constants

In [None]:
# Mini-batch size shared by every DataLoaders object built in this notebook.
BATCH_SIZE = 32
# Seed the random number generators up front (fastai helper) so runs are repeatable.
set_seed(365, reproducible=True)

# Datasets

In [None]:
# Root of the input dataset: train.csv, test.csv, sample_submission.csv and the
# train/ and test/ image folders are all read relative to this path.
dataset_path = Path('../input/petfinder-pawpularity-score-clean/')
# List the directory contents to confirm the dataset is available.
dataset_path.ls()

In [None]:
# Load the training metadata (one row per image) and preview the first rows.
train_df = pd.read_csv(dataset_path/'train.csv')
train_df.head()

In [None]:
# Derive each image's file path from its Id, drop the Id column, then shuffle the rows.
train_df = (
    train_df
    .assign(path=train_df['Id'].map(lambda image_id: str(dataset_path/'train'/image_id)+'.jpg'))
    .drop(columns=['Id'])
    .sample(frac=1)  # frac=1 keeps every row, only the order changes
    .reset_index(drop=True)
)
train_df.head()

In [None]:
# Number of rows in the training frame (one row per image).
len_df = train_df.shape[0]
print(f"There are {len_df} images")

In [None]:
# Plot the distribution of the raw Pawpularity target and report its mean,
# median and standard deviation.
train_df['Pawpularity'].hist(figsize = (10, 5))
print(f"The mean Pawpularity score is {train_df['Pawpularity'].mean()}")
print(f"The median Pawpularity score is {train_df['Pawpularity'].median()}")
print(f"The standard deviation of the Pawpularity score is {train_df['Pawpularity'].std()}")

In [None]:
# Count the distinct Pawpularity values that occur in the training set.
print(f"There are {len(train_df['Pawpularity'].unique())} unique values of Pawpularity score")

In [None]:
# Scale the target by 1/100; 'norm_score' is the label column the DataLoaders read.
train_df['norm_score'] = train_df['Pawpularity'].div(100)
train_df['norm_score']

In [None]:
# Open one training image (the row with index label 1) and print its dimensions.
im = Image.open(train_df['path'][1])
width, height = im.size  # size unpacks as (width, height)
print(width,height)

In [None]:
# Render the image opened in the previous cell.
im

### Swin Transformer documentation: [click here](https://github.com/microsoft/Swin-Transformer)

In [None]:
if not os.path.exists('/root/.cache/torch/hub/checkpoints/'):
    os.makedirs('/root/.cache/torch/hub/checkpoints/')
!cp '../input/swin-transformer/swin_large_patch4_window7_224_22kto1k.pth' '/root/.cache/torch/hub/checkpoints/swin_large_patch4_window7_224_22kto1k.pth'

In [None]:
# Re-seed every RNG right before the folds and the model are built.
seed = 365
set_seed(seed, reproducible=True)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
# use_deterministic_algorithms is a function and must be called; assigning True
# to it only overwrote the function and enabled nothing.
# warn_only=True (PyTorch >= 1.11) makes ops without a deterministic
# implementation emit a warning instead of raising mid-training.
torch.use_deterministic_algorithms(True, warn_only=True)

# Find optimal number of bins

### Source for the bin-count formulas (the linked page covers Sturges' rule; the code below uses the Rice rule): [click here](https://www.statology.org/sturges-rule/)

In [None]:
import math
# Rice rule: number of bins = ceil(2 * n^(1/3)), with n the number of samples.
rice_estimate = 2 * len(train_df) ** (1. / 3)
num_bins = int(np.ceil(rice_estimate))
num_bins

In [None]:
# Bucket the normalised score into num_bins intervals; labels=False stores the
# integer bin index rather than an interval label.
train_df['bins'] = pd.cut(train_df['norm_score'], bins=num_bins, labels=False)
# Histogram of bin membership; the 'bins' column is what the stratified fold split uses.
train_df['bins'].hist()

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

N_FOLDS = 10

# -1 marks rows that have not been assigned to a fold yet.
train_df['fold'] = -1
# Look the column position up by name: indexing with -1 silently writes into the
# wrong column whenever 'fold' is not the last column (e.g. on a cell re-run
# after more columns have been added).
fold_col = train_df.columns.get_loc('fold')

strat_kfold = StratifiedKFold(n_splits=N_FOLDS, random_state=seed, shuffle=True)
# split() yields (train_indices, held_out_indices); each row is tagged with the
# fold in which it is held out, stratified on the score bins.
for i, (_, val_index) in enumerate(strat_kfold.split(train_df.index, train_df['bins'])):
    train_df.iloc[val_index, fold_col] = i

train_df['fold'] = train_df['fold'].astype('int')

# Bar chart of rows per fold to check the folds are balanced.
train_df.fold.value_counts().plot.bar()

In [None]:
# Preview the rows that are held out in fold 0.
train_df.loc[train_df['fold'] == 0].head()

In [None]:
# Score-bin distribution inside fold 0 (compare with fold 1 to check stratification).
train_df.loc[train_df['fold'] == 0, 'bins'].value_counts()

In [None]:
# Score-bin distribution inside fold 1 (compare with fold 0 to check stratification).
train_df.loc[train_df['fold'] == 1, 'bins'].value_counts()

In [None]:
def petfinder_rmse(input,target):
    """RMSE between sigmoid(input) and target, scaled by 100.

    Args:
        input: raw model outputs (logits); flattened to 1-D before use.
        target: normalised scores; flattened as well so that an (N, 1) target
            cannot broadcast against the (N,) predictions into an (N, N) grid.

    Returns:
        A scalar tensor: 100 * sqrt(mean((sigmoid(input) - target) ** 2)).
    """
    # torch.sigmoid replaces the deprecated F.sigmoid.
    preds = torch.sigmoid(input.flatten())
    return 100*torch.sqrt(F.mse_loss(preds, target.flatten()))

# Data loading

In [None]:
def get_data(fold):
    """Build the fastai DataLoaders for one cross-validation split.

    Rows of the global ``train_df`` whose ``fold`` column equals ``fold`` are
    flagged as the validation set; every other row is used for training.

    Args:
        fold: fold number to hold out for validation.

    Returns:
        The ``ImageDataLoaders`` built from the flagged frame.
    """
    # assign() returns a new frame, so the global train_df never gains 'is_valid'.
    fold_df = train_df.assign(is_valid=train_df['fold'] == fold)

    # Colour-only augmentations applied per batch.
    colour_augs = setup_aug_tfms([Brightness(), Contrast(), Hue(), Saturation()])

    return ImageDataLoaders.from_df(
        fold_df,
        valid_col='is_valid',    # boolean column marking validation rows
        seed=365,
        fn_col='path',           # column holding the image file path
        label_col='norm_score',  # column holding the regression target
        y_block=RegressionBlock,
        bs=BATCH_SIZE,
        num_workers=8,
        item_tfms=Resize(224),
        batch_tfms=colour_augs,
    )

In [None]:
# #Valid Kfolder size
# the_data = get_data(0)
# assert (len(the_data.train) + len(the_data.valid)) == (len(train_df)//BATCH_SIZE)

In [None]:
# Sanity check: build the DataLoaders for fold 0 and collect the target
# (element 1 of each item) of every validation sample.
A = get_data(0).valid.dataset
valid = [A[i][1] for i in range(len(A))] 
# NOTE(review): this holds by construction (valid is built from range(len(A))),
# so it only confirms that the whole dataset can be indexed.
assert len(valid) == len(A)

# Model

In [None]:
def get_learner(fold_num):
    """Create a mixed-precision Swin-Large learner for one fold.

    Args:
        fold_num: fold held out for validation (passed on to ``get_data``).

    Returns:
        A ``(learner, dataloaders)`` tuple for that fold.
    """
    dls = get_data(fold_num)

    # The output width of the head is taken from the DataLoaders (dls.c).
    backbone = create_model('swin_large_patch4_window7_224', pretrained=True, num_classes=dls.c)

    learner = Learner(dls, backbone, loss_func=BCEWithLogitsLossFlat(), metrics=petfinder_rmse)

    return learner.to_fp16(), dls

In [None]:
# Load the test metadata and preview the first rows.
test_df = pd.read_csv(dataset_path/'test.csv')
test_df.head()

In [None]:
# Give the test frame a placeholder target and the image path column, then
# drop the Id column once it is no longer needed.
test_df = test_df.assign(
    Pawpularity=1,
    path=test_df['Id'].map(lambda image_id: str(dataset_path/'test'/image_id)+'.jpg'),
).drop(columns=['Id'])
# Recompute the normalised training target (same formula as earlier in the notebook).
train_df['norm_score'] = train_df['Pawpularity'].div(100)

In [None]:
# get_learner(fold_num=0).lr_find(end_lr=3e-2)

In [None]:
import gc

In [None]:
# Print the working directory; the CSV outputs below are written with relative paths.
!pwd

In [None]:
# One tensor of test-set predictions per trained fold.
all_preds = []

# Only the first fold is trained here; widen the range to train more folds.
for i in range(1):

    print(f'Fold {i} results')

    learn,train_data = get_learner(fold_num=i)

    # NOTE(review): SaveModelCallback keeps its default monitor while early
    # stopping watches petfinder_rmse — confirm that this is intended.
    learn.fit_one_cycle(5, 2e-5, cbs=[SaveModelCallback(), EarlyStoppingCallback(monitor='petfinder_rmse', comp=np.less, patience=2)])

    learn.recorder.plot_loss()

    # Ground-truth targets of the validation fold, in dataset order
    # (element 1 of each dataset item).
    A = train_data.valid.dataset
    valid = [A[idx][1] for idx in range(len(A))]

    # Out-of-fold predictions with test-time augmentation; beta=0 keeps only
    # the average of the n augmented passes.
    valid_dls = train_data.valid
    val_preds, _ = learn.tta(dl=valid_dls, n=5, beta=0)
    # Replaces the previous tautological assert: verify that predictions and
    # labels actually line up before they are paired in a frame.
    assert len(val_preds) == len(valid), "validation predictions and labels differ in length"
    valid_df = pd.DataFrame({'preds':val_preds.cpu().numpy().flatten(),
                             'actual':np.array([label.item() for label in valid])})
    print(valid_df)
    valid_df.to_csv(f'learner_1_validataion_fold{i}.csv')

    # Reuse this fold's DataLoaders for the test set: test_dl only needs the
    # item/batch transform pipeline, so building a second ImageDataLoaders with
    # an unrelated random 80/20 split on every iteration was wasted work.
    test_dl = train_data.test_dl(test_df)

    preds, _ = learn.tta(dl=test_dl, n=5, beta=0)

    all_preds.append(preds)

    # Drop the learner and collect garbage first so the CUDA cache can
    # actually be released before the next fold.
    del learn

    gc.collect()

    torch.cuda.empty_cache()

In [None]:
# Inspect the raw per-fold test predictions collected by the training loop.
all_preds

In [None]:
# Mean predicted score across all folds and test images, on the original
# Pawpularity scale. The previous `all_preds*100` repeated the Python list 100
# times instead of scaling the values, so the result was never multiplied by 100.
np.mean(np.stack(all_preds)) * 100

In [None]:
sample_df = pd.read_csv(dataset_path/'sample_submission.csv')
# Average the predictions over folds, then flatten to 1-D so the column
# assignment does not depend on pandas coercing an (N, 1) array.
preds = np.mean(np.stack(all_preds), axis=0).reshape(-1)
assert len(preds) == len(sample_df), "number of predictions does not match the submission template"
# Scale back from the normalised range to the Pawpularity scale.
sample_df['Pawpularity'] = preds*100
sample_df.to_csv('submission.csv',index=False)

In [None]:
# Read the submission file back to check what was written.
pd.read_csv('submission.csv').head()

# **Reference**
###  [Petfinder Pawpularity EDA & fastai starter 🐱🐶](https://www.kaggle.com/tanlikesmath/petfinder-pawpularity-eda-fastai-starter)
###  [Petfinder& fastai with DataAugmentation KFold 10](https://www.kaggle.com/bobber/petfinder-fastai-with-dataaugmentation-kfold-10)

#### Feel free to criticize my work or offer suggestions; your comments are a treasure of knowledge for me.
##### P.S. Sorry for my poor grammar.