# Importing Modules

In [None]:
# Standard imports
import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import trange
from colorama import Fore
from glob import glob
import json
from pprint import pprint
import time
import cv2
from enum import Enum
from IPython.display import display

# For Data preparation
from sklearn.preprocessing import *
from sklearn.model_selection import *
from sklearn.metrics import *


import warnings
warnings.filterwarnings("ignore")

# Configs

In [None]:
class Config(Enum):
    """
    Registry of the dataset locations used by this notebook.

    Every member's value is a path string; ``str(member)`` returns that
    same string.
    """

    TRAIN_CSV = "../input/petfinder-pawpularity-score/train.csv"
    TEST_CSV = "../input/petfinder-pawpularity-score/test.csv"
    SAMPLE_CSV = "../input/petfinder-pawpularity-score/sample_submission.csv"
    TRAIN_DIR = "../input/petfinder-pawpularity-score/train"
    TEST_DIR = "../input/petfinder-pawpularity-score/test"

    def __str__(self):
        # Render a member as its raw path rather than "Config.NAME".
        return self.value

# Reading Data files

In [None]:
# Load the train, test and sample-submission tables, in that order.
data_df, test_df, sample_df = (
    pd.read_csv(source.value)
    for source in (Config.TRAIN_CSV, Config.TEST_CSV, Config.SAMPLE_CSV)
)

In [None]:
data_df

In [None]:
test_df

In [None]:
sample_df

# Let's look at the label distribution

In [None]:
# Pull out the target column and report its observed extremes.
labels = data_df["Pawpularity"]
print(
    f"min value of Pawpularity is : {min(labels)}",
    f"max value of Pawpularity is : {max(labels)}",
    sep="\n",
)

In [None]:
def giveHistogram(df : "pd.DataFrame", col_name : str, bins = None, dark = False):
    """
    Show a Plotly histogram of one column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the column to plot.
    col_name : str
        Name of the column drawn on the x-axis.
    bins : int, optional
        Value forwarded to Plotly as ``nbins``. When omitted, Sturges' rule
        ``1 + int(log2(len(df)))`` is used.
    dark : bool, default False
        Use the "plotly_dark" template instead of "ggplot2".

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    if bins is None:
        # log2(0) is -inf, which int() cannot convert, so an empty frame
        # falls back to a single bin instead of raising OverflowError.
        bins = 1 + int(np.log2(len(df))) if len(df) > 0 else 1

    fig = px.histogram(
        df,
        x = col_name,
        template = "plotly_dark" if dark else "ggplot2",
        nbins = bins,
    )
    fig.update_layout(
            title_text = f"Distribution of {col_name}",
            title_x = 0.5,
    )
    fig.show()

giveHistogram(data_df, "Pawpularity")

# Mapping the image locations

In [None]:
def _image_path_builder(image_dir):
    """Return a function that maps an Id to '<image_dir>/<Id>.jpg'."""
    def build(image_id):
        return image_dir + f"/{image_id}.jpg"
    return build

# Attach the on-disk image location to every row of the train and test tables.
data_df["path"] = data_df["Id"].apply(_image_path_builder(Config.TRAIN_DIR.value))
test_df["path"] = test_df["Id"].apply(_image_path_builder(Config.TEST_DIR.value))

# Let's look at the distribution of image widths and heights

In [None]:
def widthAndHeightDist(df : "pd.DataFrame", col_name : str, dark = False):
    """
    Plot histograms of the pixel widths and heights of the images listed in ``df``.

    Each path in ``df[col_name]`` is loaded with ``cv2.imread`` and its
    height and width are recorded. Two Plotly histograms are then shown:
    widths first, heights second.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per image.
    col_name : str
        Column of ``df`` that holds the image file paths.
    dark : bool, default False
        Use the "plotly_dark" template instead of "ggplot2".

    Raises
    ------
    OSError
        If ``cv2.imread`` returns ``None`` for a path, i.e. the file could
        not be read.
    """
    # Sturges' rule for the number of histogram bins.
    bins = 1 + int(np.log2(len(df)))
    template = "plotly_dark" if dark else "ggplot2"
    total_images = list(df[col_name].values)

    widths = []
    heights = []
    for idx in trange(len(total_images), desc = "Collecting widths and heights...", bar_format="{l_bar}%s{bar:50}%s{r_bar}" % (Fore.CYAN, Fore.RESET), position = 0, leave = True):
        cur_path = total_images[idx]
        img = cv2.imread(cur_path)
        if img is None:
            # imread signals failure by returning None instead of raising;
            # name the offending file rather than failing later on `.shape`.
            raise OSError(f"Could not read image: {cur_path}")
        # Only the first two axes (rows, columns) are needed.
        h, w = img.shape[:2]
        widths.append(w)
        heights.append(h)

    for values, dimension in ((widths, "Widths"), (heights, "Heights")):
        fig = px.histogram(values, nbins = bins, template = template)
        fig.update_layout(title = f"Distribution of Image {dimension}", title_x = 0.5)
        fig.show()
    
widthAndHeightDist(data_df, "path")

# Let's look at some images

In [None]:
def buildGridImages(df : "pd.DataFrame", img_path_col_name: str, label_col_name: str, nrows = 5, ncols = 4, img_size = 512):
    """
    Show a random sample of images from ``df`` in an ``nrows`` x ``ncols`` grid.

    Up to ``nrows * ncols`` rows are sampled. When ``df`` has fewer rows than
    grid cells, every row is shown and the remaining cells stay empty.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per image.
    img_path_col_name : str
        Column holding the image file paths.
    label_col_name : str
        Column whose value is used as the title of each image.
    nrows, ncols : int
        Grid dimensions.
    img_size : int
        Each image is resized to ``img_size`` x ``img_size`` pixels.

    Raises
    ------
    OSError
        If ``cv2.imread`` returns ``None`` for a sampled path.
    """
    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # so cap the request at the size of the frame.
    n_images = min(nrows * ncols, len(df))

    sampled = df.sample(n_images)
    paths = sampled[img_path_col_name].values
    labels = sampled[label_col_name].values

    plt.figure(figsize=(20,12))
    for i in range(n_images):
        img = cv2.imread(paths[i])
        if img is None:
            # imread reports failure by returning None; surface the path.
            raise OSError(f"Could not read image: {paths[i]}")
        img = cv2.resize(img, (img_size, img_size))
        # OpenCV loads images as BGR, matplotlib expects RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        plt.subplot(nrows, ncols, i+1)
        plt.axis("off")
        plt.title(str(labels[i]))
        plt.imshow(img)

    plt.tight_layout()
    plt.show()

buildGridImages(data_df, "path", "Pawpularity", 6, 6, 256)

# Let's see the key differences between the lowest- and highest-scoring images

- Minimum is 1
- Maximum is 100
- what is **pawpularity** ?
    - *Feature engineering that the Petfinder team would find valuable would be determining if certain factors from pet profile images increase the popularity of the profile - e.g. "When dogs wear color collars, their popularity increases by x%"*
    - Answered in the [discussion](https://www.kaggle.com/c/petfinder-pawpularity-score/discussion/274025)

In [None]:
# Rows at the two extremes of the Pawpularity scale (100 and 1).
pp_100_df, pp_1_df = (
    data_df.loc[data_df["Pawpularity"] == score] for score in (100, 1)
)

print(f"Num of images having 100 score : {pp_100_df.shape[0]}")
print(f"Num of images having 1 score : {pp_1_df.shape[0]}")

#### *Let's look at images with a score of 1*

In [None]:
pp_1_df

In [None]:
buildGridImages(pp_1_df, "path", "Pawpularity", 1, 4, 256)

#### Let's look at some perfect-score images

In [None]:
pp_100_df.head()

In [None]:
buildGridImages(pp_100_df, "path", "Pawpularity", 4, 4, 256)

#### Observations
- Pets in the images are getting blended with their background

#### Let's see some images for which each metadata flag equals 1

In [None]:
# Metadata columns of the training table; for each one, a few images whose
# value is 1 are previewed below.
req_cols = [
    'Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
       'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur'
]

images_per_flag = 5
for col in req_cols:
    flagged_df = data_df.loc[data_df[col] == 1]
    # Cap the grid width at the number of available rows: sampling more rows
    # than exist raises ValueError.
    n_show = min(images_per_flag, len(flagged_df))
    print(f"################### {col} ###################")
    if n_show == 0:
        # No image carries this flag, so there is nothing to draw.
        continue
    # buildGridImages samples nrows * ncols rows itself, so the filtered frame
    # is passed directly instead of being sampled twice.
    buildGridImages(flagged_df, "path", "Pawpularity", 1, n_show, 256)


# Let's create folds of our dataset

In [None]:
def create_folds_regression(data, target="target", num_splits = 5, random_state = None):
    """
    Assign a stratified cross-validation fold id to every row of ``data``.

    The continuous ``target`` is bucketed into equal-width bins (bin count
    from Sturges' rule) and ``StratifiedKFold`` is run on those bins, so each
    fold sees a similar target distribution.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. It is not modified; a shuffled copy is returned.
    target : str, default "target"
        Name of the continuous target column.
    num_splits : int, default 5
        Number of folds.
    random_state : int, optional
        Seed for the row shuffle. ``None`` (the default) keeps the previous
        behaviour of a different shuffle on every call.

    Returns
    -------
    pandas.DataFrame
        Shuffled copy of ``data`` with a fresh 0..n-1 index and a ``kfold``
        column holding the validation-fold id of each row.
    """
    # Work on a shuffled copy so the caller's frame is never mutated.
    shuffled = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Applying Sturges' rule to calculate the no. of bins for target
    num_bins = int(1 + np.log2(len(shuffled)))
    # The bin ids live in a local Series rather than a temporary column.
    strata = pd.cut(shuffled[target], bins=num_bins, labels=False)

    shuffled["kfold"] = -1  # -1 marks "not assigned yet"
    kf = StratifiedKFold(n_splits=num_splits)
    for fold, (_, val_idx) in enumerate(kf.split(X=shuffled, y=strata.values)):
        shuffled.loc[val_idx, "kfold"] = fold

    return shuffled


data_df = create_folds_regression(data_df, target = 'Pawpularity', num_splits = 5)
data_df.kfold.value_counts()

In [None]:
data_df.head()

# Let's automate our training

In [None]:
# Regression Models
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, VotingRegressor
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor, StackingRegressor
from sklearn.neighbors import KNeighborsRegressor

from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Evaluation metrics
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_validate

In [None]:
def rmse_score(y_label, y_preds):
    """
    Root-mean-squared error between the true labels and the predictions.
    """
    mse = mean_squared_error(y_label, y_preds)
    return np.sqrt(mse)
    

def trainRegModels(df : "pd.DataFrame", features : list, label: str):
    """
    Cross-validate a fixed set of regression models and summarise their scores.

    Every model is fitted once per fold found in ``df["kfold"]`` and scored
    on both the training and the validation part with R2 and RMSE. The
    per-fold scores are averaged over the folds.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the feature columns, the label column and a
        ``kfold`` column with the validation-fold id of each row.
    features : list
        Names of the feature columns.
    label : str
        Name of the target column.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns "Model", "Avg R2 Train Score",
        "Avg R2 Val Score", "Avg RSME Train Score" and "Avg RSME Val Score".

    Raises
    ------
    ValueError
        If ``df["kfold"]`` contains no non-negative fold id.
    """
    regModels = {
            "LinearRegression": LinearRegression(),
            "KNeighborsRegressor": KNeighborsRegressor(n_neighbors=2),
            "AdaBoostRegressor": AdaBoostRegressor(random_state=0, n_estimators=100),
            "LGBMRegressor": LGBMRegressor(),
            "Ridge": Ridge(alpha=1.0),
            "ElasticNet": ElasticNet(random_state=0),
            "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0),
            "DecisionTreeRegressor": DecisionTreeRegressor(),
            "ExtraTreesRegressor": ExtraTreesRegressor(n_jobs=-1),
            "RandomForestRegressor": RandomForestRegressor(n_jobs=-1),
            "XGBRegressor": XGBRegressor(n_jobs=-1),
            "CatBoostRegressor": CatBoostRegressor(iterations=900, depth=5, learning_rate=0.05, loss_function = 'RMSE'),
        }

    # Read the fold ids from the data instead of assuming exactly five folds.
    # Negative ids are skipped: -1 is the "unassigned" marker that
    # create_folds_regression starts from, and such rows only ever train.
    fold_ids = sorted(f for f in df["kfold"].unique() if f >= 0)
    if not fold_ids:
        raise ValueError("df['kfold'] contains no fold assignments")
    n_folds = len(fold_ids)

    # Will return this as a data frame.
    # NOTE: the "RSME" spelling is kept because callers select these columns by name.
    summary = {
        "Model" : [],
        "Avg R2 Train Score" : [],
        "Avg R2 Val Score" : [],
        "Avg RSME Train Score" : [],
        "Avg RSME Val Score" : []
    }

    # Training
    model_names = list(regModels)
    for idx in trange(len(model_names), desc = "Models are training...", bar_format="{l_bar}%s{bar:50}%s{r_bar}" % (Fore.CYAN, Fore.RESET), position = 0, leave = True):
        name = model_names[idx]
        model = regModels[name]
        # CatBoost logs every iteration unless told not to.
        fit_kwargs = {"verbose": False} if name == 'CatBoostRegressor' else {}

        # Running totals of the per-fold scores
        r2_train = 0.0; r2_val = 0.0
        rmse_train = 0.0; rmse_val = 0.0

        # Running K-fold Cross-validation on every model
        for fold in fold_ids:
            is_val = df["kfold"] == fold
            train_df = df.loc[~is_val].reset_index(drop = True)
            val_df = df.loc[is_val].reset_index(drop = True)

            train_X = train_df[features]; train_Y = train_df[label]
            val_X = val_df[features]; val_Y = val_df[label]

            model.fit(train_X, train_Y, **fit_kwargs)

            Y_train_preds = model.predict(train_X)
            Y_val_preds = model.predict(val_X)

            # Collecting the scores
            r2_train += r2_score(train_Y, Y_train_preds)
            r2_val += r2_score(val_Y, Y_val_preds)

            rmse_train += rmse_score(train_Y, Y_train_preds)
            rmse_val += rmse_score(val_Y, Y_val_preds)

        # Pushing the averaged scores and the model name
        summary["Model"].append(name)
        summary["Avg R2 Train Score"].append(r2_train / n_folds)
        summary["Avg R2 Val Score"].append(r2_val / n_folds)
        summary["Avg RSME Train Score"].append(rmse_train / n_folds)
        summary["Avg RSME Val Score"].append(rmse_val / n_folds)

    # Finally returning the summary dictionary as a dataframe
    summary_df = pd.DataFrame(summary)
    return summary_df



In [None]:
training_summary = trainRegModels(data_df, req_cols, "Pawpularity")
training_summary

In [None]:
training_summary.sort_values("Avg RSME Val Score", axis = 0, ascending = True)

***Let's see the combined power of top-2 models***

In [None]:
# Average the predictions of ElasticNet and GradientBoostingRegressor.
en = ElasticNet(random_state=0)
gbr = GradientBoostingRegressor(random_state=0)
VR_model = VotingRegressor([('en', en),('gbr', gbr)], n_jobs=-1)

# Read the fold ids from the data instead of assuming exactly five folds.
# Negative ids are skipped: -1 is the "unassigned" marker that
# create_folds_regression starts from.
fold_ids = sorted(f for f in data_df["kfold"].unique() if f >= 0)
n_folds = len(fold_ids)

r2_train = 0; r2_val = 0
rmse_train = 0; rmse_val = 0

model = VR_model
for idx in trange(n_folds, desc = "Models are training...", bar_format="{l_bar}%s{bar:50}%s{r_bar}" % (Fore.CYAN, Fore.RESET), position = 0, leave = True):
    fold = fold_ids[idx]
    train_df = data_df.loc[data_df.kfold != fold].reset_index(drop = True)
    val_df = data_df.loc[data_df.kfold == fold].reset_index(drop = True)

    train_X = train_df[req_cols]; train_Y = train_df["Pawpularity"]
    val_X = val_df[req_cols]; val_Y = val_df["Pawpularity"]
    
    model.fit(train_X, train_Y)

    Y_train_preds = model.predict(train_X)
    Y_val_preds = model.predict(val_X)

    # Collecting the scores
    r2_train += r2_score(train_Y, Y_train_preds)
    r2_val += r2_score(val_Y, Y_val_preds)

    rmse_train += rmse_score(train_Y, Y_train_preds)
    rmse_val += rmse_score(val_Y, Y_val_preds)

print(f"Avg R2 Train Score : {r2_train/n_folds}")
print(f"Avg R2 Val Score : {r2_val/n_folds}")
print(f"Avg RSME Train Score : {rmse_train/n_folds}")
print(f"Avg RSME Val Score : {rmse_val/n_folds}")

# After the loop `model` holds the fit from the last CV iteration only, i.e.
# it has never seen that iteration's validation fold. Refit on every training
# row so the model used for the submission is trained on all available data.
model.fit(data_df[req_cols], data_df["Pawpularity"])

🤔 ***Hmm... not much of an improvement, but let's see the submission results***

# Prediction Time 😎

In [None]:
sample_df

In [None]:
test_X = test_df[req_cols]

# Pawpularity labels range from 1 to 100 (see the min/max printed in the label
# distribution section), so clamp the predictions to that interval: a value
# outside it can only increase the error.
model_preds = np.clip(model.predict(test_X), 1, 100)
test_df["Pawpularity"] = model_preds

# The submission file holds only the Id and the predicted score.
submission = test_df[["Id", "Pawpularity"]]
submission.to_csv("submission.csv", index = False)
# Also persist the fold-annotated training table and the scored test table.
data_df.to_csv("data.csv", index = False)
test_df.to_csv("test.csv", index = False)
submission