In [None]:
import numpy as np
import pickle
from price_prediction import (check_dataframe_features, convert_dataframe_categorical,
                    handle_missing_inplace, one_hot_encoding, cutting,
                    lower_case_df, clean_sentence, build_crossvalidation_data)

# Values accepted for the two categorical request fields; they double as the
# default one-hot vocabularies of `NaiveFeaturiser`.
DEFAULT_CONDITION_IDS = [1, 2, 3, 4, 5]
DEFAULT_SHIPPING_IDS = [0, 1]


class NaiveFeaturiser(object):
    """
    Featuriser that pre-processes raw inputs so they can be fed into
    the Model class.

    The pre-processing is shared between train and test data. When built
    from a dataframe the following steps are run:
        [1] Check that the dataframe carries all required fields.
        [2] Replace missing values by the 'missing' placeholder
            (delegated to `handle_missing_inplace`).
        [3] Lower-case the text fields and normalise 'name' and
            'item_description' with `clean_sentence`.
        [4] Collect the popular brands, i.e. the brands other than
            'missing' that occur at least 5 times in the training data;
            `cutting` then receives that collection (presumably to map
            every other brand to 'missing' -- confirm in
            `price_prediction`).
        [5] Convert the ['item_condition_id', 'shipping'] fields to one hot
            encodings.
    """

    # Columns every dataframe handed to `build_from_dataframe` must contain.
    _REQUIRED_FEATURES = (
        "name", "item_condition_id", "category_name", "brand_name",
        "shipping", "seller_id", "item_description"
    )

    # Request text fields whose empty value is replaced by 'missing'.
    _FILLABLE_TEXT_FIELDS = ('brand_name', 'item_description')

    def __init__(self,
                 brand_names,
                 item_condition_set=DEFAULT_CONDITION_IDS,
                 shipping_set=DEFAULT_SHIPPING_IDS):
        """
        Parameters
        ----------
        brand_names: Collection[str]
            All popular brand names. Any container supporting ``in``
            works; `build_from_dataframe` passes a dict keyed by brand.
        item_condition_set: List[int]
            List of all possible item_condition_id values
            in request. Default to [1, 2, 3, 4, 5].
        shipping_set: List[int]
            List of all possible shipping values
            in request. Default to [0, 1].
        """
        self.popular_brand_names = brand_names
        # Copy so that mutating an instance never alters the module-level
        # defaults (or the caller's list) shared with other instances.
        self.item_condition_set = list(item_condition_set)
        self.shipping_set = list(shipping_set)

    def __call__(self, data: dict) -> dict:
        """
        Processes a single request dictionary.

        The dictionary is modified in place and also returned.

        Parameters
        ----------
        data: dict
            A dictionary object with individual features. The keys
            'name', 'brand_name', 'item_description',
            'item_condition_id' and 'shipping' are read.

        Returns
        -------
        data: dict
            The same dictionary with cleaned text fields, unpopular brands
            replaced by 'missing' and the two categorical fields replaced
            by integer numpy arrays built from `one_hot_encoding`.
        """
        data = self._clean_request(data)
        if data['brand_name'] not in self.popular_brand_names:
            data['brand_name'] = 'missing'

        data['item_condition_id'] = np.array(
            one_hot_encoding(data['item_condition_id'],
                             self.item_condition_set),
            dtype=int)

        data['shipping'] = np.array(
            one_hot_encoding(data['shipping'], self.shipping_set),
            dtype=int)

        return data

    @classmethod
    def build_from_dataframe(cls, df_train, df_test=None):
        """
        Builds the featuriser using the dataframe.

        Parameters
        ----------
        df_train: pd.Dataframe
            The input dataframe used for building the featuriser.
        df_test: pd.Dataframe
            An optional second dataframe that is processed with the
            brands learnt from `df_train`. Default to None

        Returns
        -------
        cls: Naive_Featuriser
            Class object for featuriser
        df_train: pd.Dataframe
            Processed output of df_train
        df_test: pd.Dataframe
            Processed output of df_test (None when no df_test was given)
        """
        df_train = cls._clean_dataframe(df_train, "df_train")

        # Brands are learnt from the training data only, so the test data
        # cannot leak into the vocabulary.
        popular_brand = df_train['brand_name'] \
            .value_counts() \
            .loc[lambda x: x.index != 'missing'] \
            .loc[lambda x: x >= 5].to_dict()
        df_train = cls._encode_dataframe(df_train, popular_brand)

        if df_test is not None:
            df_test = cls._clean_dataframe(df_test, "df_test")
            df_test = cls._encode_dataframe(df_test, popular_brand)

        return cls(popular_brand), df_train, df_test

    @classmethod
    def _clean_dataframe(cls, df, df_name):
        """Validate the columns of `df` and normalise its text fields."""
        check_dataframe_features(df, list(cls._REQUIRED_FEATURES), df_name)
        df = handle_missing_inplace(df)
        df = lower_case_df(df)
        df["name"] = df["name"].apply(clean_sentence)
        df["item_description"] = \
            df["item_description"].apply(clean_sentence)
        return df

    @staticmethod
    def _encode_dataframe(df, popular_brand):
        """Apply the brand cut and the categorical conversions to `df`."""
        df = cutting(df, popular_brand)
        df = convert_dataframe_categorical(df, 'item_condition_id',
                                           DEFAULT_CONDITION_IDS)
        df = convert_dataframe_categorical(df, 'shipping',
                                           DEFAULT_SHIPPING_IDS)
        return df

    def save(self, filename):
        """
        Saves the Featuriser in the required path in pickle format.

        Parameters
        ----------
        filename: str
            location to save the featuriser.

        """
        state = {
            "popular_brands": self.popular_brand_names,
            "item_conditions": self.item_condition_set,
            "shipping_ids": self.shipping_set
        }
        with open(filename, 'wb') as f:
            pickle.dump(state, f)

    @classmethod
    def load(cls, filename):
        """
        Loads the Featuriser from the required path in pickle format.

        Parameters
        ----------
        filename: str
            location of a file written by `save`.

        Returns
        -------
        cls: Naive_Featuriser
            The class object restored from the stored state
        """
        # SECURITY: unpickling can execute arbitrary code; only load files
        # that come from a trusted source.
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        return cls(data["popular_brands"], data["item_conditions"],
                   data["shipping_ids"])

    @classmethod
    def _fill_missing_text(cls, data):
        """
        Replaces empty ('' or None) 'brand_name' / 'item_description'
        values by 'missing', each field independently of the other.

        Parameters
        ----------
        data: dict
            request body, modified in place

        Returns
        -------
        data: dict
            the same request body
        """
        for field in cls._FILLABLE_TEXT_FIELDS:
            value = data[field]
            if value is None or value == "":
                data[field] = 'missing'
        return data

    def _clean_request(self, data):
        """
        It performs a basic cleaning for all text fields

        Parameters
        ----------
        data: dict
            request body

        Returns
        -------
        data: dict
            processed request body (the same object, modified in place)
        """
        # An empty description must not touch the brand: each field gets
        # its own placeholder.
        data = self._fill_missing_text(data)

        data['name'] = clean_sentence(data['name'].lower())
        data['brand_name'] = data['brand_name'].lower()
        data['item_description'] = clean_sentence(
            data['item_description'].lower())
        return data


In [None]:
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix, hstack

from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import mean_squared_error

from lightgbm import LGBMRegressor


class PricePredictionModel(object):
    """
    The Model class for building the Price Prediction task,
    where we build the model from the training dataframe provided
    to us. This model performs the following model operations

        [1] Building a bag of words model for the ["name", "category_name"]
            features using the CountVectorizer.
        [2] One hot encoding for all the major ["brand_name"]. The input is
            expected to be preprocessed so that it holds either a popular
            brand or "missing".
        [3] Tfidf transformation for the ["item_description"] feature.
        [4] We build two models using the above features
            * Ridge Model
            * LGBM Model
            And the scores from both the models are aggregated as a
            weighted sum (see `LGBM_WEIGHT` / `RIDGE_WEIGHT`).

    Targets are fitted on ``log1p(price)`` and predictions are mapped back
    with ``expm1``.

    This class has been built similar to the Scikit Learn API.
    """

    # Blend weights of the two regressors; used by every prediction path.
    LGBM_WEIGHT = 0.53
    RIDGE_WEIGHT = 0.47

    def __init__(self,
                 name_f,
                 category_f,
                 brand_name_f,
                 item_descp_f,
                 ridge_model,
                 lgbm_model,
                 fit_performed=False):
        """
        Every component may be given either as a dict of constructor
        keyword arguments (a fresh object is created) or as an already
        built object (used as is, e.g. when restoring a saved model).

        Parameters
        ----------
        name_f: Union[dict, CountVectorizer]
            arguments for processing the ["name"] features.
        category_f: Union[dict, CountVectorizer]
            arguments for processing the ["category_name"] features.
        brand_name_f: Union[dict, LabelBinarizer]
            arguments for processing the ["brand_name"] features.
        item_descp_f: Union[dict, TfidfVectorizer]
            arguments for processing the ["item_description"] features.
        ridge_model: Union[dict, Ridge]
            the arguments for the Ridge model.
        lgbm_model: Union[dict, LGBMRegressor]
            the arguments for the LGBM model.
        fit_performed: bool
            Whether the given components are already fitted.
            Default to False.
        """
        self.feat_name = \
            CountVectorizer(**name_f) if isinstance(name_f, dict) \
            else name_f
        self.feat_category = \
            CountVectorizer(**category_f) if isinstance(category_f, dict) \
            else category_f
        self.feat_brand = \
            LabelBinarizer(**brand_name_f) \
            if isinstance(brand_name_f, dict) else brand_name_f
        self.feat_item_descp = \
            TfidfVectorizer(**item_descp_f) \
            if isinstance(item_descp_f, dict) else item_descp_f
        self.ridge_model = \
            Ridge(**ridge_model) if isinstance(ridge_model, dict) \
            else ridge_model
        self.lgbm_model = \
            LGBMRegressor(**lgbm_model) if isinstance(lgbm_model, dict) \
            else lgbm_model

        self.fit_performed = fit_performed

    def _check_fitted(self, method_name):
        """Raise AssertionError when `method_name` is used before `fit`."""
        if self.fit_performed is False:
            raise AssertionError("Build the model with `fit` method and "
                                 f"then use `{method_name}` method.")

    def _blend(self, X):
        """Return the weighted sum of both models' log-price predictions."""
        pred = self.LGBM_WEIGHT * self.lgbm_model.predict(X)
        pred += self.RIDGE_WEIGHT * self.ridge_model.predict(X)
        return pred

    def _design_matrix(self, df, fit=False):
        """
        Assemble the sparse feature matrix of a preprocessed dataframe.

        With ``fit=True`` the text/brand transformers are fitted on `df`
        first. The column order must stay identical to the one used in
        `predict`.
        """
        method = "fit_transform" if fit else "transform"
        X_name = getattr(self.feat_name, method)(df['name'])
        X_category = getattr(self.feat_category, method)(
            df['category_name'])
        X_brand = getattr(self.feat_brand, method)(df['brand_name'])
        X_item = getattr(self.feat_item_descp, method)(
            df['item_description'])
        # The one-hot columns are selected by label range, so they have to
        # be adjacent and ordered in the dataframe.
        X_item_condition = csr_matrix(
            df.loc[:, 'item_condition_id_1':'item_condition_id_5'].values)
        X_shipping = csr_matrix(
            df.loc[:, 'shipping_0':'shipping_1'].values)
        return hstack((X_item_condition, X_shipping, X_item, X_brand,
                       X_category, X_name)).tocsr()

    @staticmethod
    def _rmse(y_true, y_pred):
        """Root mean squared error.

        Computed with ``np.sqrt`` because the ``squared=False`` argument of
        `mean_squared_error` is deprecated in recent scikit-learn releases.
        """
        return float(np.sqrt(mean_squared_error(y_true, y_pred)))

    def predict(self, data: dict):
        """
        Predicts the output to a single entry of request

        Parameters
        ----------
        data: dict
            The input data to be predicted on. 'item_condition_id' and
            'shipping' must already be one-hot vectors (as produced by
            `NaiveFeaturiser.__call__`), the text fields plain strings.

        Returns
        -------
        pred: float
            The predicted price.

        Raises
        ------
        AssertionError
            If the model has not been fitted.
        """
        self._check_fitted("predict")

        x_name = self.feat_name.transform([data["name"]])
        x_cat = self.feat_category.transform([data["category_name"]])
        x_brand = self.feat_brand.transform([data["brand_name"]])
        x_item = self.feat_item_descp.transform([data["item_description"]])
        # Make the one-hot vectors explicit single-row matrices so that
        # they stack with the (1, n) transformer outputs.
        x_ict = csr_matrix(
            np.asarray(data["item_condition_id"]).reshape(1, -1))
        x_ship = csr_matrix(np.asarray(data["shipping"]).reshape(1, -1))
        x = hstack((x_ict, x_ship, x_item, x_brand, x_cat, x_name)).tocsr()
        # Same LGBM/Ridge blend as in `fit` and `predict_df`.
        pred = np.expm1(self._blend(x))
        return pred.item()

    def fit(self, df_train: pd.DataFrame, df_valid: pd.DataFrame = None):
        """
        Used to build the models.

        Fits the feature transformers and both regressors on `df_train`
        and prints the RMSE (in log1p-price space) of the blended
        prediction on the training data and, when given, on `df_valid`.

        Parameters
        ----------
        df_train: pd.DataFrame
            The training data to build the model. Needs a "price" column
            next to the feature columns.
        df_valid: pd.DataFrame
            The validation data to infer the model performance. Default on None
        """
        Y_train = np.log1p(df_train["price"])
        X_train = self._design_matrix(df_train, fit=True)

        self.lgbm_model = self.lgbm_model.fit(X_train, Y_train)
        self.ridge_model.fit(X_train, Y_train)

        train_error = self._rmse(Y_train, self._blend(X_train))
        print(f"Train Error - {train_error:5.3f}")
        if df_valid is not None:
            Y_val = np.log1p(df_valid["price"])
            X_val = self._design_matrix(df_valid)
            val_error = self._rmse(Y_val, self._blend(X_val))
            print(f"Validation Error - {val_error:5.3f} \n")

        self.fit_performed = True

    def predict_df(self, dataset: pd.DataFrame):
        """
        Predicts the output for dataframe

        Parameters
        ----------
        dataset: pd.DataFrame
            The input data to be predicted on.

        Returns
        -------
        pred: np.ndarray
            The predicted price, one entry per row.

        Raises
        ------
        AssertionError
            If the model has not been fitted.
        """
        self._check_fitted("predict_df")
        X = self._design_matrix(dataset)
        return np.expm1(self._blend(X))

    def save(self, filename):
        """
        Saves the model weights in the required path in pickle format.

        Parameters
        ----------
        filename: str
            location to save the model.

        Raises
        ------
        AssertionError
            If the model has not been fitted.
        """
        self._check_fitted("save")
        state = {
            "name_featuriser": self.feat_name,
            "category_featuriser": self.feat_category,
            "brand_featuriser": self.feat_brand,
            "item_desc_featuriser": self.feat_item_descp,
            "ridge_model": self.ridge_model,
            "lgbm_model": self.lgbm_model
        }
        with open(filename, 'wb') as f:
            pickle.dump(state, f)

    @classmethod
    def load(cls, filename):
        """
        Loads the model weights from the required path in pickle format.

        Parameters
        ----------
        filename: str
            location of a file written by `save`.

        Returns
        -------
        cls: PricePredictionModel
            The class object with loaded weights
        """
        # SECURITY: unpickling can execute arbitrary code; only load files
        # that come from a trusted source.
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        return cls(data["name_featuriser"],
                   data["category_featuriser"],
                   data["brand_featuriser"],
                   data["item_desc_featuriser"],
                   data["ridge_model"],
                   data["lgbm_model"],
                   fit_performed=True)


In [None]:
from os.path import join
import os

# Load the raw Mercari data from the sibling `data` directory.
data_path = join(os.pardir, "data")
df_train = pd.read_csv(join(data_path, "mercari_train.csv"))
df_test = pd.read_csv(join(data_path, "mercari_test.csv"))

# Build the featuriser from the training data and preprocess both frames.
featuriser, df_train, df_test = NaiveFeaturiser \
    .build_from_dataframe(df_train, df_test)
# Saving the Featuriser weights
# featuriser.save(join(save_path, "featuriser.pkl"))

train_datasets, val_datasets = \
    build_crossvalidation_data(df_train, split=3)

# Keyword arguments for the feature transformers and the two regressors;
# `PricePredictionModel` instantiates the objects from these dicts.
name_args = {
    'max_features': 1000,
    'min_df': 10,
    'stop_words': 'english',
}
category_args = {}
brand_args = {'sparse_output': True}
item_args = {
    'max_features': 500,
    'ngram_range': (1, 3),
    'stop_words': 'english'
}
ridge_model_args = {
    'solver': "auto",
    'fit_intercept': True,
    'random_state': 205
}

lgbm_args = {
    'learning_rate': 0.75,
    'max_depth': 2,
    'num_leaves': 50,
    'verbosity': -1,
    'metric': 'RMSE'
}

# One freshly initialised model per cross-validation fold.
for i, (train, val) in enumerate(zip(train_datasets, val_datasets)):
    print(f"Fold {i} \n")
    model = PricePredictionModel(
        name_args, category_args,
        brand_args, item_args,
        ridge_model_args,
        lgbm_args
    )
    # `fit` has to run first: `predict_df` raises AssertionError on an
    # unfitted model, which previously aborted the loop in the first fold.
    model.fit(train, val)
    # Exercise the dataframe inference path on the fitted model.
    model.predict_df(train)
    # model.save(join(save_path, f"Mega_model_{i}.pkl"))
