In [None]:
# Part of Mercari assignments
# Mount Google Drive (Colab only) so the CSV files read later from
# /content/gdrive/MyDrive/... are reachable from this runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
# Fetch the NLTK data packages for this runtime: the stopwords corpus
# (imported above) and the 'punkt' tokenizer data that word_tokenize relies on.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import pandas as pd


def handle_missing_inplace(dataset):
    """
    Flag and fill missing brand names and item descriptions.

    Adds two 0/1 indicator columns ('brand_missing', 'descp_missing')
    marking the rows that were null, then replaces the nulls in
    'brand_name' and 'item_description' with the string 'missing'.

    Parameters
    ----------
    dataset: pd.DataFrame
        Frame holding 'brand_name' and 'item_description' columns.
        It is modified in place.

    Returns
    -------
    dataset: pd.DataFrame
        The same frame object that was passed in.
    """
    columns_and_flags = (
        ('brand_name', 'brand_missing'),
        ('item_description', 'descp_missing'),
    )
    for column, flag in columns_and_flags:
        # Record missingness before the nulls are overwritten.
        dataset[flag] = dataset[column].isna().astype(int)
        # Assign the filled column back rather than calling
        # fillna(inplace=True) on dataset[column]: the in-place form acts
        # on an intermediate object, which warns on recent pandas and does
        # not update the frame when Copy-on-Write is enabled.
        dataset[column] = dataset[column].fillna('missing')
    return dataset


def build_crossvalidation_data(dataset, split=5):
    """
    Split a dataframe into shuffled K-fold train/validation pairs.

    Parameters
    ----------
    dataset: pd.DataFrame
        The frame to partition row-wise.
    split: int
        Number of folds. Default to 5.

    Returns
    -------
    train_datasets: List[pd.DataFrame]
        Training rows of each fold.
    test_datasets: List[pd.DataFrame]
        Held-out rows of each fold, in the same fold order.
    """
    from sklearn.model_selection import KFold

    # Fixed seed so the shuffled folds are the same on every run.
    folds = KFold(n_splits=split, shuffle=True, random_state=402)

    train_datasets, test_datasets = [], []
    for fit_rows, held_out_rows in folds.split(dataset):
        train_datasets.append(dataset.iloc[fit_rows])
        test_datasets.append(dataset.iloc[held_out_rows])
    return train_datasets, test_datasets


def lower_case_df(dataset):
    """
    Lower-case the text columns of the dataframe.

    The 'name', 'brand_name' and 'item_description' columns are
    overwritten with their lower-cased values; the frame passed in
    is modified and returned.
    """
    for text_column in ('name', 'brand_name', 'item_description'):
        dataset[text_column] = dataset[text_column].str.lower()
    return dataset


def cutting(dataset, pop_brand):
    """
    Collapse brand names outside `pop_brand` into "missing".

    Parameters
    ----------
    dataset: pd.DataFrame
        Frame with a 'brand_name' column; the column is overwritten.
    pop_brand: container of str
        Brands to keep; only membership (`in`) is used.

    Returns
    -------
    dataset: pd.DataFrame
        The frame that was passed in.
    """
    def keep_if_popular(brand):
        if brand in pop_brand:
            return brand
        return "missing"

    dataset['brand_name'] = dataset['brand_name'].apply(keep_if_popular)
    return dataset


def check_dataframe_features(df, required_features, df_type):
    """
    Verify that every required feature is present in the dataframe.

    Parameters
    ----------
    df: pd.DataFrame
        The frame to validate (any container supporting `in` works).
    required_features: List[str]
        Column names that must be present.
    df_type: str
        Label for the frame, used only in the error message.

    Raises
    ------
    KeyError
        For the first required feature that is not present.
    """
    for feature in required_features:
        # An explicit check is used instead of `assert`: a failed assert
        # raises AssertionError (never KeyError), so the descriptive error
        # below was unreachable, and asserts vanish under `python -O`.
        if feature not in df:
            raise KeyError(f"{feature} not present in {df_type} dataframe")


def convert_dataframe_categorical(dataset, field, value_set):
    """
    One-hot encode `field` over a fixed, ordered set of categories.

    The column is first cast (on the frame passed in) to an ordered
    categorical with categories `value_set`, so the resulting dummy
    columns '<field>_<value>' exist for every value in `value_set`
    whether or not it occurs in the data.

    Parameters
    ----------
    dataset: pd.DataFrame
        Frame containing `field`.
    field: str
        Name of the column to encode.
    value_set: List
        All category values, in the desired column order.

    Returns
    -------
    pd.DataFrame
        A new frame in which `field` is replaced by its dummy columns.
    """
    from pandas.api.types import CategoricalDtype

    ordered_categories = CategoricalDtype(categories=value_set, ordered=True)
    dataset[field] = dataset[field].astype(ordered_categories)
    return pd.get_dummies(dataset, columns=[field])


def one_hot_encoding(x, allowable_set, encode_unknown=False):
    """
    One-hot encode a value against a set of allowed values.

    Parameters
    ----------
    x: object
        The value to encode.
    allowable_set: sequence
        Allowed values, in output order. It is not modified.
    encode_unknown: bool
        If True, an extra trailing slot (for None) is used for values
        outside `allowable_set`; the slot is only added when the last
        allowed value is not already None. Default to False.

    Returns
    -------
    List[bool]
        One entry per allowed value (plus the unknown slot when
        enabled); True where the entry equals `x`.
    """
    # Work on a copy: appending None to the caller's list made it grow
    # as a side effect of encoding.
    categories = list(allowable_set)

    if encode_unknown:
        # `not categories` guards the [-1] lookup for an empty set.
        if not categories or categories[-1] is not None:
            categories.append(None)
        if x not in categories:
            x = None

    return [x == category for category in categories]

def clean_sentence(text):
    """
    Normalise a piece of free text for tokenisation.

    Steps, in order: strip URL-like spans, lower-case, drop @mentions,
    drop every character that is not an ASCII letter, digit or space,
    and trim surrounding whitespace.

    Parameters
    ----------
    text: str
        The raw text.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # Imported locally so the function does not depend on a later
    # notebook cell having imported `re` first.
    import re

    # Removes "scheme://..." spans. Note the lazy `http.+?` alternative
    # only consumes "http" plus one following character; kept unchanged.
    text = re.sub(r'(\w+:\/\/\S+)|http.+?', "", text)
    # The mention pattern previously read `@\[A-Za-z0-9]+`; the escaped
    # bracket made it match a literal "[", so "@user" lost only the "@".
    text = re.sub(r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])', '', text.lower())
    # The previous pass keeps tabs; this one removes them.
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)

    return text.strip()


In [None]:
import numpy as np
import pickle
import re

DEFAULT_CONDITION_IDS = [1, 2, 3, 4, 5]
DEFAULT_SHIPPING_IDS = [0, 1]


class Naive_Featuriser(object):
    """
    The Featuriser class which preprocess inputs, which can
    then be appropriately fed into the Model class.

    This class consists of pre-processes which are common for
    both train and test data.

    Here is the list of preprocesses done in this class
    built from dataframe-
        [1] Checks that the dataframe has all required fields.
        [2] Extracts 'pri_category' and 'sec_category' from the
            '/'-separated 'category_name' (levels 1 and 2).
        [3] Flags missing 'brand_name' / 'item_description' values and
            replaces them with 'missing'.
        [4] Lower cases the text fields ['name', 'brand_name',
            'item_description'] and cleans 'name' and
            'item_description' with clean_sentence.
        [5] Builds the set of popular brands, i.e. brands occurring at
            least `min_brand_count` (default 10) times in the train
            data, and converts every other brand name to 'missing'.
        [6] Converts ['item_condition_id', 'shipping'] fields to one hot
            encodings.
    """
    def __init__(self,
                 brand_names,
                 item_condition_set=DEFAULT_CONDITION_IDS,
                 shipping_set=DEFAULT_SHIPPING_IDS):
        """
        Parameters
        ----------
        brand_names: collection of str
            The popular brand names. build_from_dataframe passes a
            dict mapping brand name to its train-set count.
        item_condition_set: List[int]
            List of all possible item_condition_id values
            in request. Default to [1, 2, 3, 4, 5].
        shipping_set: List[int]
            List of all possible shipping values
            in request. Default to [0, 1].
        """
        self.popular_brand_names = brand_names
        self.item_condition_set = item_condition_set
        self.shipping_set = shipping_set

    @staticmethod
    def _category_level(category_name, level):
        """Return one '/'-separated level of a category, or 'missing'
        when the value is not a string or has too few levels."""
        if not isinstance(category_name, str):
            return 'missing'
        levels = category_name.split('/')
        return levels[level] if len(levels) > level else 'missing'

    @classmethod
    def _clean_dataframe(cls, df, required_features, df_type):
        """Validate one dataframe and run the text/missing-value steps
        that do not depend on the popular-brand set."""
        check_dataframe_features(df, required_features, df_type)
        df['pri_category'] = df['category_name'] \
            .apply(lambda x: cls._category_level(x, 1))
        df['sec_category'] = df['category_name'] \
            .apply(lambda x: cls._category_level(x, 2))
        df = handle_missing_inplace(df)
        df = lower_case_df(df)
        df["name"] = df["name"].apply(clean_sentence)
        df["item_description"] = df["item_description"].apply(clean_sentence)
        return df

    @staticmethod
    def _encode_dataframe(df, popular_brand):
        """Collapse unpopular brands and one-hot encode the
        'item_condition_id' and 'shipping' columns."""
        df = cutting(df, popular_brand)
        df = convert_dataframe_categorical(df, 'item_condition_id',
                                           DEFAULT_CONDITION_IDS)
        df = convert_dataframe_categorical(df, 'shipping',
                                           DEFAULT_SHIPPING_IDS)
        return df

    @classmethod
    def build_from_dataframe(cls, df_train, df_test=None,
                             min_brand_count=10):
        """
        Builds the featuriser using the dataframe.

        The input frames get new columns added to them while being
        processed; use the returned frames downstream.

        Parameters
        ----------
        df_train: pd.Dataframe
            The input dataframe used for building the featuriser.
        df_test: pd.Dataframe
            The input dataframe used for processing.
            Default to None
        min_brand_count: int
            Minimum number of occurrences in df_train for a brand to
            be kept as a popular brand. Default to 10.

        Returns
        -------
        cls: Naive_Featuriser
            Class object for featuriser
        df_train: pd.Dataframe
            Processed output of df_train
        df_test: pd.Dataframe
            Processed output of df_test (None if df_test was None)

        Raises
        ------
        Whatever check_dataframe_features raises when a required
        field is absent from either frame.
        """
        required_features = [
            "name", "item_condition_id", "category_name", "brand_name",
            "shipping", "seller_id", "item_description"
        ]

        df_train = cls._clean_dataframe(df_train, required_features,
                                        "df_train")
        # Popular brands come from the train data only, so the test
        # frame is encoded with exactly the same brand vocabulary.
        popular_brand = df_train['brand_name'] \
            .value_counts() \
            .loc[lambda x: x.index != 'missing'] \
            .loc[lambda x: x >= min_brand_count].to_dict()
        df_train = cls._encode_dataframe(df_train, popular_brand)

        if df_test is not None:
            # The shared helper guarantees the *test* text is cleaned;
            # the duplicated code used to clean df_train a second time
            # here and left df_test's name/description uncleaned.
            df_test = cls._clean_dataframe(df_test, required_features,
                                           "df_test")
            df_test = cls._encode_dataframe(df_test, popular_brand)

        return cls(popular_brand), df_train, df_test


In [None]:
import numpy as np
import pickle
from scipy.sparse import csr_matrix, hstack

from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import mean_squared_error

from lightgbm import LGBMRegressor

import multiprocessing
cores = multiprocessing.cpu_count()

class PricePredictionModel(object):
    """
    Price regressor over Doc2Vec text embeddings and categorical
    features.

    fit() trains one Doc2Vec model on 'name' and one on
    'item_description', encodes the category / brand columns, stacks
    everything with the one-hot and missing-flag columns, and fits
    `lgbm_model_1` on log1p(price). `lgbm_model_2` is constructed but
    is not fitted or used by fit().
    """
    def __init__(self, pri_category_f, sec_category_f, brand_name_f_1,
                 lgbm_model_1, lgbm_model_2):
        """
        Parameters
        ----------
        pri_category_f: CountVectorizer or dict
            Featuriser for 'pri_category', or keyword arguments used
            to build a CountVectorizer.
        sec_category_f: CountVectorizer or dict
            Same as above, for 'sec_category'.
        brand_name_f_1: LabelBinarizer or dict
            Featuriser for 'brand_name', or keyword arguments used to
            build a LabelBinarizer.
        lgbm_model_1: estimator or dict
            The regressor fitted by fit(). A dict is treated as
            keyword arguments for Ridge (despite the parameter name).
        lgbm_model_2: estimator or dict
            A second regressor; a dict is treated as keyword arguments
            for LGBMRegressor.
        """
        if isinstance(pri_category_f, CountVectorizer):
            self.pri_category_feat = pri_category_f
        else:
            self.pri_category_feat = CountVectorizer(**pri_category_f)

        if isinstance(sec_category_f, CountVectorizer):
            self.sec_category_feat = sec_category_f
        else:
            self.sec_category_feat = CountVectorizer(**sec_category_f)

        if isinstance(brand_name_f_1, LabelBinarizer):
            self.brand_feat_1 = brand_name_f_1
        else:
            self.brand_feat_1 = LabelBinarizer(**brand_name_f_1)

        # Kept for compatibility; not used by fit().
        self.middle_tfidf = TfidfTransformer()

        if isinstance(lgbm_model_1, dict):
            self.lgbm_model_1 = Ridge(**lgbm_model_1)
        else:
            self.lgbm_model_1 = lgbm_model_1

        if isinstance(lgbm_model_2, dict):
            self.lgbm_model_2 = LGBMRegressor(**lgbm_model_2)
        else:
            self.lgbm_model_2 = lgbm_model_2

    @staticmethod
    def _tokenise(text):
        """Clean a text and split it into word tokens."""
        return word_tokenize(clean_sentence(text))

    @classmethod
    def _train_doc2vec(cls, texts, vector_size):
        """Train a DBOW Doc2Vec model on an iterable of raw texts."""
        corpus = [
            TaggedDocument(cls._tokenise(text), [i])
            for i, text in enumerate(texts)
        ]
        return Doc2Vec(
            documents=corpus,
            vector_size=vector_size, window=5,
            min_count=1, dm=0,
            # negative=0 turns negative sampling off, so hierarchical
            # softmax is enabled explicitly; with both off (hs defaults
            # to 0) there is no training objective left.
            hs=1, negative=0,
            workers=cores,
            # gensim's keyword is `epochs`; the former `epoch=30` is not
            # a Doc2Vec parameter and never set the epoch count.
            epochs=30)

    @classmethod
    def _embed(cls, doc2vec, texts):
        """Infer one Doc2Vec vector per raw text; returns a 2-D array."""
        return np.array(
            [doc2vec.infer_vector(cls._tokenise(text)) for text in texts])

    @staticmethod
    def _rmse(y_true, y_pred):
        """Root mean squared error, without relying on the
        `squared=False` flag of mean_squared_error."""
        return float(np.sqrt(mean_squared_error(y_true, y_pred)))

    def _build_features(self, df, fit):
        """
        Build the stacked CSR feature matrix for one dataframe.

        With fit=True the Doc2Vec models and the category / brand
        featurisers are (re)trained on `df`; with fit=False the
        already-trained ones are only applied.
        """
        def encode(featuriser, column):
            if fit:
                return featuriser.fit_transform(column)
            return featuriser.transform(column)

        if fit:
            self.name_feat = self._train_doc2vec(df['name'], 100)
        X_name = self._embed(self.name_feat, df['name'])
        if fit:
            print("Name featuriser shape", X_name.shape)

        X_item_condition = csr_matrix(
            df.loc[:, 'item_condition_id_1':'item_condition_id_5'].values)

        X_category_1 = encode(self.pri_category_feat, df['pri_category'])
        X_category_2 = encode(self.sec_category_feat, df['sec_category'])
        X_brand_1 = encode(self.brand_feat_1, df['brand_name'])

        X_shipping = csr_matrix(
            df.loc[:, 'shipping_0':'shipping_1'].values)

        if fit:
            self.item_descp_feat = self._train_doc2vec(
                df['item_description'], 200)
        X_item_descp = self._embed(
            self.item_descp_feat, df['item_description'])
        if fit:
            print("Description featuriser shape", X_item_descp.shape)

        X_missing = csr_matrix(
            df.loc[:, ['brand_missing', 'descp_missing']].values)

        # Column order must be identical for train and validation.
        return hstack(
            (X_name, X_item_condition,
             X_category_1, X_category_2,
             X_brand_1,
             X_shipping,
             X_item_descp,
             X_missing
             )).tocsr()

    def fit(self, df_train, df_valid=None):
        """
        Train the featurisers and `lgbm_model_1`, printing RMSE on
        log1p(price).

        Parameters
        ----------
        df_train: pd.DataFrame
            Pre-processed training frame with a 'price' column.
        df_valid: pd.DataFrame
            Optional pre-processed validation frame with a 'price'
            column; when given, its error is printed too.
            Default to None
        """
        Y_train = np.log1p(df_train["price"])
        X_train = self._build_features(df_train, fit=True)
        print("train input shape", X_train.shape)

        self.lgbm_model_1 = self.lgbm_model_1.fit(X_train, Y_train)
        pred_train = self.lgbm_model_1.predict(X_train)

        train_error = self._rmse(Y_train, pred_train)
        print(f"Train Error - {train_error:5.3f}")

        if df_valid is not None:
            Y_val = np.log1p(df_valid["price"])
            X_val = self._build_features(df_valid, fit=False)

            pred_val = self.lgbm_model_1.predict(X_val)

            val_error = self._rmse(Y_val, pred_val)
            print(f"Validation Error - {val_error:5.3f} \n")

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
# Load the raw train / test CSVs from the mounted Drive folder.
path = "/content/gdrive/MyDrive/Mercari/Data/"
df_train = pd.read_csv(f"{path}mercari_train.csv")
df_test = pd.read_csv(f"{path}mercari_test.csv")

# Pre-process both frames with a featuriser built on the train data.
featuriser, df_train, df_test = Naive_Featuriser.build_from_dataframe(
    df_train, df_test)

# Vectoriser settings for the 'name' text. Not passed to the model
# constructed in this cell.
name_args = dict(
    min_df=2,
    stop_words=frozenset(['the', 'a', 'an', 'is', 'it', 'this', ]),
    ngram_range=(1, 2),
    lowercase=False,
)

# Keyword arguments for the two category featurisers (library defaults).
category_args_1 = dict()
category_args_2 = dict()

# Keyword arguments for the brand-name featuriser.
brand_args_1 = dict(sparse_output=True)

# Vectoriser settings for the 'item_description' text. Not passed to
# the model constructed in this cell.
item_args = dict(
    ngram_range=(1, 3),
    min_df=2,
    stop_words=frozenset(['the', 'a', 'an', 'is', 'it', 'this', ]),
    lowercase=False,
)

# Settings for the first regressor.
lgbm_args_1 = dict(
    solver="auto",
    fit_intercept=True,
    random_state=205,
)

# Settings for the second regressor.
lgbm_args_2 = dict(
    learning_rate=0.85,
    max_depth=3,
    num_leaves=110,
    verbosity=-1,
    metric='RMSE',
)

# 3-fold cross-validation: a fresh model is built and fitted per fold.
train_datasets, val_datasets = build_crossvalidation_data(df_train, split=3)

for i, dataset in enumerate(zip(train_datasets, val_datasets)):
    train, val = dataset
    print(f"Fold {i} \n")
    model = PricePredictionModel(
        pri_category_f=category_args_1,
        sec_category_f=category_args_2,
        brand_name_f_1=brand_args_1,
        lgbm_model_1=lgbm_args_1,
        lgbm_model_2=lgbm_args_2,
    )
    model.fit(train, val)


Fold 0 

Name featuriser shape (45600, 100)
Description featuriser shape (45600, 200)
train input shape (45600, 623)
Train Error - 0.573
Validation Error - 0.574 

Fold 1 

Name featuriser shape (45600, 100)
Description featuriser shape (45600, 200)
train input shape (45600, 623)
Train Error - 0.574
Validation Error - 0.573 

Fold 2 

Name featuriser shape (45600, 100)
Description featuriser shape (45600, 200)
train input shape (45600, 623)
Train Error - 0.570
Validation Error - 0.580 

