In [1]:
# Reload modules
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
#supress warnings
import warnings
warnings.filterwarnings('ignore')

#numpy and pandas for data manipulation
import pandas as pd
import numpy as np
from numpy import median
from scipy.stats import norm
import re
import math

#matplotlib and seaborn for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
sns.set(style='darkgrid')

import plotly
from plotly.offline import iplot
import plotly.graph_objects as go
from wordcloud import WordCloud

#file system management
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn import preprocessing
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GridSearchCV
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor

import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from lightgbm import LGBMRegressor
import joblib

seed = 42

In [3]:
train_data = pd.read_csv('../data/mercari_train.csv.gz', compression='gzip',
                   error_bad_lines=False)
test_data = pd.read_csv('../data/mercari_test.csv.gz', compression='gzip',
                   error_bad_lines=False)
# train_data.to_csv(r'../Solutions/train.csv', index = False)
# test_data.to_csv(r'../Solutions/test.csv', index = False)
df = pd.concat([train_data, test_data], 0)
nrow_train = train_data.shape[0]

In [4]:
df

Unnamed: 0,id,name,item_condition_id,category_name,brand_name,price,shipping,seller_id,item_description
0,17,"Hold Alyssa Frye Harness boots 12R, Sz 7",3,Women/Shoes/Boots,Frye,79.0,1,211140753,Good used condition Women's Fyre harness boots...
1,19,Steve Madden booties,3,Women/Shoes/Boots,Steve Madden,31.0,0,3874531266,"The brand is actually ""Steven"" by Steve Madden..."
2,42,BCBG Tan Booties,1,Women/Shoes/Boots,,48.0,0,8341537216,Brand new! Does not include the box.
3,45,NWT Sorel Caribou boots size 8.5,1,Women/Shoes/Boots,,85.0,0,4040379892,New in box. Size 8.5
4,58,NIB Hunter Tiffany Mint Boots Size 5,1,Women/Shoes/Boots,Hunter,200.0,0,19216599,Brand new never worn only flaw is as you can s...
...,...,...,...,...,...,...,...,...,...
31784,274051,Gymshark seamless leggings,1,"Women/Athletic apparel/Pants, tights, leggings",Gymshark,,1,1055265704,"New with tags, currently sold out. Willing to ..."
31785,275511,Alo Moto white leggings XS,3,"Women/Athletic apparel/Pants, tights, leggings",Alo,,1,182218251,"Alo Yoga Moto leggings in white size XS, worn ..."
31786,276751,Lularoe Pink Raccoon Leggings in TC,1,"Women/Athletic apparel/Pants, tights, leggings",,,1,6060395146,New with tag! Black background with pink racco...
31787,277217,Reversible slate wunder under pants,2,"Women/Athletic apparel/Pants, tights, leggings",lululemon athletica,,0,5150792709,Excellent used condition size 2. Reversible to...


In [5]:
df[nrow_train:]

Unnamed: 0,id,name,item_condition_id,category_name,brand_name,price,shipping,seller_id,item_description
0,46,Corral boots,2,Women/Shoes/Boots,,,0,2261402465,Corral boots in excellent condition barely used
1,88,Vince Camuto Riding boots size 6,2,Women/Shoes/Boots,Vince Camuto,,0,2021876312,super cute brown or cognac knee high riding bo...
2,212,Brand new UGG boots,1,Women/Shoes/Boots,UGG Australia,,0,5803720303,New in box
3,289,"LL Bean Boots 8"" Red sz 7M",3,Women/Shoes/Boots,L.L. Bean,,0,1065939786,Made to withstand winter climate.
4,299,Black UGGS cargo boot,3,Women/Shoes/Boots,UGG Australia,,1,479394728,"Unique, super cute and warm! EUC. Only selling..."
...,...,...,...,...,...,...,...,...,...
31784,274051,Gymshark seamless leggings,1,"Women/Athletic apparel/Pants, tights, leggings",Gymshark,,1,1055265704,"New with tags, currently sold out. Willing to ..."
31785,275511,Alo Moto white leggings XS,3,"Women/Athletic apparel/Pants, tights, leggings",Alo,,1,182218251,"Alo Yoga Moto leggings in white size XS, worn ..."
31786,276751,Lularoe Pink Raccoon Leggings in TC,1,"Women/Athletic apparel/Pants, tights, leggings",,,1,6060395146,New with tag! Black background with pink racco...
31787,277217,Reversible slate wunder under pants,2,"Women/Athletic apparel/Pants, tights, leggings",lululemon athletica,,0,5150792709,Excellent used condition size 2. Reversible to...


In [6]:
brands = df[:nrow_train].groupby('brand_name')['price'].agg(['count', 'mean']).sort_values(by=['count'], ascending=False).reset_index()

luxurious_brands = brands[:20]

In [7]:
brands = df[:nrow_train].groupby('brand_name')['price'].agg(['count', 'mean']).sort_values(by=['count'], ascending=False).reset_index()

cheap_brands = brands[:10]

In [8]:
brands = df[:nrow_train].groupby('brand_name')['price'].agg(['count', 'mean']).sort_values(by=['count'], ascending=False).reset_index()

expensive_brands = brands[:20]

In [9]:
#stopwords without no, not, etc
STOPWORDS = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"]

In [10]:
def remove_emoji(sentence):
    """
    Remove emojis from the string
    """
    pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"
                           u"\U0001F300-\U0001F5FF"
                           u"\U0001F680-\U0001F6FF"
                           u"\U0001F1E0-\U0001F1FF"
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    
    return pattern.sub(r'', sentence)

def process_category(input_data):
    """
    Split the category_name into 3 parts as category_0, category_1 and category_2
    """
    for i in range(3):
        
        def get_categories(ele):
            
            if type(ele) != str:
                return np.nan
        
            cat = ele.split('/')
            
            if i >= len(cat):
                return np.nan
            else:
                return cat[i]

        col_name = 'category_' + str(i)
        
        input_data[col_name] = input_data['category_name'].apply(get_categories)
        
        input_data.fillna({'category_name': 'Other'}, inplace = True)
    
    return input_data

def decontracted(phrase):
    """
    Expand and create common english contractions in the text
    """
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    
    return phrase

def process_text(input_data, cols):
    """
    Take the text columns and process the data. Expand contractions, use regex to remove symbols/numbers, remove emojis, punctuations
    and stopwords and convert text to lowercase
    """
    for col in cols:
        
        processed_data = []
        
        for sent in input_data[col].values:
            
            sent = decontracted(sent)
            sent = sent.replace('\\r', ' ')
            sent = sent.replace('\\"', ' ')
            sent = sent.replace('\\n', ' ')
            sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
            sent = remove_emoji(sent)
            sent = ' '.join(e for e in sent.split() if e not in STOPWORDS)
            processed_data.append(sent.lower().strip())
            
        input_data[col] = processed_data
        
    return input_data

def handle_missing_values(input_data):
    """
    Fills the nan/missing values with 'missing' for text columns
    """
    input_data.fillna({'name': 'missing', 'item_description': 'missing'}, inplace=True)
    
    return input_data


#nlp features
def get_text_features(input_data):
    """
    NLP features derived from the text columns
    """

    input_data['is_luxurious'] = (input_data['brand_name'].isin(luxurious_brands['brand_name'])).astype(np.int8)

    input_data['is_expensive'] = (input_data['brand_name'].isin(expensive_brands['brand_name'])).astype(np.int8)

    input_data['is_cheap'] = (input_data['brand_name'].isin(cheap_brands['brand_name'])).astype(np.int8)

    return input_data

In [11]:
def preprocess(input_data):
    """
    Process the data by handling missing values, process category_name, process text
    """
    input_data['price'] = np.log1p(input_data['price'])

    input_data = handle_missing_values(input_data)
    
    input_data = process_category(input_data)
    
    input_data = process_text(input_data, ['name', 'item_description', 'category_name'])

    return input_data

In [12]:
data = preprocess(df)
data.fillna({'category_0': 'other', 'category_1': 'other', 'category_2': 'other'}, inplace = True)

In [13]:
data

Unnamed: 0,id,name,item_condition_id,category_name,brand_name,price,shipping,seller_id,item_description,category_0,category_1,category_2
0,17,hold alyssa frye harness boots 12r sz 7,3,women shoes boots,Frye,4.382027,1,211140753,good used condition women fyre harness boots v...,Women,Shoes,Boots
1,19,steve madden booties,3,women shoes boots,Steve Madden,3.465736,0,3874531266,the brand actually steven steve madden steve m...,Women,Shoes,Boots
2,42,bcbg tan booties,1,women shoes boots,,3.891820,0,8341537216,brand new does not include box,Women,Shoes,Boots
3,45,nwt sorel caribou boots size 8 5,1,women shoes boots,,4.454347,0,4040379892,new box size 8 5,Women,Shoes,Boots
4,58,nib hunter tiffany mint boots size 5,1,women shoes boots,Hunter,5.303305,0,19216599,brand new never worn flaw see picture color we...,Women,Shoes,Boots
...,...,...,...,...,...,...,...,...,...,...,...,...
31784,274051,gymshark seamless leggings,1,women athletic apparel pants tights leggings,Gymshark,,1,1055265704,new tags currently sold willing negotiate price,Women,Athletic apparel,"Pants, tights, leggings"
31785,275511,alo moto white leggings xs,3,women athletic apparel pants tights leggings,Alo,,1,182218251,alo yoga moto leggings white size xs worn time...,Women,Athletic apparel,"Pants, tights, leggings"
31786,276751,lularoe pink raccoon leggings tc,1,women athletic apparel pants tights leggings,,,1,6060395146,new tag black background pink raccoons super c...,Women,Athletic apparel,"Pants, tights, leggings"
31787,277217,reversible slate wunder pants,2,women athletic apparel pants tights leggings,lululemon athletica,,0,5150792709,excellent used condition size 2 reversible gra...,Women,Athletic apparel,"Pants, tights, leggings"


In [14]:
#NLP features
data = get_text_features(data)

data.fillna({'brand_name': ' '}, inplace = True)

#concatenate text features
data['name'] = data['name'] + ' ' + data['brand_name'] + ' ' + data['category_name']
data['text'] = data['name'] + ' ' + data['item_description']

data = data.drop(columns = ['brand_name', 'item_description', 'category_name'], axis = 1)

In [15]:
"""
Taking necessary features for modeling
"""
data = data[['price', 'name', 'category_0', 'category_1',
       'category_2', 'shipping', 'item_condition_id', 'is_expensive', 'is_luxurious', 'text']]

In [16]:
#one hot encoding of category names
def get_ohe(X_train, col_name):
    """
    Get one hot encoded features
    """
    vect = CountVectorizer()
    tr_ohe = vect.fit_transform(X_train[col_name].values)
    return tr_ohe


#tfidf word embeddings
def get_text_encodings(X_train, col_name, min_val, max_val):
    """
    Get TFIDF encodings with max_features capped at 1M
    """
    vect = TfidfVectorizer(min_df = 10, ngram_range = (min_val, max_val), max_features = 1000000)
    tr_text = vect.fit_transform(X_train[col_name].values)
    return tr_text



def generate_encodings(X_train):
    """
    Get encodings for all the features. Scale and normalize the numerical features. Stack the encoded features horizontally.
    """
    tr_ohe_category_0 = get_ohe(X_train, 'category_0')
    tr_ohe_category_1 = get_ohe(X_train,'category_1')
    tr_ohe_category_2 = get_ohe(X_train,'category_2')
    

    tr_trans = csr_matrix(pd.get_dummies(X_train[['shipping', 'item_condition_id', 'is_expensive', 'is_luxurious']], sparse=True).values)
    
    tr_name = get_text_encodings(X_train, 'name', 1, 1)
    tr_text = get_text_encodings(X_train, 'text', 1, 2)

    train_data = hstack((tr_ohe_category_0, tr_ohe_category_1, tr_ohe_category_2, tr_trans,
                       tr_name, tr_text)).tocsr().astype('float32')

    return train_data

In [17]:
data[:nrow_train]

Unnamed: 0,price,name,category_0,category_1,category_2,shipping,item_condition_id,is_expensive,is_luxurious,text
0,4.382027,hold alyssa frye harness boots 12r sz 7 Frye w...,Women,Shoes,Boots,1,3,0,0,hold alyssa frye harness boots 12r sz 7 Frye w...
1,3.465736,steve madden booties Steve Madden women shoes ...,Women,Shoes,Boots,0,3,0,0,steve madden booties Steve Madden women shoes ...
2,3.891820,bcbg tan booties women shoes boots,Women,Shoes,Boots,0,1,0,0,bcbg tan booties women shoes boots brand new...
3,4.454347,nwt sorel caribou boots size 8 5 women shoes...,Women,Shoes,Boots,0,1,0,0,nwt sorel caribou boots size 8 5 women shoes...
4,5.303305,nib hunter tiffany mint boots size 5 Hunter wo...,Women,Shoes,Boots,0,1,0,0,nib hunter tiffany mint boots size 5 Hunter wo...
...,...,...,...,...,...,...,...,...,...,...
68395,4.262680,lularoe santa tc leggings women athletic app...,Women,Athletic apparel,"Pants, tights, leggings",0,1,0,0,lularoe santa tc leggings women athletic app...
68396,4.262680,gymshark flex v3 marine blue blueberry Gymshar...,Women,Athletic apparel,"Pants, tights, leggings",0,1,0,0,gymshark flex v3 marine blue blueberry Gymshar...
68397,4.262680,lularoe hipster lions tc women athletic appa...,Women,Athletic apparel,"Pants, tights, leggings",1,2,0,0,lularoe hipster lions tc women athletic appa...
68398,4.262680,lularoe bnwt os birds wire women athletic ap...,Women,Athletic apparel,"Pants, tights, leggings",1,1,0,0,lularoe bnwt os birds wire women athletic ap...


In [18]:
y = data['price']

X = data.drop('price', axis = 1)

In [19]:
"""
Generate encoding of all the features and print the final data matrix that will be the input for the ML models
"""
X = generate_encodings(X)

In [20]:
X_train = X[:nrow_train]
Y_train = y[:nrow_train]
X_pred = X[nrow_train:]

In [21]:
X_pred.shape

(31789, 23245)

In [22]:
# splitting from all data for validation purposes
X_traina, X_testa, y_traina, y_testa = train_test_split(X_train, Y_train, test_size = 0.2, random_state = seed)

In [23]:
def get_rmsle(y_true, y_pred):
    """
    Get RMSLE score by passing actual values and the predictions from models
    """
    return np.sqrt(mean_squared_log_error(y_true, y_pred))



#ridge
ridge_model = Ridge(solver='lsqr', fit_intercept=False) #solver='lsqr' reduces time to train significantly
ridge_model.fit(X_traina, y_traina)

predictions = ridge_model.predict(X_testa)
test_score = get_rmsle(np.expm1(y_testa), np.expm1(predictions))
print('Test rmsle explained: ', test_score)

Test rmsle explained:  0.49392962047841893


In [24]:
test_pred = np.expm1(ridge_model.predict(X_pred)) # Inverse transformation
print(type(test_pred))

sub = pd.read_csv('../sample_submission.csv')
sub["price"] = test_pred

sub.to_csv("../submission.csv", index=False)

<class 'numpy.ndarray'>
