### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from sklearn.model_selection import train_test_split
import re
import gc

import warnings
warnings.filterwarnings('ignore')
from nltk.corpus import stopwords

import pandas as pd
import numpy as np
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
%matplotlib inline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


from sklearn.metrics import mean_squared_error as mse
from math import sqrt
from sklearn.linear_model import Ridge

from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
  
# English stop words from NLTK, stored as a set for membership tests in
# process_text. Requires the NLTK "stopwords" corpus to be available locally.
STOPWORDS = set(stopwords.words('english'))
# NOTE(review): `seed` is not referenced by any visible code; the split and the
# Ridge models below pass random_state=42 literally.
seed = 42

### Process Raw Data

In [None]:
# Ordered (pattern, replacement) rules. Order matters: the irregular forms
# "won't" and "can't" must be rewritten before the generic "n't" rule, and
# "n't" before the bare "'t" rule. Patterns are case-insensitive so that
# capitalised contractions ("Won't", "CAN'T") are expanded correctly instead
# of being mangled by the generic rules (e.g. "Won't" -> "Wo not").
_CONTRACTION_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in (
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
)


def decontracted(text):
    """Expand common English contractions in `text`.

    Args:
        text: The string to rewrite.

    Returns:
        A new string with contractions replaced by their expanded forms
        (e.g. "won't" -> "will not", "they're" -> "they are"). Matching is
        case-insensitive; the inserted replacement words are lower-case.
        Note that "'s" is always expanded to " is", including possessives.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    return text

In [None]:
def process_text(data, cols):
    """Clean the text columns `cols` of `data`, overwriting them in place.

    Each value is passed through `decontracted`, has the literal escape
    sequences backslash-r, backslash-quote and backslash-n replaced by spaces,
    is reduced to alphanumeric tokens, lower-cased, and finally filtered
    against STOPWORDS.

    Args:
        data: DataFrame holding the columns to clean. It is mutated.
        cols: Iterable of column names; every value must be a string.

    Returns:
        The same `data` object, with each listed column replaced by its
        cleaned version.
    """
    non_alnum = re.compile('[^A-Za-z0-9]+')  # hoisted: invariant across rows

    for col in cols:
        cleaned = []
        for sentence in data[col].values:
            sent = decontracted(sentence)
            # These are literal two-character sequences (a backslash followed
            # by a letter/quote) left in the raw text, not control characters.
            for escaped in ('\\r', '\\"', '\\n'):
                sent = sent.replace(escaped, ' ')
            sent = non_alnum.sub(' ', sent)
            # Lower-case BEFORE the stop-word filter: NLTK's English stop
            # words are lower-case, so filtering first would let capitalised
            # stop words such as "The" or "And" slip through.
            tokens = (tok for tok in sent.lower().split() if tok not in STOPWORDS)
            cleaned.append(' '.join(tokens))
        data[col] = cleaned

    return data

In [None]:
def preprocess(data):
    """Fill missing values, split the category path and build text features.

    Steps:
      1. Fill missing PRODUCT_NAME / PRODUCT_DESCRIPTION / PRODUCT_BRAND /
         CATEGORY values (in place on `data`).
      2. Split CATEGORY on '/' into CATEGORY_0, CATEGORY_1 and CATEGORY_2.
      3. Clean PRODUCT_NAME and PRODUCT_DESCRIPTION with `process_text`.
      4. Build NAME (name + brand) and DESC (name + description).
      5. Drop PRODUCT_BRAND, PRODUCT_DESCRIPTION and CATEGORY.

    Args:
        data: DataFrame with the columns named above. It is mutated by the
            fill and by the added columns.

    Returns:
        A DataFrame with the derived columns added and the three source
        columns removed.
    """
    data.fillna({'PRODUCT_NAME':'missing', 'PRODUCT_DESCRIPTION':'missing', 'PRODUCT_BRAND':' ', 'CATEGORY':'other/other/other'}, inplace=True)

    def category_level(value, level):
        """Return the `level`-th '/'-separated part of `value`, else 'other'."""
        # 'other' matches the placeholder used for a wholly missing CATEGORY.
        # Returning NaN here would make CountVectorizer in encode() raise,
        # because it only accepts string documents.
        if not isinstance(value, str):
            return 'other'
        parts = value.split('/')
        return parts[level] if level < len(parts) else 'other'

    for level in range(3):
        data['CATEGORY_' + str(level)] = data['CATEGORY'].apply(category_level, args=(level,))

    data = process_text(data, ['PRODUCT_NAME', 'PRODUCT_DESCRIPTION'])

    # PRODUCT_BRAND is appended as-is; it is not passed through process_text.
    data['NAME'] = data['PRODUCT_NAME'] + ' ' + data['PRODUCT_BRAND']
    data['DESC'] = data['PRODUCT_NAME'] + ' ' + data['PRODUCT_DESCRIPTION']

    return data.drop(columns=['PRODUCT_BRAND', 'PRODUCT_DESCRIPTION', 'CATEGORY'])

In [None]:
def encode(train, test, valid):
    """Turn the three preprocessed splits into sparse feature matrices.

    Every transformer is fitted on `train` only and then applied to `test`
    and `valid`, so all three outputs share the same columns. Feature blocks
    are stacked in this order: CATEGORY_0, CATEGORY_1, CATEGORY_2 (bag of
    words), SHIPPING_AVAILABILITY + PRODUCT_CONDITION (dummy/numeric), NAME
    (TF-IDF), DESC (TF-IDF).

    Args:
        train: Training DataFrame (used to fit every transformer).
        test: Test DataFrame.
        valid: Validation DataFrame.

    Returns:
        Tuple `(train_data, test_data, valid_data)` of float32 CSR matrices.
        Note the order: test comes before valid.
    """
    def fit_on_train(vectorizer, column):
        """Fit `vectorizer` on train[column] and transform all three splits."""
        return (
            vectorizer.fit_transform(train[column].values),
            vectorizer.transform(test[column].values),
            vectorizer.transform(valid[column].values),
        )

    # Each entry of `blocks` is a (train, test, valid) triple for one feature group.
    blocks = [
        fit_on_train(CountVectorizer(), 'CATEGORY_' + str(level))
        for level in range(3)
    ]

    flag_cols = ['SHIPPING_AVAILABILITY', 'PRODUCT_CONDITION']
    dummy_cols = pd.get_dummies(train[flag_cols]).columns

    def encode_flags(frame):
        """Dummy-encode the flag columns using the training column layout."""
        # Re-index to the training layout: a level missing from one split
        # would otherwise yield fewer columns and shift every later feature.
        aligned = pd.get_dummies(frame[flag_cols]).reindex(columns=dummy_cols, fill_value=0)
        # Cast before .values so mixed bool/int columns do not become an
        # object array, which csr_matrix cannot take.
        return csr_matrix(aligned.astype('float32').values)

    blocks.append((encode_flags(train), encode_flags(test), encode_flags(valid)))

    for column in ('NAME', 'DESC'):
        tfidf = TfidfVectorizer(min_df = 10, ngram_range = (1, 1), max_features = 1000000)
        blocks.append(fit_on_train(tfidf, column))

    # zip(*blocks) regroups the triples into one tuple of blocks per split.
    train_data, test_data, valid_data = (
        hstack(split_blocks).tocsr().astype('float32')
        for split_blocks in zip(*blocks)
    )

    # Release the per-feature intermediates before returning the stacked copies.
    del blocks
    gc.collect()

    return train_data, test_data, valid_data

## Data 

In [None]:
# Load the raw train and test tables from CSV files in the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [None]:
# Report (rows, columns) of each raw table, train first.
for frame in (train, test):
    print(frame.shape)

### Processing Data

In [None]:
# Keep handles on the raw brand, description and price columns before
# preprocess() drops PRODUCT_BRAND and PRODUCT_DESCRIPTION from `train`.
brand_name = train["PRODUCT_BRAND"]
desc_data = train["PRODUCT_DESCRIPTION"]
price_data = train["PRODUCT_PRICE"]
# log(1 + price), used by the EDA plots. np.log1p matches how the training
# target is computed before the train/validation split.
train["log_Price"] = np.log1p(train["PRODUCT_PRICE"])

In [None]:
# Clean the training table and derive CATEGORY_0..2, NAME and DESC.
train = preprocess(train)

In [None]:
# Apply the same cleaning and feature derivation to the test table.
test = preprocess(test)

## Exploratory Data Analysis

In [None]:
# Item count per top-level category, plus each count as a share of all rows.
cat0_counts = train['CATEGORY_0'].value_counts()
x = cat0_counts.index.values.astype('str')
y = cat0_counts.values
pct = ["%.2f" % (share * 100) + "%" for share in y / len(train)]

In [None]:
# Side-by-side view of CATEGORY_0: log-price box plots (left) and item counts
# (right), both using the same alphabetical category order.
cat_0 = train.CATEGORY_0
order = sorted(cat_0.unique())
fig, ax = plt.subplots(1, 2, figsize = (15, 10))
sns.boxplot(x = train.log_Price, y = cat_0, orient = "h", order = order, ax = ax[0])
ax[0].set_title("Log Price Base On CATEGORY_0", fontsize = 20)
ax[0].set_ylabel("Categories", fontsize = 15)
sns.barplot(x = cat_0.value_counts().values,y =  cat_0.value_counts().index, order = order, ax = ax[1])
ax[1].set_title("Number of items in each category", fontsize = 20)
plt.show()

In [None]:
# Interactive bar chart of the counts in `y` per label in `x`; the percentage
# strings in `pct` are attached as the bar text.
trace1 = go.Bar(x=x, y=y, text=pct)
layout = dict(title= 'Item Distribution in Category_0',
              yaxis = dict(title='Count'),
              xaxis = dict(title='Category_0'))
fig=dict(data=[trace1], layout=layout)
py.iplot(fig)

In [None]:
# Labels, counts and row-share percentages of the 15 most frequent
# CATEGORY_1 values (value_counts sorts by descending frequency).
cat1_counts = train['CATEGORY_1'].value_counts()
x = cat1_counts.index.values.astype('str')[:15]
y = cat1_counts.values[:15]
pct = ["%.2f" % (share * 100) + "%" for share in y / len(train)]

In [None]:
# Interactive bar chart of the counts prepared in `x`/`y`/`pct`; bars are
# coloured by their count on the 'Portland' colour scale.
trace1 = go.Bar(x=x, y=y, text=pct,
                marker=dict(
                color = y,colorscale='Portland',showscale=True,
                reversescale = False
                ))
layout = dict(title= 'Item Distribution in Category 1 (Top 15)',
              yaxis = dict(title='Count'),
              xaxis = dict(title='Category_1'))
fig=dict(data=[trace1], layout=layout)
py.iplot(fig)

In [None]:
# Labels, counts and row-share percentages of the 15 most frequent
# CATEGORY_2 values (value_counts sorts by descending frequency).
cat2_counts = train['CATEGORY_2'].value_counts()
x = cat2_counts.index.values.astype('str')[:15]
y = cat2_counts.values[:15]
pct = ["%.2f" % (share * 100) + "%" for share in y / len(train)]

In [None]:
# Interactive bar chart of the counts prepared in `x`/`y`/`pct`; bars are
# coloured by their count on the 'Portland' colour scale.
trace1 = go.Bar(x=x, y=y, text=pct,
                marker=dict(
                color = y,colorscale='Portland',showscale=True,
                reversescale = False
                ))
layout = dict(title= 'Item Distribution in Category 2 (Top 15)',
              yaxis = dict(title='Count'),
              xaxis = dict(title='Category_2'))
fig=dict(data=[trace1], layout=layout)
py.iplot(fig)

In [None]:
# One price Series per distinct CATEGORY_0 value, in order of first appearance.
Cat_0 = train['CATEGORY_0'].unique()
prices = train['PRODUCT_PRICE']
x = [prices[train['CATEGORY_0'] == cat] for cat in Cat_0]

In [None]:
# One horizontal box trace of log(price + 1) per top-level category.
data = [
    go.Box(x=np.log(cat_prices + 1), name=cat_name)
    for cat_name, cat_prices in zip(Cat_0, x)
]

In [None]:
# The traces in `data` pass log(price + 1) as `x`, so the boxes are horizontal:
# the x axis carries the log price and the y axis lists the categories. The
# axis titles are set accordingly.
layout = dict(title="Price Distribution by Category_0",
              yaxis = dict(title='Category_0'),
              xaxis = dict(title='log(PRODUCT_PRICE + 1)'))
fig = dict(data=data, layout=layout)
py.iplot(fig)

In [None]:
# Left panel: histogram of the raw price, restricted to the 0-250 range.
plt.subplot(1, 2, 1)
(train['PRODUCT_PRICE']).plot.hist(bins=50, figsize=(15,8),range=[0,250])
# The label names the plotted column exactly (no stray trailing '+').
plt.xlabel('PRODUCT_PRICE', fontsize=12)
plt.ylabel('frequency', fontsize=12)
plt.tick_params(labelsize=12)
plt.title('Price Distribution - Training Set', fontsize=17)

# Right panel: histogram of log(price + 1) over the full range.
plt.subplot(1, 2, 2)
np.log(train['PRODUCT_PRICE']+1).plot.hist(bins=50, figsize=(15,8))
plt.xlabel('log(PRODUCT_PRICE+1)', fontsize=12)
plt.ylabel('frequency', fontsize=12)
plt.tick_params(labelsize=12)
plt.title('Log(Price) Distribution - Training Set', fontsize=12)
plt.show()

In [None]:
# Share of training rows per SHIPPING_AVAILABILITY value. The denominator is
# the full row count, so rows with a missing value are not counted in any share.
train.SHIPPING_AVAILABILITY.value_counts()/len(train)

In [None]:
# Overlaid log-price histograms for SHIPPING_AVAILABILITY == 1 and == 0.
# NOTE(review): the legend assumes 1 means free shipping — confirm against the
# dataset description.
plt.figure(figsize = (15, 8))
plt.hist(train[train.SHIPPING_AVAILABILITY == 1].log_Price, bins = 50, alpha = 0.5, label = "log price with free shipping")
plt.hist(train[train.SHIPPING_AVAILABILITY == 0].log_Price, bins = 50, alpha = 0.5, label = "log price with shipping")
plt.legend(fontsize = 10)
plt.show()

In [None]:
# Number of distinct CATEGORY_0 values (nunique ignores missing values).
print("There are %d unique values in the category 0 column." % train['CATEGORY_0'].nunique())

In [None]:
# Number of distinct CATEGORY_1 values (nunique ignores missing values).
print("There are %d unique values in category 1." % train['CATEGORY_1'].nunique())

In [None]:
# Number of distinct CATEGORY_2 values (nunique ignores missing values).
print("There are %d unique values in category 2." % train['CATEGORY_2'].nunique())

In [None]:
# One log-price histogram per PRODUCT_CONDITION value 1..5, left to right.
# (`fog` holds the Figure and is not used again in this cell.)
fog, ax = plt.subplots(1, 5, figsize = (15, 8))
for condition, axis in enumerate(ax, start=1):
    condition_rows = train[train.PRODUCT_CONDITION == condition]
    condition_rows.log_Price.plot.hist(ax=axis, bins=50, alpha=0.5)
    axis.set_xlabel("Log Price")
    axis.set_title("Item Condition Id = " + " " + str(condition))
plt.show()

In [None]:
# Frequency of each brand in the raw brand column captured before
# preprocessing, sorted by descending count.
brand = brand_name.value_counts()
print("There are", brand.size, "unique known brands.")

In [None]:
# Bar chart of the brands ranked 2nd to 11th by frequency; the most frequent
# entry (position 0) is skipped.
# NOTE(review): presumably position 0 is the ' ' placeholder that preprocess()
# fills in for missing brands. That only holds if the in-place fillna is
# visible through `brand_name`; otherwise the top real brand is dropped here.
# Verify with brand.head().
plt.figure(figsize = (10, 10))
sns.barplot(x = brand[1:11].values, y = brand[1:11].index)
plt.title("Top 10 known brand in store")
plt.show()

In [None]:
# Log-price density per PRODUCT_CONDITION (1..5) for four selected brands,
# one brand per panel of a 2x2 grid (filled row by row).
brands = ["Nike","PINK",  "Victoria's Secret", "LuLaRoe"]
nbrand = len(brands)

fig, ax = plt.subplots(2, 2, figsize = (15, 10))
# The loop variable is deliberately not called `brand`, so the `brand`
# value-counts Series built earlier is not overwritten.
for axis, brand_label in zip(ax.flat, brands):
    is_brand = brand_name == brand_label
    for condition in range(1, 6):
        # Combine both conditions into one mask over `train` instead of
        # indexing the already-filtered frame with a full-length mask.
        selected = is_brand & (train["PRODUCT_CONDITION"] == condition)
        sns.distplot(train.loc[selected, "log_Price"], hist = False,
                     label = "PRODUCT_CONDITION = " + " " + str(condition), ax = axis)
    axis.set_xlabel("Log Price")
    axis.set_title("Price of " + brand_label + " in each item condition")
plt.show()

In [None]:
# Ten CATEGORY_1 values with the highest mean price, as a two-column frame
# (CATEGORY_1, PRODUCT_PRICE) sorted by descending mean.
mean_cat1_price = (
    train.groupby(["CATEGORY_1"])["PRODUCT_PRICE"]
    .mean()
    .to_frame()
    .sort_values(by="PRODUCT_PRICE", ascending=False)
    .head(10)
    .reset_index()
)

plt.figure(figsize=(10, 5))
sns.barplot(data=mean_cat1_price, x="PRODUCT_PRICE", y="CATEGORY_1", orient="h")
plt.title("Mean Price Base On CATEGORY_1 (Top 10)", fontsize=20)
plt.ylabel("Subcategories", fontsize=20)
plt.xlabel("Price", fontsize=20)
plt.show()

In [None]:
# Regression target: log(1 + price). Both price-derived columns are then
# removed from the features before the 85/15 train/validation split.
y_tr = np.log1p(train['PRODUCT_PRICE'])
train.drop(columns=['PRODUCT_PRICE', 'log_Price'], inplace=True)
X_train, X_valid, y_train, y_valid = train_test_split(
    train, y_tr, test_size=0.15, random_state=42
)

print('Train size: {}, Valid size: {}, Test size: {}' .format(y_train.shape, y_valid.shape, test.shape))

In [None]:
# Preview the feature table after the price columns were dropped.
train.head()

### Generating Encodings

In [None]:
# Replace the split DataFrames with their encoded feature matrices. encode()
# takes and returns the splits in (train, test, valid) order.
X_train, X_test, X_valid = encode(X_train, test, X_valid)

In [None]:
# Shapes of the encoded matrices; the column counts should match across splits.
print('Train size: {}, valid size: {}, Test size: {}' .format(X_train.shape, X_valid.shape, X_test.shape))

In [None]:
# Sweep the Ridge regularisation strength: `x` is the grid of alpha values and
# `y` collects the validation error for each one, in the same order.
x = [1.25,1.5,1.75,2,2.1,2.2,2.5,3]
y = [] 
for i in tqdm(x):
    model = Ridge(solver="auto", random_state=42, alpha=i)
    model.fit(X_train, y_train)
    valid_preds = model.predict(X_valid)
    # The targets are log1p(price), so the RMSE computed here is the RMSLE of the price.
    loss = (sqrt(mse(y_valid, valid_preds)))
    y.append(loss)
    print ('RMSLE for alpha = ',i,'is',loss)
    
# NOTE: `Alpha` is the INDEX of the best entry in `x`, not the alpha value itself.
Alpha = np.argmin(y)

In [None]:
# Validation error against alpha, with each point annotated as
# (alpha, error rounded to 3 decimals).
fig, ax = plt.subplots()
ax.plot(x, y)
ax.scatter(x, y)
for i, txt in enumerate(np.round(y,3)):
    ax.annotate((x[i],np.round(txt,3)), (x[i],y[i]))

plt.title("RMSLE for each alpha")
plt.xlabel("Alpha")
plt.ylabel("Error")
plt.show()

In [None]:
# Refit on the training split with the best alpha from the sweep and report
# the errors on the training and validation splits.
print("Best alpha: ",  x[Alpha])
model = Ridge(solver="auto", random_state=42, alpha=x[Alpha])
model.fit(X_train, y_train)

preds_tr = model.predict(X_train)
preds_va = model.predict(X_valid)
# Test predictions from the model fitted on the training split only.
preds_te = model.predict(X_test)
print('Train RMSLE:', sqrt(mse(y_train, preds_tr)))

# This is the error on the single held-out validation split (no k-fold is run).
error = sqrt(mse(y_valid, preds_va))
print("Cross validation RMSLE: ", error)

In [None]:
# Final model: refit on train + validation rows with the alpha selected by the
# sweep (x[Alpha]), rather than a hard-coded value that can drift from the
# sweep result when the data or the grid changes.
model = Ridge(solver="auto", random_state=42, alpha=x[Alpha])
# Stack features and targets in the same order (train rows, then validation rows).
x_train = sp.vstack((X_train, X_valid))
Y_train = pd.concat([y_train, y_valid])
model.fit(x_train, Y_train)

preds_te = model.predict(X_test)
# Invert the log1p target transform to get prices back.
result = np.expm1(preds_te)

In [None]:
# Write the predicted prices, keyed by PRODUCT_ID, to submission.csv without
# the DataFrame index column.
file = pd.DataFrame({'PRODUCT_ID' : test.PRODUCT_ID.values, 'PRODUCT_PRICE' : result})
file.to_csv('submission.csv', index = False)