# Load modules

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pylab as plt
from wordcloud import WordCloud
from tqdm import tqdm_pandas, tqdm_notebook
import re
import string 

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from collections import Counter

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import csr_matrix, hstack
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

### Options

In [2]:
pd.set_option('display.max_colwidth', 500)
%matplotlib inline
tqdm_pandas(tqdm_notebook())

stop = stopwords.words('english') + list(string.punctuation) + ['rm']
stop.remove('not')
nltk.download('stopwords', download_dir='.')
nltk.download('punkt', download_dir='.')


[nltk_data] Downloading package stopwords to ....
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to ....
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# pandas apply in parrallel  

from multiprocessing import cpu_count, Pool
 
cores = cpu_count() #Number of CPU cores on your system
partitions = cores #Define as many partitions as you want
 
def parallelize_apply(data, func):
    data_split = np.array_split(data, partitions)
    pool = Pool(cores)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data

### Constants

# Load data

In [4]:
train = pd.read_csv('data/train.tsv', sep='\t', index_col=0)
test = pd.read_csv('data/test.tsv', sep='\t', index_col=0)
print('size of train and test dataset: ', train.shape, test.shape)

#defite target, log target and remove it from train data
y_target = train.price
y_target_log = np.log1p(y_target)
train.drop(['price'], axis=1, inplace=True)

#rbind train and test dataset
train['is_train'] = 1
test['is_train'] = 0
all_data = pd.concat([train, test])

print('size of all_data and data_sample dataset: ', all_data.shape)

  mask |= (ar1 == a)


size of train and test dataset:  (1482535, 7) (693359, 6)
size of all_data and data_sample dataset:  (2175894, 7)


# Preprosessing data

### Remove NA

In [5]:
# remove NA in item_description - 4 rows
all_data['item_description'] = all_data['item_description'].fillna(value='No description yet')

# fill NA in brand_name and category_name by 'missing'
all_data[['name', 'category_name', 'brand_name']] = all_data[['name', 'category_name', 'brand_name']].fillna(value='missing', axis=1)
all_data.head()

Unnamed: 0,name,item_condition_id,category_name,brand_name,shipping,item_description,is_train
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,missing,1,No description yet,1
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & Parts,Razer,0,This keyboard is in great condition and works like it came out of the box. All of the ports are tested and work perfectly. The lights are customizable via the Razer Synapse app on your PC.,1
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,1,"Adorable top with a hint of lace and a key hole in the back! The pale pink is a 1X, and I also have a 3X available in white!",1
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,missing,1,New with tags. Leather horses. Retail for [rm] each. Stand about a foot high. They are being sold as a pair. Any questions please ask. Free shipping. Just got out of storage,1
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,missing,0,Complete with certificate of authenticity,1


In [6]:
all_data.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2175894 entries, 0 to 693358
Data columns (total 7 columns):
name                 2175894 non-null object
item_condition_id    2175894 non-null int64
category_name        2175894 non-null object
brand_name           2175894 non-null object
shipping             2175894 non-null int64
item_description     2175894 non-null object
is_train             2175894 non-null int64
dtypes: int64(3), object(4)
memory usage: 132.8+ MB


In [7]:
all_data_without_desc = all_data[all_data.item_description == 'No description yet']
all_data_with_desc = all_data[all_data.item_description != 'No description yet']

print(all_data_without_desc.shape, all_data_with_desc.shape)

(120996, 7) (2054898, 7)


### Preprocess categorical features

In [8]:
# create 3 categorical variables from category_name

def category_split(text):
    if text == 'missing':
        return ("missing", "missing", "missing")
    else:
        try:
            text_split = text.split("/")
            return text_split
        except:
            print(text)
            return ("missing", "missing", "missing")

all_data['category'], all_data['sub_cat1'], all_data['sub_cat2'] = zip(*all_data['category_name'].progress_apply(category_split))




In [9]:
label_enc = LabelEncoder()
all_data['brand_name_bin'] = label_enc.fit_transform(all_data[['brand_name']])
all_data['category_bin'] = label_enc.fit_transform(all_data[['category']])
all_data['sub_cat1_bin'] = label_enc.fit_transform(all_data[['sub_cat1']])
all_data['sub_cat2_bin'] = label_enc.fit_transform(all_data[['sub_cat2']])

all_data.head()

  y = column_or_1d(y, warn=True)


Unnamed: 0,name,item_condition_id,category_name,brand_name,shipping,item_description,is_train,category,sub_cat1,sub_cat2,brand_name_bin,category_bin,sub_cat1_bin,sub_cat2_bin
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,missing,1,No description yet,1,Men,Tops,T-shirts,5265,5,102,773
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & Parts,Razer,0,This keyboard is in great condition and works like it came out of the box. All of the ports are tested and work perfectly. The lights are customizable via the Razer Synapse app on your PC.,1,Electronics,Computers & Tablets,Components & Parts,3889,1,30,215
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,1,"Adorable top with a hint of lace and a key hole in the back! The pale pink is a 1X, and I also have a 3X available in white!",1,Women,Tops & Blouses,Blouse,4588,9,103,97
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,missing,1,New with tags. Leather horses. Retail for [rm] each. Stand about a foot high. They are being sold as a pair. Any questions please ask. Free shipping. Just got out of storage,1,Home,Home Décor,Home Décor Accents,5265,3,55,410
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,missing,0,Complete with certificate of authenticity,1,Women,Jewelry,Necklaces,5265,9,58,542


### Preprocess text features

In [10]:
negatives = {
    "didn't": "did_not",
    "couldn't": "could_not",
    "don't": "do_not",
    "wouldn't": "would_not",
    "doesn't": "does_not",
    "wasn't": "was_not",
    "weren't": "were_not",
    "shouldn't":"should_not",
    "isn't": "is_not",
    "aren't": "are_not",
}

regex = re.compile('[' +re.escape(string.punctuation) + '0-9\\r\\t\\n]')

def tokenize_text(text, treshold=2):
    return [i for i in text.split(' ') if i not in stop and len(i) > treshold]

def data_apply_tokenize_text(data):
    return data.apply(tokenize_text)

def data_apply_len(data):
    return data.apply(len)

def clear_text(text, treshold=2):
    text = regex.sub(" ", text.lower()) # remove punctuation
    text = re.sub("\s\s+" , " ", text) # remove multiple spacas
    for k, v in negatives.items():
        text = text.replace(k, v)
    return text

def data_apply_clear_text(data):
    return data.apply(clear_text)

In [11]:
%%time

all_data['desc_clear'] = parallelize_apply(all_data.item_description, data_apply_clear_text)
all_data['name_clear'] = parallelize_apply(all_data.name, data_apply_clear_text)

CPU times: user 1.94 s, sys: 788 ms, total: 2.73 s
Wall time: 10.4 s


In [12]:
%%time

all_data['desc_tokens'] = parallelize_apply(all_data.desc_clear, data_apply_tokenize_text)
all_data['name_tokens'] = parallelize_apply(all_data.name_clear, data_apply_tokenize_text)
all_data['word_count_desc'] = parallelize_apply(all_data.desc_tokens, data_apply_len)
all_data['word_count_name'] = parallelize_apply(all_data.name_tokens, data_apply_len)

CPU times: user 22.1 s, sys: 3.08 s, total: 25.2 s
Wall time: 1min 2s


In [13]:
all_data.head()

Unnamed: 0,name,item_condition_id,category_name,brand_name,shipping,item_description,is_train,category,sub_cat1,sub_cat2,brand_name_bin,category_bin,sub_cat1_bin,sub_cat2_bin,desc_clear,name_clear,desc_tokens,name_tokens,word_count_desc,word_count_name
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,missing,1,No description yet,1,Men,Tops,T-shirts,5265,5,102,773,no description yet,mlb cincinnati reds t shirt size xl,"[description, yet]","[mlb, cincinnati, reds, shirt, size]",2,5
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & Parts,Razer,0,This keyboard is in great condition and works like it came out of the box. All of the ports are tested and work perfectly. The lights are customizable via the Razer Synapse app on your PC.,1,Electronics,Computers & Tablets,Components & Parts,3889,1,30,215,this keyboard is in great condition and works like it came out of the box all of the ports are tested and work perfectly the lights are customizable via the razer synapse app on your pc,razer blackwidow chroma keyboard,"[keyboard, great, condition, works, like, came, box, ports, tested, work, perfectly, lights, customizable, via, razer, synapse, app]","[razer, blackwidow, chroma, keyboard]",17,4
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,1,"Adorable top with a hint of lace and a key hole in the back! The pale pink is a 1X, and I also have a 3X available in white!",1,Women,Tops & Blouses,Blouse,4588,9,103,97,adorable top with a hint of lace and a key hole in the back the pale pink is a x and i also have a x available in white,ava viv blouse,"[adorable, top, hint, lace, key, hole, back, pale, pink, also, available, white]","[ava, viv, blouse]",12,3
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,missing,1,New with tags. Leather horses. Retail for [rm] each. Stand about a foot high. They are being sold as a pair. Any questions please ask. Free shipping. Just got out of storage,1,Home,Home Décor,Home Décor Accents,5265,3,55,410,new with tags leather horses retail for rm each stand about a foot high they are being sold as a pair any questions please ask free shipping just got out of storage,leather horse statues,"[new, tags, leather, horses, retail, stand, foot, high, sold, pair, questions, please, ask, free, shipping, got, storage]","[leather, horse, statues]",17,3
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,missing,0,Complete with certificate of authenticity,1,Women,Jewelry,Necklaces,5265,9,58,542,complete with certificate of authenticity,k gold plated rose,"[complete, certificate, authenticity]","[gold, plated, rose]",3,3


### tf-idf

In [14]:
all_data = all_data[all_data.is_train == 1].copy().drop(['is_train'], axis=1)

In [15]:
%%time

tf_idf = TfidfVectorizer(min_df=0.001,
                     ngram_range=(1, 3),
                    tokenizer=tokenize_text)
X_description = tf_idf.fit_transform(all_data['desc_clear'])

CPU times: user 2min 58s, sys: 1.13 s, total: 2min 59s
Wall time: 2min 59s


In [16]:
%%time

X_name = tf_idf.fit_transform(all_data['name_clear'])

CPU times: user 36.5 s, sys: 104 ms, total: 36.6 s
Wall time: 36.6 s


In [15]:
num_cols = ['item_condition_id', 'shipping', 'word_count_desc', 'word_count_name', 
                      'brand_name_bin', 'category_bin', 'sub_cat1_bin', 'sub_cat2_bin']

all_data[num_cols] = MinMaxScaler().fit_transform(all_data[num_cols])
X_numeric = all_data[num_cols].copy()

In [18]:
tsvd_desc = TruncatedSVD(n_components=200)
X_description_svd = tsvd_desc.fit_transform(X_description)

In [19]:
tsvd_name = TruncatedSVD(n_components=30)
X_name_svd = tsvd_name.fit_transform(X_name)

In [20]:
data_ok = np.hstack((X_description_svd, X_name_svd, X_numeric))
data_ok.shape

(1482535, 238)

In [21]:
train_X, valid_X, train_y, valid_y = train_test_split(data_ok, y_target_log, random_state=42, test_size=0.05) 

In [24]:
%%time

param_grid_linear = {'normalize':[True]}
cv_linear = GridSearchCV(LinearRegression(), param_grid_linear, scoring='neg_mean_squared_error', cv=5,
               n_jobs=-1, verbose=1)
cv_linear.fit(train_X, train_y)
print(cv_linear.best_score_)

TypeError: __init__() got an unexpected keyword argument 'random_state'

In [23]:
preds_linear = cv_linear.best_estimator_.predict(valid_X)
print(mean_squared_error(valid_y, preds_linear))

0.416033968489


In [25]:
%%time

param_grid_rf = {'n_estimators':[100],
                'max_depth':[5]}
cv_rf = GridSearchCV(RandomForestRegressor(), param_grid_rf, scoring='neg_mean_squared_error', cv=5,
               n_jobs=-1, verbose=1)
cv_rf.fit(train_X, train_y)

print(cv_linear.best_score_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


KeyboardInterrupt: 

In [26]:
preds_rf = cv_rf.best_estimator_.predict(valid_X)
print(mean_squared_error(valid_y, preds_rf))

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [None]:
%%time

param_grid_gb = {'n_estimators':[50,100,300],
                'max_depth':[10, 20]}
cv_gb = GridSearchCV(GradientBoostingRegressor(), param_grid_gb, scoring='neg_mean_squared_error', cv=5,
               n_jobs=-1, verbose=1)
cv_gb.fit(data_ok, y_target_log)
print(cv_gb.best_score_)

In [None]:
preds_gb = cv_gb.best_estimator_.predict(valid_X)
print(mean_squared_error(valid_y, preds_gb))

In [None]:
import lightgbm as lgb

d_train = lgb.Dataset(train_X, label=train_y)#, max_bin=8192)
d_valid = lgb.Dataset(valid_X, label=valid_y)#, max_bin=8192)
watchlist = [d_train, d_valid]

params = {
    'learning_rate': 0.78,
    'application': 'regression',
    'max_depth': 8,
    'num_leaves': 90,
    'verbosity': -1,
    'metric': 'RMSE',
    'nthread': 4
}

params2 = {
    'learning_rate': 0.88,
    'application': 'regression',
    'max_depth': 6,
    'num_leaves': 50,
    'verbosity': -1,
    'metric': 'RMSE',
    'nthread': 4
}

model = lgb.train(params, train_set=d_train, num_boost_round=7500, valid_sets=watchlist, \
early_stopping_rounds=50, verbose_eval=250) 


In [None]:
model2 = lgb.train(params2, train_set=d_train, num_boost_round=7500, valid_sets=watchlist, \
early_stopping_rounds=50, verbose_eval=250) 

In [99]:
from sklearn.feature_extraction.text import HashingVectorizer

hash_vect = HashingVectorizer(decode_error='ignore', n_features=2 ** 16,
                               alternate_sign=False)

In [82]:
%%time

X_name = tf_idf.fit_transform(all_data['name_clear'])
X_name.shape

CPU times: user 3min 24s, sys: 488 ms, total: 3min 25s
Wall time: 3min 24s


In [100]:
%%time

X_name = hash_vect.fit_transform(all_data['name_clear'])
X_name.shape

CPU times: user 6.8 s, sys: 8 ms, total: 6.81 s
Wall time: 6.81 s


In [101]:
X_name

<2175894x65536 sparse matrix of type '<class 'numpy.float64'>'
	with 8075938 stored elements in Compressed Sparse Row format>

In [88]:
all_name_words = [arg for line in all_data.name_tokens for arg in line]

In [95]:
np.sort(np.unique(all_name_words, return_counts=True))

array([['aaa', 'aaaa', 'aaaaa', ..., '，carolina', '：）iphone', '�birthday'],
       ['1', '1', '1', ..., '997', '998', '998']],
      dtype='<U40')

In [92]:
len(all_name_words)

8097783

In [27]:
sample_subm = pd.read_csv('data/sample_submission.csv')
sample_subm.head()

Unnamed: 0,test_id,price
0,0,26.738
1,1,26.738
2,2,26.738
3,3,26.738
4,4,26.738


In [28]:
test.head()

Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,shipping,item_description,is_train
test_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7,0
1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined with bubble wrap for protection Self Sealing (peel-and-seal), adhesive keeps contents secure and tamper proof Durable and lightweight Kraft material helps save on postage Approved by UPS, FedEx, and USPS.",0
2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coach outlet.,0
3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and perfect for hot weather,0
4,Life after Death,3,Other/Books/Religion & Spirituality,,1,"Rediscovering life after the loss of a loved one by Tony Cooke. Paperback in good condition 2003. ❤ ❤ Bundle and save! ❤ ❤ Book, death, grief, bereavement SHLF.SW.5.15",0


In [29]:
preds_linear

array([ 2.32569771,  2.62552226,  2.31959537, ...,  2.98805236,
        2.83426555,  2.68013737])

In [30]:
sample_subm = pd.DataFrame({'test_id':test.index, 'price':preds_linear})
sample_subm.head()

ValueError: arrays must all be same length