# Load modules

In [71]:
import pandas as pd
import numpy as np
from matplotlib import pylab as plt
from wordcloud import WordCloud
from tqdm import tqdm_pandas, tqdm_notebook
import re
import string 

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from collections import Counter

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### Options

In [54]:
pd.set_option('display.max_colwidth', 500)
%matplotlib inline
tqdm_pandas(tqdm_notebook())

stop = stopwords.words('english') + list(string.punctuation) + ['rm']
stop.remove('not')
nltk.download('stopwords', download_dir='.')
nltk.download('punkt', download_dir='.')



[nltk_data] Downloading package stopwords to ....
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to ....
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# pandas apply in parrallel  

from multiprocessing import cpu_count, Pool
 
cores = cpu_count() #Number of CPU cores on your system
partitions = cores #Define as many partitions as you want
 
def parallelize_apply(data, func):
    data_split = np.array_split(data, partitions)
    pool = Pool(cores)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data

### Constants

# Load data

In [12]:
train = pd.read_csv('data/train.tsv', sep='\t', index_col=0)
test = pd.read_csv('data/test.tsv', sep='\t', index_col=0)
print('size of train and test dataset: ', train.shape, test.shape)

#defite target, log target and remove it from train data
y_target = train.price
y_target_log = np.log1p(y_target)
train.drop(['price'], axis=1, inplace=True)

#rbind train and test dataset
train['is_train'] = 1
test['is_train'] = 0
all_data = pd.concat([train, test])

print('size of all_data and data_sample dataset: ', all_data.shape)

  mask |= (ar1 == a)


size of train and test dataset:  (1482535, 7) (693359, 6)
size of all_data and data_sample dataset:  (2175894, 7)


# Preprosessing data

### Remove NA

In [15]:
# remove NA in item_description - 4 rows
all_data['item_description'] = all_data['item_description'].fillna(value='No description yet')

# fill NA in brand_name and category_name by 'missing'
all_data[['name', 'category_name', 'brand_name']] = all_data[['name', 'category_name', 'brand_name']].fillna(value='missing', axis=1)
all_data.head()

Unnamed: 0,name,item_condition_id,category_name,brand_name,shipping,item_description,is_train
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,missing,1,No description yet,1
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & Parts,Razer,0,This keyboard is in great condition and works like it came out of the box. All of the ports are tested and work perfectly. The lights are customizable via the Razer Synapse app on your PC.,1
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,1,"Adorable top with a hint of lace and a key hole in the back! The pale pink is a 1X, and I also have a 3X available in white!",1
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,missing,1,New with tags. Leather horses. Retail for [rm] each. Stand about a foot high. They are being sold as a pair. Any questions please ask. Free shipping. Just got out of storage,1
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,missing,0,Complete with certificate of authenticity,1


In [16]:
all_data.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2175894 entries, 0 to 693358
Data columns (total 7 columns):
name                 2175894 non-null object
item_condition_id    2175894 non-null int64
category_name        2175894 non-null object
brand_name           2175894 non-null object
shipping             2175894 non-null int64
item_description     2175894 non-null object
is_train             2175894 non-null int64
dtypes: int64(3), object(4)
memory usage: 132.8+ MB


In [17]:
all_data_without_desc = all_data[all_data.item_description == 'No description yet']
all_data_with_desc = all_data[all_data.item_description != 'No description yet']

print(all_data_without_desc.shape, all_data_with_desc.shape)

(120996, 7) (2054898, 7)


### Preprocess categorical features

In [18]:
# create 3 categorical variables from category_name

def category_split(text):
    if text == 'missing':
        return ("missing", "missing", "missing")
    else:
        try:
            text_split = text.split("/")
            return text_split
        except:
            print(text)
            return ("missing", "missing", "missing")

all_data['category'], all_data['sub_cat1'], all_data['sub_cat2'] = zip(*all_data['category_name'].progress_apply(category_split))




In [69]:
label_enc = LabelEncoder()
all_data['brand_name_bin'] = label_enc.fit_transform(all_data[['brand_name']])
all_data['category_bin'] = label_enc.fit_transform(all_data[['category']])
all_data['sub_cat1_bin'] = label_enc.fit_transform(all_data[['sub_cat1']])
all_data['sub_cat2_bin'] = label_enc.fit_transform(all_data[['sub_cat2']])

all_data.head()

  y = column_or_1d(y, warn=True)


Unnamed: 0,name,item_condition_id,category_name,brand_name,shipping,item_description,is_train,category,sub_cat1,sub_cat2,desc_tokens,name_tokens,word_count_desc,word_count_name,brand_name_bin,category_bin,sub_cat1_bin,sub_cat2_bin
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,missing,1,No description yet,1,Men,Tops,T-shirts,"[description, yet]","[mlb, cincinnati, reds, shirt, size]",2,5,5265,5,102,773
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & Parts,Razer,0,This keyboard is in great condition and works like it came out of the box. All of the ports are tested and work perfectly. The lights are customizable via the Razer Synapse app on your PC.,1,Electronics,Computers & Tablets,Components & Parts,"[keyboard, great, condition, works, like, came, box, ports, tested, work, perfectly, lights, customizable, via, razer, synapse, app]","[razer, blackwidow, chroma, keyboard]",17,4,3889,1,30,215
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,1,"Adorable top with a hint of lace and a key hole in the back! The pale pink is a 1X, and I also have a 3X available in white!",1,Women,Tops & Blouses,Blouse,"[adorable, top, hint, lace, key, hole, back, pale, pink, also, available, white]","[ava, viv, blouse]",12,3,4588,9,103,97
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,missing,1,New with tags. Leather horses. Retail for [rm] each. Stand about a foot high. They are being sold as a pair. Any questions please ask. Free shipping. Just got out of storage,1,Home,Home Décor,Home Décor Accents,"[new, tags, leather, horses, retail, stand, foot, high, sold, pair, questions, please, ask, free, shipping, got, storage]","[leather, horse, statues]",17,3,5265,3,55,410
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,missing,0,Complete with certificate of authenticity,1,Women,Jewelry,Necklaces,"[complete, certificate, authenticity]","[gold, plated, rose]",3,3,5265,9,58,542


### Preprocess text features

In [63]:
negatives = {
    "didn't": "did_not",
    "couldn't": "could_not",
    "don't": "do_not",
    "wouldn't": "would_not",
    "doesn't": "does_not",
    "wasn't": "was_not",
    "weren't": "were_not",
    "shouldn't":"should_not",
    "isn't": "is_not",
    "aren't": "are_not",
}

regex = re.compile('[' +re.escape(string.punctuation) + '0-9\\r\\t\\n]')

def tokenize_text(text, treshold=2):
    text = regex.sub(" ", text.lower()) # remove punctuation
    text = re.sub("\s\s+" , " ", text) # remove multiple spacas
    for k, v in negatives.items():
        text = text.lower().replace(k, v)
    return [i for i in word_tokenize(text) if i not in stop and len(i) > treshold]

def data_apply_tokenize_text(data):
    return data.apply(tokenize_text)

def data_apply_len(data):
    return data.apply(len)

In [61]:
all_data['desc_tokens'] = parallelize_apply(all_data.item_description, data_apply_tokenize_text)
all_data['name_tokens'] = parallelize_apply(all_data.name, data_apply_tokenize_text)
all_data['word_count_desc'] = parallelize_apply(all_data.desc_tokens, data_apply_len)
all_data['word_count_name'] = parallelize_apply(all_data.name_tokens, data_apply_len)







In [102]:
all_data.head()

Unnamed: 0,name,item_condition_id,category_name,brand_name,shipping,item_description,is_train,category,sub_cat1,sub_cat2,desc_tokens,name_tokens,word_count_desc,word_count_name,brand_name_bin,category_bin,sub_cat1_bin,sub_cat2_bin,desc_clear,name_clear
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,missing,1,No description yet,1,Men,Tops,T-shirts,"[description, yet]","[mlb, cincinnati, reds, shirt, size]",2,5,5265,5,102,773,description yet,mlb cincinnati reds shirt size
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & Parts,Razer,0,This keyboard is in great condition and works like it came out of the box. All of the ports are tested and work perfectly. The lights are customizable via the Razer Synapse app on your PC.,1,Electronics,Computers & Tablets,Components & Parts,"[keyboard, great, condition, works, like, came, box, ports, tested, work, perfectly, lights, customizable, via, razer, synapse, app]","[razer, blackwidow, chroma, keyboard]",17,4,3889,1,30,215,keyboard great condition works like came box ports tested work perfectly lights customizable via razer synapse app,razer blackwidow chroma keyboard
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,1,"Adorable top with a hint of lace and a key hole in the back! The pale pink is a 1X, and I also have a 3X available in white!",1,Women,Tops & Blouses,Blouse,"[adorable, top, hint, lace, key, hole, back, pale, pink, also, available, white]","[ava, viv, blouse]",12,3,4588,9,103,97,adorable top hint lace key hole back pale pink also available white,ava viv blouse
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,missing,1,New with tags. Leather horses. Retail for [rm] each. Stand about a foot high. They are being sold as a pair. Any questions please ask. Free shipping. Just got out of storage,1,Home,Home Décor,Home Décor Accents,"[new, tags, leather, horses, retail, stand, foot, high, sold, pair, questions, please, ask, free, shipping, got, storage]","[leather, horse, statues]",17,3,5265,3,55,410,new tags leather horses retail stand foot high sold pair questions please ask free shipping got storage,leather horse statues
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,missing,0,Complete with certificate of authenticity,1,Women,Jewelry,Necklaces,"[complete, certificate, authenticity]","[gold, plated, rose]",3,3,5265,9,58,542,complete certificate authenticity,gold plated rose


### tf-idf

In [73]:
tf_idf = TfidfVectorizer(min_df=0.0005,
                     ngram_range=(1, 3),
                     stop_words='english',
                        tokenizer=tokenize_text)
X_description = tf_idf.fit_transform(all_data['item_description'])

In [74]:
X_description.shape

(2175894, 5135)

In [75]:
X_name = tf_idf.fit_transform(all_data['name'])
X_name.shape

(2175894, 1413)

In [77]:
X_numeric = all_data[['item_condition_id', 'shipping', 'is_train', 'word_count_desc', 'word_count_name', 
                      'brand_name_bin', 'category_bin', 'sub_cat1_bin', 'sub_cat2_bin']]

In [81]:
all_data['desc_clear'] = all_data.desc_tokens.apply(lambda x: ' '.join(x))
all_data['name_clear'] = all_data.name_tokens.apply(lambda x: ' '.join(x))

In [79]:
X_description_clear = tf_idf.fit_transform(all_data['desc_clear'])

KeyboardInterrupt: 

In [99]:
from sklearn.feature_extraction.text import HashingVectorizer

hash_vect = HashingVectorizer(decode_error='ignore', n_features=2 ** 16,
                               alternate_sign=False)

In [82]:
%%time

X_name = tf_idf.fit_transform(all_data['name_clear'])
X_name.shape

CPU times: user 3min 24s, sys: 488 ms, total: 3min 25s
Wall time: 3min 24s


In [100]:
%%time

X_name = hash_vect.fit_transform(all_data['name_clear'])
X_name.shape

CPU times: user 6.8 s, sys: 8 ms, total: 6.81 s
Wall time: 6.81 s


In [101]:
X_name

<2175894x65536 sparse matrix of type '<class 'numpy.float64'>'
	with 8075938 stored elements in Compressed Sparse Row format>

In [88]:
all_name_words = [arg for line in all_data.name_tokens for arg in line]

In [95]:
np.sort(np.unique(all_name_words, return_counts=True))

array([['aaa', 'aaaa', 'aaaaa', ..., '，carolina', '：）iphone', '�birthday'],
       ['1', '1', '1', ..., '997', '998', '998']],
      dtype='<U40')

In [92]:
len(all_name_words)

8097783