## Data preparation

In [1]:
import pandas as pd
import numpy as np

In [9]:
data = pd.read_csv("C:/Users/rafal/Desktop/DS/Merceri price/data_selected.csv")

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,775838,775838,Chanel Coco Mademoiselle,1,Beauty/Fragrance/Women,Chanel,81.0,0,It's brand new and 100% authentic. This is a g...
1,984982,984982,Young living Raven,3,Other/Daily & Travel items/Health Care,,13.0,0,Partially used as shown in pic
2,1144303,1144303,Nwt Northface Reversible Jacket xs 6,1,Kids/Girls (4+)/Coats & Jackets,The North Face,56.0,0,Girls extra small xs 6 Northface jacket Super ...
3,1222558,1222558,Lauren Conrad knit sweater!,2,Women/Sweaters/Crewneck,LC Lauren Conrad,8.0,1,Like new! Light pink!
4,1110435,1110435,ONE PIECE bathing suit,2,Women/Swimwear/One-Piece,,17.0,0,This is just for the one piece! Size small/medium


In [10]:
data.drop(columns = ['Unnamed: 0', 'train_id'], axis = 1, inplace = True)

### Categories

In [11]:
def StringToColumns(data, column_name, sep, split_limit, fill_na = "No_category"):
    categories = data[column_name].str.split(sep, split_limit, expand = True)
    categories = categories.drop(columns = 3, axis = 1)
    categories = categories.fillna(value = fill_na)
    data_cats = pd.concat([data, categories], axis = 1)
    data_features = data_cats.drop(columns = [column_name], axis = 1)
    return data_features

def WordsToColumns(data, max_columns, column_number):
    import re
    import nltk
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer
    from sklearn.feature_extraction.text import CountVectorizer
    corpus = []
    for i in range(0, len(data)):
        name = re.sub('[^a-zA-Z]', ' ', data.iloc[i, column_number])
        name = name.lower()
        name = name.split()
        ps = PorterStemmer()
        name = [ps.stem(word) for word in name if not word in set(stopwords.words('english'))]
        name = ' '.join(name)
        corpus.append(name)    
    # Creating the Bag of Words model
    cv = CountVectorizer(max_features = max_columns)
    name_features = cv.fit_transform(corpus)
    df_name_features = pd.DataFrame(name_features.toarray())
    return df_name_features

def SelectedLevelsToColumns_count(data, target, categorical, percentage=0.001, na_fill = 'no_brand_name'):
    from sklearn.preprocessing import OneHotEncoder
    pivot_brands = pd.pivot_table(data, values = target, index = categorical, aggfunc = ['count'])
    pivot_brands.columns = pivot_brands.columns.get_level_values(0)
    pv_sorted = pivot_brands.sort_values(by = ['count'], ascending = False)
    selected_brands = pv_sorted[pv_sorted['count']>percentage*len(data)]
    brand_list = selected_brands.index
    brand_list = np.array(brand_list)
    ohe = OneHotEncoder()
    cat_data = pd.DataFrame(data[categorical])
    cat_data.fillna(value = na_fill, inplace = True)
    data['new_categorical'] = np.where(data[categorical].isin(brand_list), data[categorical], 'other')
    categories = ohe.fit_transform(pd.DataFrame(data['new_categorical']))
    data.drop(columns=['new_categorical', categorical], axis = 1, inplace = True)
    data.reset_index(drop = True, inplace = True)
    data_final = pd.concat([data, pd.DataFrame(categories.toarray())], axis = 1)
    #Return also the brand_list for preprocesssing the test set
    return data_final

In [12]:
data = StringToColumns(data, 'category_name', '/', 3)

In [13]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
data_categories = ohe.fit_transform(data[[0, 1, 2]])

In [14]:
data_categories = pd.DataFrame(data_categories.toarray())

In [15]:
data = pd.concat([data, data_categories], axis = 1)

In [16]:
data.drop(columns = [0,1,2], axis = 1, inplace=True)

### Name

In [71]:
data_name = WordsToColumns(data, 1000, 0)

In [72]:
data_name.to_csv("C:/Users/rafal/Desktop/DS/Merceri price/data_name_preprocessed.csv")

In [17]:
data_name = pd.read_csv("C:/Users/rafal/Desktop/DS/Merceri price/data_name_preprocessed.csv")

In [18]:
data_with_name = pd.concat([data, data_name], axis = 1)

In [19]:
data_with_name.head()

Unnamed: 0,name,item_condition_id,brand_name,price,shipping,item_description,3,4,5,6,...,990,991,992,993,994,995,996,997,998,999
0,Chanel Coco Mademoiselle,1,Chanel,81.0,0,It's brand new and 100% authentic. This is a g...,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,Young living Raven,3,,13.0,0,Partially used as shown in pic,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,Nwt Northface Reversible Jacket xs 6,1,The North Face,56.0,0,Girls extra small xs 6 Northface jacket Super ...,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,Lauren Conrad knit sweater!,2,LC Lauren Conrad,8.0,1,Like new! Light pink!,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,ONE PIECE bathing suit,2,,17.0,0,This is just for the one piece! Size small/medium,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
data_with_name.drop(columns= 'name', axis = 1, inplace=True)

### Brand name

In [22]:
data_with_brand_name = SelectedLevelsToColumns_count(data_with_name, 'price', 'brand_name')

In [23]:
data_with_brand_name.head()

Unnamed: 0,item_condition_id,price,shipping,item_description,3,4,5,6,7,8,...,98,99,100,101,102,103,104,105,106,107
0,1,81.0,0,It's brand new and 100% authentic. This is a g...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,13.0,0,Partially used as shown in pic,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1,56.0,0,Girls extra small xs 6 Northface jacket Super ...,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,8.0,1,Like new! Light pink!,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2,17.0,0,This is just for the one piece! Size small/medium,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Description

In [24]:
data.head()

Unnamed: 0,name,item_condition_id,brand_name,price,shipping,item_description,3,4,5,6,...,802,803,804,805,806,807,808,809,810,811
0,Chanel Coco Mademoiselle,1,Chanel,81.0,0,It's brand new and 100% authentic. This is a g...,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Young living Raven,3,,13.0,0,Partially used as shown in pic,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Nwt Northface Reversible Jacket xs 6,1,The North Face,56.0,0,Girls extra small xs 6 Northface jacket Super ...,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Lauren Conrad knit sweater!,2,LC Lauren Conrad,8.0,1,Like new! Light pink!,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ONE PIECE bathing suit,2,,17.0,0,This is just for the one piece! Size small/medium,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
data['item_description'].fillna(value = "", inplace = True)

In [None]:
data_description = WordsToColumns(data,5000,5)