In [None]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Print the full path of every file found under the Kaggle input directory,
# so the dataset location used below can be checked by eye.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
# Load the Cartier catalogue into the working frame used by every later cell.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/cartier-jewelry-catalog/cartier_catalog.csv'
)

In [None]:
# Preview the first five rows of the raw catalogue.
data.head(n=5)




# Goal: 
There are 692 entries of jewelry products with description and price. Our goal is to predict the price. The main focus of this kernel is to practice data cleaning, as all the meaningful data is hidden in the text.

# Evaluation metric: 
Mean square error, reported below in its normalized form as the R² score (`r2_score` / `regr.score`).

We need a base estimator so that we can compare our future work against a trivial guess. The only information that is immediately usable is the `categorie` column. Let us make some plots.

In [None]:
# Left panel: number of products per category.
# Right panel: price distribution per category.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# Pass the column by keyword: recent seaborn versions treat the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x='categorie', data=data, ax=axes[0])
sns.boxplot(x='categorie', y='price', data=data, ax=axes[1])
# Clip the price axis of the box plot so the bulk of the distribution stays
# readable; set on the axes explicitly instead of relying on the "current" axes.
axes[1].set_ylim(0, 100000)
fig.suptitle('Count and price distribution by categories')
plt.show()

The simplest guess will be to guess the average price in each category without looking into further details. Let us write such an estimator and compute its R2 score.

In [None]:
# Mean price of each category; the lookup table used by the baseline estimator.
avg_by_cat = data.groupby('categorie')['price'].mean()

def base_est(X):
    """Baseline estimator: predict the average price of each row's category.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a ``categorie`` column.

    Returns
    -------
    numpy.ndarray
        One value per row of ``X``: the entry of the module-level
        ``avg_by_cat`` lookup for that row's category.

    Raises
    ------
    KeyError
        If a category of ``X`` is missing from ``avg_by_cat``.

    Notes
    -----
    ``X`` is left untouched. Storing the prediction as a ``predict`` column
    on the caller's frame would turn a target-derived value into a column of
    the working data set, where it can be picked up as a model feature.
    """
    predictions = X['categorie'].apply(lambda category: avg_by_cat[category])
    return predictions.values

In [None]:
# R² of the per-category-average baseline, evaluated on the full data set.
r2_score(y_true=data['price'], y_pred=base_est(data))

Base estimator has R^2 score of 0.05, we want to improve on that.


# What to do with the data?

## Categorie
    OK, no missing values. Do not touch it.

# Title

    1) Extract the most commonly appearing words, and choose the ones that possibly represent the brand. 
    (we could use 2-grams, but after examining the values, there does not seem to be anything new we will get from it)

    
# Tags
    1) The tags are well organized, contain main features 
    (like presence of a diamond or a garnet, and type of metals). 
    Choose the most commonly appearing tags (roughly those with >30 appearances; the code below keeps the 15 most frequent tags).
    
# Description
 
    1) Extract quantitative information: number of diamonds, sapphires, etc. 
        How many carat for the diamond, or size of the rings... 
    2) Extract sizes of the product (small, medium, large)
    
    Possibly, we could extract other adjectives, like type of cut of the diamonds, 
    or some other superlatives in the description. However, the dataset is very small,
    and we already have many features and would overfit, so I stopped here.
    
    
# Image:
    It is possible that the directories in which image is saved, 
    contain some valuable classification information 
    (perhaps all luxurious brands are saved in the same directory).
    I will not use it, as it is sort of cheating, but one can try 
    to train a model on that. 

## Title column

In [None]:
# Flatten the tokens of every title into one list and count word frequencies.
corpus = [
    token
    for title in data.title
    for token in nltk.word_tokenize(title)
]

word_counts = Counter(corpus)
# Show the 20 most frequent title tokens.
word_counts.most_common(20)

After looking at some entries in Title column, many of these words always appear together, like 'de Cartier' or 'Juste Clou'. We also remove the words like 'diamonds', 'bracelet', 'ring' for which the information will be in other columns.

In [None]:
# Title keywords kept as binary features (1 if the token occurs in the title).
common_words = ['Cartier', 'Panthère', 'wedding', 'Love', 'Trinity', 'Juste']

# Tokenize every title once; the token lists do not depend on the keyword,
# so re-tokenizing inside the loop would repeat the same work per keyword.
title_tokens = data.title.apply(nltk.word_tokenize)
for word in common_words:
    data[word] = title_tokens.apply(lambda tokens: int(word in tokens))

## Working on Description column:

Tokenize and determine parts of speech. We will write a function that, each time it finds a number in the sentence, looks for the next noun to understand the meaning of that number. Some quantities, like the width, are named before the number, so we also need the option to look for the previous noun.

In [None]:
# Tokenize each description, then attach a part-of-speech tag to every token.
data['descr_tok'] = data['description'].apply(nltk.word_tokenize)
data['descr_pos'] = data['descr_tok'].apply(nltk.pos_tag)

In [None]:
def extract(string, nouns, kind='count', following=True):
    """Aggregate the numbers attached to the given nouns in a POS-tagged text.

    Every token tagged ``'CD'`` (cardinal number) is paired with its nearest
    noun, as returned by ``find_next_noun``; when that noun is one of
    ``nouns`` the number is collected, e.g. "3 beautiful diamonds" yields 3
    for ``nouns=['diamond', 'diamonds']``. When looking forward, the article
    "a" (or a "one" that was not tagged as a number) in front of a matching
    noun counts as 1.

    Parameters
    ----------
    string : sequence of (token, tag) pairs
        POS-tagged tokens of one description.
    nouns : list of str
        Nouns of interest; both singular and plural forms must be listed.
    kind : {'count', 'mean'}
        ``'count'`` returns the sum of the collected numbers, ``'mean'``
        their average.
    following : bool
        If true the noun is searched after the number, otherwise before it
        (e.g. "Width: 3.5 mm").

    Returns
    -------
    numpy scalar
        Sum or mean of the collected numbers; 0 when nothing was collected.

    Raises
    ------
    ValueError
        If ``kind`` is neither ``'count'`` nor ``'mean'``.
    """
    if kind not in ('count', 'mean'):
        raise ValueError("kind must be 'count' or 'mean', got %r" % (kind,))

    # Spelled-out numbers that are recognised (matched case-insensitively).
    number_words = {'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5}
    # A number with thousands separators ("1,000") or a plain integer/decimal.
    # The first number found in the token is used ("18K" -> 18).
    number_pattern = r'\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?'

    count = []

    for num, word in enumerate(string):
        token, tag = word[0], word[1]

        if tag == 'CD':
            if find_next_noun(string, num, following) not in nouns:
                continue
            spelled = number_words.get(token.lower())
            if spelled is not None:
                count.append(float(spelled))
                continue
            match = re.search(number_pattern, token)
            if match:
                count.append(float(match.group().replace(',', '')))
        # `elif`, not `if`: a "one" already handled as a cardinal number above
        # must not be counted a second time here.
        elif token in ('a', 'one') and following:
            if find_next_noun(string, num, following) in nouns:
                count.append(1)

    if not count:
        count = [0]
    if kind == 'count':
        return np.sum(count)
    return np.mean(count)

In [None]:
def find_next_noun(string, position, following=True):
    """Return the noun nearest to ``position`` in a POS-tagged token sequence.

    ``string`` is a sequence of (token, tag) pairs. The search starts at the
    neighbour of ``position`` and walks forward, or backward when
    ``following == False``, until a token tagged NN, NNS, NNP or NNPS is
    found. Returns that token's text, or None when the edge of the sequence
    is reached without meeting a noun.
    """
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    direction = -1 if following == False else 1

    cursor = position + direction
    while 0 <= cursor < len(string):
        candidate = string[cursor]
        if candidate[1] in noun_tags:
            return candidate[0]
        cursor += direction
    return None

In [None]:
# Extract the numerical info

# One row per feature: (column name, nouns to look for, aggregation, whether
# the noun follows the number). Width is written as "Width: <number>", so its
# noun precedes the number.
numeric_features = [
    ('num_diamonds', ['diamond', 'diamonds'], 'count', True),
    ('carat', ['carat', 'carats'], 'count', True),
    ('purity', ['gold'], 'mean', True),
    ('width', ['width', 'Width'], 'mean', False),
    ('num_garnets', ['garnet', 'garnets'], 'count', True),
    ('num_sapphires', ['sapphire', 'sapphires'], 'count', True),
    ('num_emeralds', ['emerald', 'emeralds'], 'count', True),
]

for feature_name, target_nouns, agg_kind, noun_after in numeric_features:
    data[feature_name] = data.descr_pos.apply(
        extract, args=(target_nouns, agg_kind, noun_after))

In [None]:
# Distinct purity values that were extracted from the descriptions.
data['purity'].unique()

Platinum is always 950 from observing the data, and purity is always 18K. So we can as well remove this data. Other columns are more diverse.

In [None]:
# Remove the purity column. Rebinding with errors='ignore' keeps this cell
# re-runnable: a second execution is a no-op instead of raising KeyError.
data = data.drop(columns=['purity'], errors='ignore')

## Room for improvement here

It is possible, that some of the values are read incorrectly. We could double check using regex the width, because it appears often with the units (mm) or purity which appears as 18K (with K in the end) and other values. However, I felt lazy and was happy that everything works using just one function.

Moreover, in some descriptions, it just says sapphires and emeralds, without specifying the number, while it is clearly more than one. We could treat it as missing value, and fill it with most common number of sapphires or emeralds, but currently it is filled by 0.

In [None]:
# Extract sizes

# Tokens that mark each size class (matched exactly, case-sensitively).
small = ['small', 'Small', 'S', 'XS', 'xs']
medium = ['medium', 'Medium', 'M']
large = ['large', 'Large', 'L', 'XL', 'big', 'Big']


def size_encoder(string, size):
    """Return 1 if any token of ``string`` is one of the words in ``size``, else 0."""
    tokens = nltk.word_tokenize(string)
    return int(not set(size).isdisjoint(tokens))

# One binary column per size class: 1 when the description mentions that size.
for size_column, size_words in (('small', small), ('medium', medium), ('large', large)):
    data[size_column] = data.description.apply(size_encoder, args=(size_words,))

## Tags column

In [None]:
# Obtaining a list of most common tags from the data

tag_set = []
for tags in data.tags.values:
    # Strip the whitespace around each comma-separated tag so that
    # "diamonds" and " diamonds" are counted as the same tag; skip empty pieces.
    tag_set += [piece.strip() for piece in tags.split(',') if piece.strip()]

tag_count = Counter(tag_set)
# Keep the 15 most frequent tags as candidate features.
common_tags = [word for (word, _) in tag_count.most_common(15)]

In [None]:
# One hot encoding of tags
# Parse each row's tag string once into a set of whitespace-stripped tags, the
# same comma-splitting used to build `common_tags`. Exact membership avoids the
# false positives of a substring test (e.g. "gold" inside "yellow gold").
row_tags = data.tags.apply(lambda raw: {piece.strip() for piece in raw.split(',')})
for tag in common_tags:
    data[tag] = row_tags.apply(lambda present: int(tag.strip() in present))

In [None]:
# One hot encoding of categories.

# `categorie` is replaced by indicator columns; drop_first=True omits the first
# category, which is then represented by all indicators being zero.
data = pd.get_dummies(data, columns=['categorie'], drop_first=True)

In [None]:
# Keep only model-ready columns: drop identifiers, raw text and the
# intermediate token / POS-tag columns.
df = data.drop(columns=['ref', 'title', 'tags', 'description', 'image', 'descr_tok', 'descr_pos'])
# If the baseline cell stored its predictions in a 'predict' column of `data`,
# remove it as well: it is computed from the target (per-category mean price
# over all rows) and must not be used as a feature.
df = df.drop(columns=['predict'], errors='ignore')

# 70/30 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
                    df.drop(columns=['price']), df.price, test_size=0.3, random_state=42)

In [None]:
# Fit a random forest on the training split. random_state is fixed so that the
# reported scores are reproducible across kernel restarts.
regr = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42)
regr.fit(X_train, y_train)
# R² on the training data (an optimistic reference for the test score).
regr.score(X_train, y_train)

In [None]:
# R² on the held-out test split.
regr.score(X=X_test, y=y_test)

# R² score: ~0.80 (i.e. ~80%)

Fine tuning might improve it. 