# Project: Category prediction of IKEA furniture

## Data Visualization

In [13]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

%matplotlib inline
plt.style.use('ggplot') # emulate pretty r-style plots

In [14]:
#read the data and delete the index of entry
IKEA_df = pd.read_csv('IKEA_SA_Furniture_Web_Scrapings_sss.csv')
IKEA_df = IKEA_df.drop(IKEA_df.columns[0],axis =1)

In [15]:
IKEA_df.shape

(3694, 13)

We have 3694 samples and 12 features together with target catrgory as follows:

In [16]:
# data type  
IKEA_df.dtypes

item_id                int64
name                  object
category              object
price                float64
old_price             object
sellable_online         bool
link                  object
other_colors          object
short_description     object
designer              object
depth                float64
height               float64
width                float64
dtype: object

In [17]:
IKEA_df.count()

item_id              3694
name                 3694
category             3694
price                3694
old_price            3694
sellable_online      3694
link                 3694
other_colors         3694
short_description    3694
designer             3694
depth                2231
height               2706
width                3105
dtype: int64

We can see some data is missing in feature depth, height and width 

In [18]:
# maybe make this into a pretty pie/bar chart
IKEA_df.describe().round(2)

Unnamed: 0,item_id,price,depth,height,width
count,3694.0,3694.0,2231.0,2706.0,3105.0
mean,48632396.79,1078.21,54.38,101.68,104.47
std,28887094.1,1374.65,29.96,61.1,71.13
min,58487.0,3.0,1.0,1.0,1.0
25%,20390574.0,180.9,38.0,67.0,60.0
50%,49288078.0,544.7,47.0,83.0,80.0
75%,70403572.75,1429.5,60.0,124.0,140.0
max,99932615.0,9585.0,257.0,700.0,420.0


In [19]:
nun =  IKEA_df['category'].nunique()
print(IKEA_df['category'].unique())
print('There are ' + str(nun) + ' categories in total' )

['Bar furniture' 'Beds' 'Bookcases & shelving units'
 'Cabinets & cupboards' 'Café furniture' 'Chairs'
 'Chests of drawers & drawer units' "Children's furniture"
 'Nursery furniture' 'Outdoor furniture' 'Room dividers'
 'Sideboards, buffets & console tables' 'Sofas & armchairs'
 'Tables & desks' 'Trolleys' 'TV & media furniture' 'Wardrobes']
There are 17 categories in total


In [20]:
nun =  IKEA_df['designer'].nunique()
#print(IKEA_df['designer'].unique())
print('There are ' + str(nun) + ' designers in total' )

There are 381 designers in total


In [21]:
IKEA_df.groupby(['designer','category'])['item_id'].count().reset_index(name='count') \
                             .sort_values(['count'], ascending=False) \
                             .head(10)

Unnamed: 0,designer,category,count
374,IKEA of Sweden,Bookcases & shelving units,256
375,IKEA of Sweden,Cabinets & cupboards,96
386,IKEA of Sweden,Tables & desks,89
385,IKEA of Sweden,TV & media furniture,78
259,Ehlén Johansson,Sofas & armchairs,74
377,IKEA of Sweden,Chairs,68
427,IKEA of Sweden/Ehlén Johansson,Wardrobes,62
273,Ehlén Johansson/IKEA of Sweden,Wardrobes,61
307,Francis Cayouette,Sofas & armchairs,53
753,Ola Wihlborg,Sofas & armchairs,48


In [22]:
IKEA_df['designer_len'] = IKEA_df['designer'].str.len() 
IKEA_df['designer_len'].describe()

count    3694.000000
mean       33.755820
std        86.016629
min         3.000000
25%        14.000000
50%        15.000000
75%        28.000000
max      1261.000000
Name: designer_len, dtype: float64

In [23]:
IKEA_df[IKEA_df['designer_len'] > 50].shape

(166, 14)

In [24]:
IKEA_df[IKEA_df['designer_len'] > 50].head(2)

Unnamed: 0,item_id,name,category,price,old_price,sellable_online,link,other_colors,short_description,designer,depth,height,width,designer_len
51,50468953,VATTVIKEN,Beds,995.0,No old price,True,https://www.ikea.com/sa/en/p/vattviken-armchai...,No,Armchair-bed,504.689.53 Small and easy-to-place chair-bed w...,83.0,86.0,92.0,403
77,90331091,RIDABU,Beds,100.0,SR 125,True,https://www.ikea.com/sa/en/p/ridabu-mirror-doo...,No,"Mirror door, 40x120 cm",903.310.91 The door can be hung with the openi...,,120.0,40.0,199


We can see some data in desinger is actually long description of the furniture, we can replace them with "designer unknowm", but  this might cause to misleading that those furniture are designed by the same person, so considering there are not much of them, we keep the origin ones as designer. 

In [25]:
# correlation / scatter matrix here

## Data Cleaning

In [26]:
#convert object to boolean
IKEA_df['other_colors'] = IKEA_df['other_colors'].map({'Yes': True, 'No': False})

With following observations, we can conclude that all NaN in dataset concerning the data size

In [27]:
IKEA_df[IKEA_df.isnull().any(axis=1)].shape

(1795, 14)

In [28]:
size = ['depth','height','width']
IKEA_df[IKEA_df[size].isnull().any(axis=1)].shape

(1795, 14)

In [29]:
IKEA_df[IKEA_df[IKEA_df.columns[~IKEA_df.columns.isin(size)]].isnull().any(axis=1)].shape

(0, 14)

In [30]:
IKEA_df[IKEA_df[size].isnull().all(axis=1)].head(40)

Unnamed: 0,item_id,name,category,price,old_price,sellable_online,link,other_colors,short_description,designer,depth,height,width,designer_len
2,9333523,NORDVIKEN / NORDVIKEN,Bar furniture,2095.0,No old price,False,https://www.ikea.com/sa/en/p/nordviken-nordvik...,False,Bar table and 4 bar stools,Francis Cayouette,,,,17
9,69304221,EKEDALEN / EKEDALEN,Bar furniture,2176.0,"SR 2,375",True,https://www.ikea.com/sa/en/p/ekedalen-ekedalen...,False,Bar table and 4 bar stools,Ehlén Johansson,,,,15
28,29297227,STENSELE / NORRARYD,Bar furniture,1340.0,No old price,True,https://www.ikea.com/sa/en/p/stensele-norraryd...,False,Bar table and 2 bar stools,Nike Karlsson/Maja Ganszyniec,,,,29
32,19297275,NORRÅKER / NORRÅKER,Bar furniture,1266.0,"SR 1,385",True,https://www.ikea.com/sa/en/p/norraker-norraker...,False,"Bar table and 2 bar stools, 7...",J Karlsson/N Karlsson,,,,21
34,20336619,HENRIKSDAL,Bar furniture,40.0,No old price,True,https://www.ikea.com/sa/en/p/henriksdal-cover-...,False,Cover for bar stool with backrest,IKEA of Sweden,,,,14
38,29304223,EKEDALEN / EKEDALEN,Bar furniture,2375.0,No old price,True,https://www.ikea.com/sa/en/p/ekedalen-ekedalen...,False,"Bar table and 4 bar stools, 1...",Ehlén Johansson,,,,15
39,49304217,EKEDALEN / EKEDALEN,Bar furniture,2176.0,"SR 2,375",True,https://www.ikea.com/sa/en/p/ekedalen-ekedalen...,False,"Bar table and 4 bar stools, 1...",Ehlén Johansson,,,,15
42,89297272,NORRÅKER / NORRARYD,Bar furniture,1266.0,"SR 1,385",True,https://www.ikea.com/sa/en/p/norraker-norraryd...,False,"Bar table and 2 bar stools, 7...",Nike Karlsson/J Karlsson/N Karlsson,,,,35
45,49297274,NORRÅKER / NORRÅKER,Bar furniture,1266.0,"SR 1,385",True,https://www.ikea.com/sa/en/p/norraker-norraker...,False,"Bar table and 2 bar stools, 7...",J Karlsson/N Karlsson,,,,21
104,44361010,BRYNE,Beds,49.0,No old price,True,https://www.ikea.com/sa/en/p/bryne-net-white-4...,False,Net,443.610.10 Easy to keep clean since you can re...,,,,257


### NaN in size data

According to our observation to the dataset, compared with the product size shown in the website, we have the following conclusions. There are several main reasons why size data is missing \.
 First apparently some data are not successfully scraped from the website and neither be recorded in the short description, it is impossible to recover those. \
Second, some items are a set of different furniture, so there is no product size. \
Third, some items have variable size, for example, its width can be 190 to 220 cm. Although those data are recorded in short description, its hard to extract the exact size for all those items. The biggest problem is that we can’t tell to which of three, depth, height and width , the number belongs, since often that data is incomplete in short description.\
The last, there are some small items which measured in mm instead of cm and instead of height, the diameter is measured and recorded in short description.


In [31]:
#split description and size data in feature short description
IKEA_df['short_description'].str.rsplit(',', n=1,expand = True)
IKEA_df['size']= IKEA_df['short_description'].str.rsplit(',', n=1).str[1]
IKEA_df['short_description']= IKEA_df['short_description'].str.rsplit(',', n=1).str[0]

In [32]:
#split numbers and unit
IKEA_df['unit'] = IKEA_df['size'].str.replace('\d+|-|x', ' ')
IKEA_df['size_nounit'] = IKEA_df['size'].str.replace('cm|mm', '')

Following result shows as we find that indeed all small items don’t have height and considering it is convenient to extract the single numbers in short description representing the diameter, we fill the NaN in height of small items with data recorded in short description. Furthermore, we find that size data of small items, of which the diameter is measured with mm, is actually in mm according to the link, therefore we correct those.  

In [33]:
IKEA_df[IKEA_df['unit'].str.contains('mm') & IKEA_df[size].isnull().any(axis=1)].shape

(132, 17)

In [34]:
IKEA_df[IKEA_df['unit'].str.contains('mm') & IKEA_df['height'].isnull()].shape

(132, 17)

In [35]:
IKEA_df['height'] = np.where(IKEA_df['unit'].str.contains('mm') , IKEA_df.size_nounit, IKEA_df.height)
IKEA_df['height'] = np.where(IKEA_df['unit'].str.contains('mm'), IKEA_df.height.astype(float)/10, IKEA_df.height)
IKEA_df['depth'] = np.where(IKEA_df['unit'].str.contains('mm'), IKEA_df.depth.astype(float)/10, IKEA_df.depth)
IKEA_df['width'] = np.where(IKEA_df['unit'].str.contains('mm'), IKEA_df.width.astype(float)/10, IKEA_df.depth)

In [36]:
IKEA_df

Unnamed: 0,item_id,name,category,price,old_price,sellable_online,link,other_colors,short_description,designer,depth,height,width,designer_len,size,unit,size_nounit
0,90420332,FREKVENS,Bar furniture,265.0,No old price,True,https://www.ikea.com/sa/en/p/frekvens-bar-tabl...,False,"Bar table, in/outdoor",Nicholai Wiig Hansen,,99,,20,51x51 cm,cm,51x51
1,368814,NORDVIKEN,Bar furniture,995.0,No old price,False,https://www.ikea.com/sa/en/p/nordviken-bar-tab...,False,Bar table,Francis Cayouette,,105,,17,140x80 cm,cm,140x80
2,9333523,NORDVIKEN / NORDVIKEN,Bar furniture,2095.0,No old price,False,https://www.ikea.com/sa/en/p/nordviken-nordvik...,False,Bar table and 4 bar stools,Francis Cayouette,,,,17,,,
3,80155205,STIG,Bar furniture,69.0,No old price,True,https://www.ikea.com/sa/en/p/stig-bar-stool-wi...,True,Bar stool with backrest,Henrik Preutz,50.0,100,50.0,13,74 cm,cm,74
4,30180504,NORBERG,Bar furniture,225.0,No old price,True,https://www.ikea.com/sa/en/p/norberg-wall-moun...,False,Wall-mounted drop-leaf table,Marcus Arvonen,60.0,43,60.0,14,74x60 cm,cm,74x60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3689,99157902,ELVARLI,Wardrobes,750.0,SR 820,True,https://www.ikea.com/sa/en/p/elvarli-1-section...,False,1 section,Ehlén Johansson,50.0,,50.0,15,92x51x222-350 cm,cm,92x51x222-350
3690,9158152,ELVARLI,Wardrobes,1572.0,"SR 1,755",True,https://www.ikea.com/sa/en/p/elvarli-2-section...,False,2 sections,Ehlén Johansson,50.0,,50.0,15,135x51x222-350 cm,cm,135x51x222-350
3691,59157541,ELVARLI,Wardrobes,924.0,"SR 1,050",True,https://www.ikea.com/sa/en/p/elvarli-2-section...,False,2 sections,Ehlén Johansson,50.0,,50.0,15,175x51x222-350 cm,cm,175x51x222-350
3692,89157573,ELVARLI,Wardrobes,2745.0,"SR 3,130",True,https://www.ikea.com/sa/en/p/elvarli-3-section...,False,3 sections,Ehlén Johansson,50.0,,50.0,15,178x51x222-350 cm,cm,178x51x222-350
