# Technical test for Tagwalk

Before answering the questions let's look at the dataset.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Loading the dataset and printing the first few lines
data = pd.read_csv('Dataset.csv')
print(data.head(7))

    slug  look   type              season         designer    city  \
0  20261     8  woman  spring-summer-2017         roksanda  london   
1  20262     9  woman  spring-summer-2017         roksanda  london   
2  20263    10  woman  spring-summer-2017         roksanda  london   
3  20264    11  woman  spring-summer-2017         roksanda  london   
4  20265    12  woman  spring-summer-2017         roksanda  london   
5  20255     1  woman  spring-summer-2017         roksanda  london   
6  19778     2  woman  spring-summer-2017  mother-of-pearl  london   

            models                                           keywords  
0              NaN  ['dress', 'silk', 'purple', 'balloon-sleeves',...  
1              NaN  ['dress', 'long-dress', 'pleats', 'silk', 'sat...  
2              NaN  ['dress', 'midi-dress', 'print', 'sleeveless',...  
3              NaN  ['dress', 'midi-dress', 'silk', 'satin', 'v-ne...  
4              NaN  ['coat', 'silk', 'satin', 'gold', 'elastic', '...  
5  ['ka

### Question 1
Calculate the total number of looks by [type, season] and by [type, season, designer].

In [3]:
# Finding the number of unique entries for the 'type', 'season', and 'designer' columns

print('Unique types:', data['type'].nunique(), data['type'].unique()), print(' ')
print('Unique seasons:', data['season'].nunique(), data['season'].unique()), print(' ')
print('Unique designers:', data['designer'].nunique(), data['designer'].unique())

Unique types: 1 ['woman']
 
Unique seasons: 2 ['spring-summer-2017' 'spring-summer-2018']
 
Unique designers: 59 ['roksanda' 'mother-of-pearl' 'versus-versace' 'anya-hindmarch' 'topshop'
 'jw-anderson' 'simone-rocha' 'christopher-kane' 'david-koma'
 'house-of-holland' 'gareth-pugh' 'emilia-wickstead' 'joseph' 'erdem'
 'mary-katrantzou' 'peter-pilotto' 'paul-smith' 'teatum-jones' 'bora-aksu'
 'fashion-east' 'temperley-london' 'jasper-conran' 'fyodor-golan'
 'marta-jakubowski' 'julien-macdonald' 'toga' 'rejina-pyo'
 'ashley-williams' 'emporio-armani' 'ashish' 'burberry' 'antonio-berardi'
 'eudon-choi-1' 'ryan-lo' 'marques-almeida' 'pringle-of-scotland'
 'ports-1961' 'molly-goddard' 'chalayan' 'paula-knorr' 'roland-mouret'
 'sharon-wauchob' 'richard-malone' 'halpern' 'daks' 'nicopanda' 'lunyee'
 'emilio-de-la-morena' 'margaret-howell' 'palmer-harding' 'mulberry'
 'a-w-a-k-e-mode' 'cividini' 'pam-hogg' 'mila-schon'
 'preen-by-thornton-bregazzi' 'barbara-casasola' 'osman' 'natasha-zinko']


In [4]:
# Total number of looks by [type, season]. 
#(As 'type' has only one unique value, no need for additional grouping for this column)

data['number of looks'] = 1  # Addind a column
looks_season = data.groupby(['season']).count()['number of looks']
print(looks_season)

season
spring-summer-2017    1315
spring-summer-2018    2044
Name: number of looks, dtype: int64


In [5]:
# Saving results from above as a .csv file
looks_season.to_csv('num_looks_season.csv', index=True)

In [6]:
# Total number of looks by [type, season, designer]. 

looks_season_designer = data.groupby(['season','designer']).count()['number of looks']
print(looks_season_designer)

season              designer        
spring-summer-2017  antonio-berardi     43
                    anya-hindmarch      40
                    ashish              33
                    barbara-casasola    29
                    bora-aksu           29
                                        ..
spring-summer-2018  simone-rocha        40
                    teatum-jones        29
                    temperley-london    38
                    toga                40
                    versus-versace      49
Name: number of looks, Length: 89, dtype: int64


In [7]:
# Saving results from above to a .csv file
looks_season_designer.to_csv('num_looks_season_designer.csv', index=True)

In [8]:
# To better compare the number of looks by each designer per season let's pivot the season column.
# (NaN replaced by zeros)

looks_season_designer_pivot = data.groupby(['designer', 'season']).count().unstack()['number of looks'].fillna(0)
looks_season_designer_pivot.head(10)

season,spring-summer-2017,spring-summer-2018
designer,Unnamed: 1_level_1,Unnamed: 2_level_1
a-w-a-k-e-mode,0.0,57.0
antonio-berardi,43.0,43.0
anya-hindmarch,40.0,40.0
ashish,33.0,38.0
ashley-williams,0.0,27.0
barbara-casasola,29.0,0.0
bora-aksu,29.0,37.0
burberry,78.0,84.0
chalayan,0.0,45.0
christopher-kane,38.0,47.0


In [9]:
# Saving results from above to a .csv file
looks_season_designer_pivot.to_csv('num_looks_season_designer_pivot.csv', index=True)

### Question 2
Calculate the percentage of presence for each tag by type and season, and rank them in descending order.

In [10]:
# First, let's convert every rows of the 'keyword' colums from string to list and add every tag into one list

all_tags = []
for i in data['keywords']:
    list_i = eval(i)
    for j in list_i:
        all_tags.append(j)

In [11]:
# Now we can find all the unique tags

unique_tags = []
for i in all_tags:
    if i not in unique_tags:
        unique_tags.append(i)

print(len(unique_tags), 'unique tags found in the dataset')

993 unique tags found in the dataset


In [12]:
# Counting the persentage of presence for each tag in 2017 and 2018 as well as for the whole dataset

data2017 = data[data['season'] == 'spring-summer-2017']
data2018 = data[data['season'] == 'spring-summer-2018']

lines_num = len(data['keywords'])
tag_cnt = []
tag_cnt_2017 = []
tag_cnt_2018 = []
for j in range(len(unique_tags)):
    tag_cnt.append(unique_tags[j]+ ' ' + str(round(len(data.loc[data['keywords'].str.contains(unique_tags[j])]['keywords'])/lines_num*100,2))  )
    cnt_2017 = len(data2017.loc[data2017['keywords'].str.contains(unique_tags[j])]['keywords'])/len(data2017)*100
    cnt_2018 = len(data2018.loc[data2018['keywords'].str.contains(unique_tags[j])]['keywords'])/len(data2018)*100
    tag_cnt_2017.append(unique_tags[j]+ ' ' + str(round(cnt_2017,2)))
    tag_cnt_2018.append(unique_tags[j]+ ' ' + str(round(cnt_2018,2)))

In [13]:
# Splitting the tag from the corresponding persentage

def TagSplit(tag):
    tag_split = []
    for i in tag:
        i.split()
        tag_split.append(i.split())
    return tag_split

tag_cnt_split = TagSplit(tag_cnt)
tag_cnt_2017_split = TagSplit(tag_cnt_2017)
tag_cnt_2018_split = TagSplit(tag_cnt_2018)

In [14]:
# Sorting the tags in descending order

def Sort(tag):
    tag.sort(reverse=True, key = lambda x: float(x[1]))
    return tag

tag_cnt_split_ranked = Sort(tag_cnt_split)

In [15]:
# Converting lists into pandas data frames and merging them into one

df_tags = pd.DataFrame(tag_cnt_split_ranked, columns=['tag','total pct of presence'])
df_tags_2017 = pd.DataFrame(tag_cnt_2017_split, columns=['tag','pct of presence in 2017'])
df_tags_2018 = pd.DataFrame(tag_cnt_2018_split, columns=['tag','pct of presence in 2018'])

tags_merged = pd.merge(pd.merge(df_tags, df_tags_2017, on='tag', how = 'left'), df_tags_2018, on='tag', how = 'left')
tags_merged = tags_merged.astype({'total pct of presence':'float','pct of presence in 2017':'float','pct of presence in 2018':'float'})

tags_merged.head(5)

Unnamed: 0,tag,total pct of presence,pct of presence in 2017,pct of presence in 2018
0,dress,48.82,51.48,47.11
1,black,28.28,25.78,29.89
2,white,27.09,23.04,29.7
3,pants,26.59,24.26,28.08
4,midi,22.54,25.25,20.79


In [16]:
# Saving the results to a .csv file

# Saving results from above to a .csv file
tags_merged.to_csv('tags.csv', index=False)

### Question 3
Calculate the evolution from one season to another and rank them in descending order.

In [17]:
# Evolution of total number of looks from one season to another

looks_season_change = round((looks_season.iloc[1] - looks_season.iloc[0])/looks_season.iloc[0]*100,1)
looks_season_change

if looks_season_change > 0:
    print('The number of looks increased by %s percent in the spring-summer-2018 season compared to the spring-summer-2017 season.' % looks_season_change)
elif looks_season_change < 0:
    print('The number of looks decreased by %s percent in the spring-summer-2018 season compared to the spring-summer-2017 season.' % looks_season_change)
else: print('The number of looks did not change.')

The number of looks increased by 55.4 percent in the spring-summer-2018 season compared to the spring-summer-2017 season.


In [18]:
# Evolution of total number of looks by designer from one season to another

num_looks_change = []
for i in range(len(looks_season_designer_pivot)):
    season2017 = looks_season_designer_pivot.iloc[i].iloc[0]
    season2018 = looks_season_designer_pivot.iloc[i].iloc[1]
    num_looks_change.append(season2018 - season2017)

looks_season_designer_pivot['Number of looks change'] = num_looks_change

In [19]:
# Evolution of total number of looks by designer from one season to another (percentage change)

pct_looks_change = []
for i in range(len(looks_season_designer_pivot)):
    season2017 = looks_season_designer_pivot.iloc[i].iloc[0]
    season2018 = looks_season_designer_pivot.iloc[i].iloc[1]
    
    if season2017 == 0 or season2018 == 0:
        pct_looks_change.append('--')
    else:
        pct_looks_change.append(round((season2018-season2017)/season2017*100,2))

looks_season_designer_pivot['Percentage of looks change'] = pct_looks_change

looks_season_designer_pivot_sorted = looks_season_designer_pivot.sort_values(by='Number of looks change',ascending=False)

looks_season_designer_pivot_sorted.head(5)

season,spring-summer-2017,spring-summer-2018,Number of looks change,Percentage of looks change
designer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
emporio-armani,0.0,113.0,113.0,--
a-w-a-k-e-mode,0.0,57.0,57.0,--
margaret-howell,0.0,53.0,53.0,--
fashion-east,0.0,46.0,46.0,--
chalayan,0.0,45.0,45.0,--


In [20]:
# Saving as a .csv file

looks_season_designer_pivot_sorted.to_csv('num_looks_season_designer_pivot_change.csv', index=True)

In [21]:
# Calculating a deference from one season to another

tags_merged['Absolute change'] = round(tags_merged['pct of presence in 2018'] - tags_merged['pct of presence in 2017'],2)
tags_merged['Relative change'] = round((tags_merged['pct of presence in 2018'] - tags_merged['pct of presence in 2017'])/tags_merged['pct of presence in 2017']*100,2)
tags_merged.head(10)

Unnamed: 0,tag,total pct of presence,pct of presence in 2017,pct of presence in 2018,Absolute change,Relative change
0,dress,48.82,51.48,47.11,-4.37,-8.49
1,black,28.28,25.78,29.89,4.11,15.94
2,white,27.09,23.04,29.7,6.66,28.91
3,pants,26.59,24.26,28.08,3.82,15.75
4,midi,22.54,25.25,20.79,-4.46,-17.66
5,shirt,20.81,19.24,21.82,2.58,13.41
6,ski,20.42,20.68,20.25,-0.43,-2.08
7,skirt,20.04,20.46,19.77,-0.69,-3.37
8,pattern,19.5,9.43,25.98,16.55,175.5
9,print,18.96,15.97,20.89,4.92,30.81


In [22]:
# Saving results from above to a .csv file
tags_merged.to_csv('tags_change.csv', index=False)

### Question 4
Export the results of the first 3 questions to .csv files and create a Tableau dashboard to visually represent the first three questions. Add 2 selectable filters to the dashboard: seasons and keywords. To use Tableau Public, you need to create an account here: https://www.tableau.com/fr-fr/products/public

In [23]:
# The .csv files files are exported above.

### Some insights

*  Dataset contains information on womenswear for two seasons (spring-summer 2017 and spring-summer 2018).
*  The number of looks in 2018 increased by about 55% compared to 2017.
*  Looks from a total of 59 designers are presented in the dataset. Among them, 24 designers are featured only at one of the seasons.
*  The dataset has almost 1000 unique tags. The most popular tags are: 'dress', 'black', 'white'.
*  Tags with the largest absolute increase of presence are 'pattern', 'long-sleeves', and 'head-accessory'. Tags with the largest absolute decrease of presence are 'embroidery', 'floral', and 'midi-dress'.
*  Tags with one the most significant relative increase are 'texture', 'drawstring', 'button'. Tags with one of the most significant relative decrease are 'silk', 'folk', 'folkloric'.