<a href="https://colab.research.google.com/github/ShaunakSen/Data-Science-and-Machine-Learning/blob/master/Colors_dataset_consolidated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import libraries

In [100]:
!pip install python-Levenshtein
!pip install fuzzywuzzy



In [101]:
from urllib.request import urlopen
import json
import os
import pickle
import pandas as pd
import numpy as np

from fuzzywuzzy import fuzz

### Read the JSON data file

In [102]:
# JSON text is UTF-8; pass the encoding explicitly so non-ASCII colour names
# load the same way regardless of the platform's default locale encoding.
with open('./sample_data/colors_data.json', encoding='utf-8') as f:
  data = json.load(f)

In [103]:
data['colors'][0]

{'hex': '#c93f38',
 'hsl': {'h': 2.896551724137933,
  'l': 50.3921568627451,
  's': 57.31225296442687},
 'luminance': 70.85364929204422,
 'name': '100 Mph',
 'rgb': {'b': 56, 'g': 63, 'r': 201}}

In [104]:
# Normalised lookup list: one lower-cased, whitespace-trimmed name per entry
# of data['colors'], followed by a count of how many names were collected.
color_names = list(map(lambda entry: entry['name'].lower().strip(), data['colors']))
print(len(color_names))

25611


### Read csv 1 data file

In [105]:
color_csv1 = pd.read_csv('./sample_data/colorhexa_com.csv')
color_csv1

Unnamed: 0,Name,Hex (24 bit),Red (8 bit),Green (8 bit),Blue (8 bit),Hue (degrees),HSL.S (%),HSL.L (%)
0,Air Force blue,#5d8aa8,93,138,168,204.0,30.1,51.2
1,Alice blue,#f0f8ff,240,248,255,208.0,100.0,97.1
2,Alizarin crimson,#e32636,227,38,54,354.9,77.1,52.0
3,Almond,#efdecd,239,222,205,30.0,51.5,87.1
4,Amaranth,#e52b50,229,43,80,348.1,78.2,53.3
...,...,...,...,...,...,...,...,...
741,Yellow,#ffff00,255,255,0,60.0,100.0,50.0
742,Yellow Orange,#ffae42,255,174,66,34.3,100.0,62.9
743,Yellow green,#9acd32,154,205,50,79.7,60.8,50.0
744,Zaffre,#0014a8,0,20,168,232.9,100.0,32.9


In [106]:
# Normalise the CSV names (lower-case, surrounding whitespace removed) so they
# can be compared against the names already collected in `color_names`.
color_names_1 = list(map(lambda raw_name: raw_name.lower().strip(), color_csv1['Name']))

print(len(color_names_1))

# Keep the normalised name next to the original row for later look-ups.
color_csv1['new_name'] = color_names_1

746


### Match: Simple

In [107]:
def simple_match(new_names, names_main):
    """Find the names in ``new_names`` that have no exact match in ``names_main``.

    Args:
        new_names: Iterable of candidate names (already normalised).
        names_main: Collection of reference names to match against.

    Returns:
        A ``(unmatched_names, unmatched_count)`` tuple. ``unmatched_names``
        lists each unmatched name once, in order of first appearance;
        ``unmatched_count`` counts every unmatched occurrence, so repeated
        unmatched names are counted each time they appear.
    """
    # Set membership is O(1); scanning the full reference list for every
    # candidate made the original accidentally quadratic.
    known = set(names_main)
    already_recorded = set()
    unmatched_names, unmatched_count = [], 0
    for name_ in new_names:
        if name_ in known:
            continue
        unmatched_count += 1
        if name_ not in already_recorded:
            already_recorded.add(name_)
            unmatched_names.append(name_)
    return unmatched_names, unmatched_count

main_csv1_names, main_csv1_count = simple_match(color_names_1, color_names)

main_csv1_count

89

### Match: Fuzzy

Approach:

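Use `fuzz.ratio` to merge the duplicates: a missing name whose similarity score against any existing name is above 90 is treated as a duplicate and skipped; every other missing name is added to the main data.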

In the final match we can use the other techniques

In [108]:
color_csv1.loc[color_csv1['new_name'] == "dark sea green"]

Unnamed: 0,Name,Hex (24 bit),Red (8 bit),Green (8 bit),Blue (8 bit),Hue (degrees),HSL.S (%),HSL.L (%),new_name
197,Dark sea green,#8fbc8f,143,188,143,120.0,25.1,64.9,dark sea green


In [109]:
def fuzzy_match(names_missing, names_main, df, main_data, threshold=90):
    """Add colours that have no near-duplicate name to ``main_data``.

    For each name in ``names_missing`` the function looks for a name in
    ``names_main`` whose ``fuzz.ratio`` similarity is strictly greater than
    ``threshold``. If one is found, the pair is printed as
    ``<main name> || <missing name>`` and the missing name is treated as a
    duplicate. Otherwise the colour's RGB values are read from ``df`` and a
    new entry is appended to ``main_data['colors']``.

    Args:
        names_missing: Names that were not found by exact matching.
        names_main: Names already present in the main data.
        df: DataFrame with a ``new_name`` column plus ``Red (8 bit)``,
            ``Green (8 bit)`` and ``Blue (8 bit)`` columns.
        main_data: Dict holding a ``'colors'`` list; the list is extended
            in place.
        threshold: Similarity score a pair must exceed to count as a
            duplicate. Defaults to 90, the value previously hard-coded.

    Returns:
        The same ``main_data`` object that was passed in.

    Raises:
        ValueError: If a name that should be added has no row in ``df``.
    """
    for mis_ in names_missing:
        for main_ in names_main:
            # The first sufficiently similar name is enough; stop searching.
            if fuzz.ratio(main_, mis_) > threshold:
                print (main_, "||", mis_)
                break
        else:
            # No near-duplicate found: add the colour to the main data.
            rows = df.loc[df['new_name'] == mis_]
            if rows.empty:
                raise ValueError(f"no row with new_name == {mis_!r} in df")
            # Several source rows can normalise to the same name; use the
            # first one. Calling int() on the whole Series fails in that case
            # and is deprecated even for a single-row Series.
            first = rows.iloc[0]
            r = int(first['Red (8 bit)'])
            g = int(first['Green (8 bit)'])
            b = int(first['Blue (8 bit)'])
            main_data['colors'].append({'name': mis_, 'rgb': {'b': b, 'g': g, 'r': r}})

    return main_data

modified_main_data = fuzzy_match(main_csv1_names, color_names, color_csv1, data)

burlwood || burlywood
dark seagreen || dark sea green
dark slate grey || dark slate gray
grey asparagus || gray asparagus
hooker's green || hooker green
lavender grey || lavender gray
pastel grey || pastel gray
payne's grey || payne grey
purple mountain majesty || purple mountain's majesty
sand brown || sandy brown
screamin' green || screamin green
storm cloud || stormcloud
terracotta || terra cotta
timber wolf || timberwolf
tropical rainforest || tropical rain forest


In [110]:
len(modified_main_data['colors'])

25685

In [111]:
modified_main_data['colors'][25680:]

[{'name': 'saint patrick blue', 'rgb': {'b': 122, 'g': 41, 'r': 35}},
 {'name': 'slate gray', 'rgb': {'b': 144, 'g': 128, 'r': 112}},
 {'name': 'taupe gray', 'rgb': {'b': 137, 'g': 133, 'r': 139}},
 {'name': 'tawny', 'rgb': {'b': 0, 'g': 87, 'r': 205}},
 {'name': 'tiger eye', 'rgb': {'b': 60, 'g': 141, 'r': 224}}]

In [112]:
color_csv1.loc[color_csv1['new_name'] == 'tawny']

Unnamed: 0,Name,Hex (24 bit),Red (8 bit),Green (8 bit),Blue (8 bit),Hue (degrees),HSL.S (%),HSL.L (%),new_name
663,Tawny,#cd5700,205,87,0,25.5,100.0,40.2,tawny


In [113]:
color_names = [color['name'].lower().strip() for color in modified_main_data['colors']]
print (len(color_names))

25685


---

## Read csv 2 data file

In [114]:
color_csv2 = pd.read_csv('./sample_data/wikipedia_color_names.csv')
color_csv2

Unnamed: 0,Name,Hex (24 bit),Red (8 bit),Green (8 bit),Blue (8 bit),Hue (degrees),HSL.S (%),"HSL.L (%), HSV.S (%), HSV.V (%)"
0,Absolute zero,#0048BA,0,72,186,217.0,100.0,37.0
1,Acid green,#B0BF1A,176,191,26,65.0,76.0,43.0
2,Aero,#7CB9E8,124,185,232,206.0,70.0,70.0
3,Aero blue,#C9FFE5,201,255,229,151.0,100.0,89.0
4,African violet,#B284BE,178,132,190,288.0,31.0,63.0
...,...,...,...,...,...,...,...,...
1293,Yellow rose,#FFF000,255,240,0,56.0,100.0,50.0
1294,Yellow Sunshine,#FFF700,255,247,0,58.0,100.0,50.0
1295,Zaffre,#0014A8,0,20,168,233.0,100.0,33.0
1296,Zinnwaldite brown,#2C1608,44,22,8,23.0,69.0,10.0


In [115]:
color_names_2 = [name.lower().strip() for name in color_csv2['Name']]

print (len(color_names_2))

color_csv2['new_name'] = color_names_2

1298


In [116]:
main_csv2_names, main_csv2_count = simple_match(color_names_2, color_names)
main_csv2_count

182

In [117]:
modified_main_data = fuzzy_match(main_csv2_names, color_names, color_csv2, modified_main_data)

blue magenta violet || blue-magenta violet
blue violet || blue-violet
burlwood || burlywood
dark seagreen || dark sea green
dark slate grey || dark slate gray
dolphin grey || dolphin gray
go green! || go green
granite grey || granite gray
green pigment || green (pigment)
green yellow || green-yellow
heliotrope grey || heliotrope gray
lavender grey || lavender gray
magenta dye || magenta (dye)
magenta pink || magenta-pink
medium red violet || medium red-violet
medium violet red || medium violet-red
orange yellow || orange-yellow
pale red violet || pale red-violet
pale violet red || pale violet-red
pastel grey || pastel gray
peach orange || peach-orange
peach yellow || peach-yellow
pink orange || pink-orange
plum purple || plump purple
quicksilver || quick silver
red pigment || red (pigment)
robin's egg blue || robin egg blue
rubber || ruber
sand brown || sandy brown
seafoam green || sea foam green
spanish grey || spanish gray
storm cloud || stormcloud
terracotta || terra cotta
timber wo

In [118]:
len(modified_main_data['colors'])

25829

In [119]:
color_names = [color['name'].lower().strip() for color in modified_main_data['colors']]
print (len(color_names))

25829


---
## Read csv 3 data file

In [120]:
color_csv3 = pd.read_csv('./sample_data/wikipedia_x11_colors.csv')
color_csv3

Unnamed: 0,Name,Hex (24 bit),Red (8 bit),Green (8 bit),Blue (8 bit),Hue (degrees),HSL.S (%),"HSL.L (%), HSV.S (%), HSV.V (%)"
0,"Blue, Alice !Alice Blue",#F0F8FF,240,248,255,208.0,100.0,97.0
1,"White, Antique !Antique White",#FAEBD7,250,235,215,34.0,78.0,91.0
2,Aqua !Aqua,#00FFFF,0,255,255,180.0,100.0,50.0
3,Aquamarine !Aquamarine,#7FFFD4,127,255,212,160.0,100.0,75.0
4,Azure !Azure,#F0FFFF,240,255,255,180.0,100.0,97.0
...,...,...,...,...,...,...,...,...
140,Wheat !Wheat,#F5DEB3,245,222,179,39.0,77.0,83.0
141,White !White,#FFFFFF,255,255,255,0.0,0.0,100.0
142,WhiteSmoke !White Smoke,#F5F5F5,245,245,245,0.0,0.0,96.0
143,Yellow !Yellow,#FFFF00,255,255,0,60.0,100.0,50.0


In [121]:
# Names in this file look like "Blue, Alice !Alice Blue"; keep the second
# "!"-separated field, trimmed and lower-cased.
color_names_3 = list(map(lambda raw_name: raw_name.split("!")[1].strip().lower(), color_csv3['Name']))

color_names_3[:5]

['alice blue', 'antique white', 'aqua', 'aquamarine', 'azure']

In [122]:
color_csv3['new_name'] = color_names_3

In [123]:
main_csv3_names, main_csv3_count = simple_match(color_names_3, color_names)
main_csv3_count

9

In [124]:
modified_main_data = fuzzy_match(main_csv3_names, color_names, color_csv3, modified_main_data)

burlwood || burlywood
dark seagreen || dark sea green
dark slate grey || dark slate gray
sand brown || sandy brown


In [125]:
len(modified_main_data['colors'])

25834

In [126]:
color_names = [color['name'].lower().strip() for color in modified_main_data['colors']]
print (len(color_names))

25834


## Read csv 4 data file

In [127]:
color_csv4 = pd.read_csv('./sample_data/colors4.csv')
color_csv4

Unnamed: 0,Name1,Name2,Hex,Red (8 bit),Green (8 bit),Blue (8 bit)
0,air_force_blue_raf,Air Force Blue (Raf),#5d8aa8,93,138,168
1,air_force_blue_usaf,Air Force Blue (Usaf),#00308f,0,48,143
2,air_superiority_blue,Air Superiority Blue,#72a0c1,114,160,193
3,alabama_crimson,Alabama Crimson,#a32638,163,38,56
4,alice_blue,Alice Blue,#f0f8ff,240,248,255
...,...,...,...,...,...,...
860,yellow_orange,Yellow Orange,#ffae42,255,174,66
861,yellow_process,Yellow (Process),#ffef00,255,239,0
862,yellow_ryb,Yellow (Ryb),#fefe33,254,254,51
863,zaffre,Zaffre,#0014a8,0,20,168


In [128]:
color_csv4.columns

Index(['Name1', 'Name2', 'Hex', 'Red (8 bit)', 'Green (8 bit)',
       'Blue (8 bit)'],
      dtype='object')

In [129]:
color_names_4 = [name.strip().lower() for name in color_csv4['Name2']]

color_names_4[:5]

['air force blue (raf)',
 'air force blue (usaf)',
 'air superiority blue',
 'alabama crimson',
 'alice blue']

In [130]:
color_csv4['new_name'] = color_names_4

In [131]:
main_csv4_names, main_csv4_count = simple_match(color_names_4, color_names)
main_csv4_count

42

In [132]:
modified_main_data = fuzzy_match(main_csv4_names, color_names, color_csv4, modified_main_data)

blue violet || blue-violet
burlwood || burlywood
dark seagreen || dark sea green
dark slate grey || dark slate gray
green pigment || green (pigment)
green yellow || green-yellow
indigo dye || indigo (dye)
lavender grey || lavender gray
magenta dye || magenta (dye)
medium red violet || medium red-violet
medium violet red || medium violet-red
pale red violet || pale red-violet
pale violet red || pale violet-red
pastel grey || pastel gray
peach orange || peach-orange
peach yellow || peach-yellow
pink orange || pink-orange
red pigment || red (pigment)
robin's egg blue || robin egg blue
sand brown || sandy brown
storm cloud || stormcloud
terracotta || terra cotta
timber wolf || timberwolf
tropical rainforest || tropical rain forest
vermilion cinnabar || vermilion (cinnabar)
violet blue || violet-blue
yellow green || yellow-green


In [133]:
len(modified_main_data['colors'])

25849

In [137]:
color_names = [color['name'].lower().strip() for color in modified_main_data['colors']]
print (len(color_names))

25849


### Convert the JSON data to CSV

In [135]:
# Scratch check: dict.get() on a missing key yields None instead of raising,
# so the expression below evaluates to True.
test_dict = {'name': 'mini', 'eats': 'chocolate'}

test_dict.get('name2') is None

True

In [140]:
# Flatten every colour record into column lists for the CSV export.
# The normalised name is derived from the record itself instead of being read
# from the parallel `color_names` list, so this cell cannot misalign or raise
# IndexError when that list was not refreshed after the last merge.
colors = modified_main_data['colors']
data_to_write = {
    'color_name': [color_data['name'] for color_data in colors],
    'preprocessed_name': [color_data['name'].lower().strip() for color_data in colors],
    'R': [color_data['rgb']['r'] for color_data in colors],
    'G': [color_data['rgb']['g'] for color_data in colors],
    'B': [color_data['rgb']['b'] for color_data in colors],
    # Entries appended from the CSV files carry no 'hex' key; .get() yields None for them.
    'hex': [color_data.get('hex') for color_data in colors],
}

# Sanity check: one output row per colour record.
print (len(data_to_write['color_name']), len(modified_main_data['colors']))

25849 25849


In [141]:
# Write the consolidated table to CSV. With pandas' default `index=True` the
# row index is written as an unnamed first column of the file.
pd.DataFrame(data=data_to_write).to_csv("./sample_data/color_names_consolidated.csv")