In [1]:
import pandas as pd
import numpy as np
import json
import re

In [2]:
# Load the scraped records, keep only rows that have a name, turn the
# remaining missing values into NaN and discard the 'details' column.
raw_data = (
    pd.read_json('raw_data.json', orient='index')
    .dropna(subset=['name'])
    .fillna(value=np.nan)
    .drop(columns=['details'])
)

In [3]:
# Preview the cleaned frame; the notebook renders a truncated view of its rows.
raw_data

Unnamed: 0,name,price,sex,jewel_type,brand,material,color,jewel_weight,rocks,rock_details,dimensions,chain_carat,chain_length,diameter
0,Σετ Κολιέ & Σκουλαρίκια Gloria Hope Silver / P...,75,Women,Necklaces,Gloria Hope,Κράμα Μετάλλων,White,,,,,,,
1,Σετ Κολιέ & Σκουλαρίκια Gloria Hope Silver / P...,109,Women,Necklaces,Gloria Hope,Κράμα Μετάλλων,White,,,,,,,
2,Σετ Κολιέ & Σκουλαρίκια Gloria Hope Silver / P...,99,Women,Necklaces,Gloria Hope,Κράμα Μετάλλων,White,,,,,,,
3,Σετ Κολιέ & Σκουλαρίκια Gloria Hope Silver,69,Women,Necklaces,Gloria Hope,Silver,White,,,,,,,
4,Κολιέ δάκρυ Gloria Hope Silver,69,Women,Necklaces,Gloria Hope,Κράμα Μετάλλων,White,,,,Μήκος: 6cm,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8609,Βέρα γάμου,350,Women,Wedding rings,Haritidis,14 carat gold,Gold,5.0,,,Φάρδος 5mm,,,
8610,βέρα γάμου,396,,Wedding rings,Haritidis,18 carat gold,White & Gold,5.4,Diamond,,,,,
8611,Βέρα γάμου,235,Unisex,Wedding rings,Haritidis,14 carat gold,White & Rose,2.7,,,Φάρδος 3mm,,,
8612,Βέρα γάμου,390,Unisex,Wedding rings,Haritidis,14 carat gold,Gold,4.8,,,Φάρδος 4.5mm,,,


In [4]:
from googletrans import Translator
# Shared translator client reused by the glossary-building cells.
trans = Translator()

# Smoke test with Greek ('el') as the source language; the recorded output is 'rain'.
trans.translate('βροχή',src='el').text

'rain'

In [5]:
def is_ascii(s: str) -> bool:
    """Return True when every character of ``s`` has a code point below 128.

    An empty string counts as ASCII.
    """
    for char in s:
        if ord(char) >= 128:
            return False
    return True

In [54]:
# Build the Greek -> English glossary: every non-ASCII word found in a product
# name is sent to the translator once and cached under its original spelling.
all_names = raw_data['name'].unique()

all_words = {}
for product_name in all_names:
    for token in product_name.split(' '):
        needs_translation = not is_ascii(token) and token not in all_words
        if needs_translation:
            all_words[token] = trans.translate(token, src='el').text

In [7]:
# Proof of concept: translate a phrase word by word through a hand-written glossary.
glossary = {
    'Σκουλαρίκια': 'Earrings',
    'με': 'with',
    'σμαράγδι': 'emerald',
}
test_name = 'Σκουλαρίκια με σμαράγδι'

# Every word of the phrase must be a glossary key; an unknown word raises KeyError.
translated_words = map(glossary.__getitem__, test_name.split(' '))
new_name = ' '.join(translated_words)
print(test_name, new_name)

Σκουλαρίκια με σμαράγδι Earrings with emerald


In [8]:
def translate_name(greek_name: str, glossary: dict = None) -> str:
    """Translate a space-separated phrase word by word.

    The phrase is split on single spaces. Each token that has an entry in the
    glossary is swapped for its translation; every other token is kept as is.

    Args:
        greek_name: Phrase to translate.
        glossary: Optional word -> translation mapping. When omitted, the
            notebook-level ``all_words`` dictionary is used.

    Returns:
        The phrase with every known token replaced. Spacing is preserved
        because the tokens are re-joined with the separator they were split on.
    """
    if glossary is None:
        glossary = all_words

    # Substitute whole tokens only. Running str.replace over the full phrase
    # would also rewrite a glossary word wherever it happens to occur inside a
    # longer token (e.g. 'με' inside 'μερικά').
    return ' '.join(glossary.get(word, word) for word in greek_name.split(' '))

In [32]:
# Translate every product name; the function is passed directly, no wrapper lambda.
raw_data['name'] = raw_data['name'].apply(translate_name)

In [33]:
# Spot-check the translated product names.
raw_data['name']

0       Set Necklace & Earrings Gloria Hope Silver / P...
1       Set Necklace & Earrings Gloria Hope Silver / P...
2       Set Necklace & Earrings Gloria Hope Silver / P...
3              Set Necklace & Earrings Gloria Hope Silver
4                        Necklace tear Gloria Hope Silver
                              ...                        
8609                                 Wedding ring wedding
8610                                 wedding ring wedding
8611                                 Wedding ring wedding
8612                                 Wedding ring wedding
8613                                         Stavros male
Name: name, Length: 8613, dtype: object

In [11]:
# List each distinct material label on its own line to see which ones need renaming.
for material in raw_data['material'].unique():
    print(material)

Κράμα Μετάλλων
Silver
18 carat gold
Stainless Steel
14 carat gold
9 carat gold
Brass
nan
Wood


In [12]:
# Normalise the material labels: translate the one Greek value and shorten the
# gold grades. The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `raw_data['material']`: that chained in-place
# call operates on a temporary Series and does not update `raw_data` when
# pandas Copy-on-Write is active.
raw_data['material'] = raw_data['material'].replace({
    'Κράμα Μετάλλων': 'Metal Alloy',
    '18 carat gold': 'Gold 18ct.',
    '14 carat gold': 'Gold 14ct.',
    '9 carat gold': 'Gold 9ct.',
})

In [62]:
def process_dimensions(dim: str) -> str:
    """Normalise and translate one free-text dimensions / stone-details value.

    Steps, in order:
      1. decimal commas become points and colons are removed;
      2. a few Greek word forms are replaced with fixed English text;
      3. the remaining words are translated with ``translate_name``;
      4. an 'x' inside a token that contains a digit is spaced out, so
         '25.6x18.9' reads '25.6 x 18.9'.

    Args:
        dim: Raw cell value. Missing cells (NaN, None) are accepted.

    Returns:
        The cleaned string, or ``dim`` unchanged when it is not a string.
    """
    # Missing values are not always the np.nan singleton, so an identity check
    # is unreliable; anything that is not text is handed back untouched.
    if not isinstance(dim, str):
        return dim

    dim = dim.replace(',', '.').replace(':', '')

    # Fixed replacements applied as substrings before the word-level lookup.
    fixed_translations = (
        ('κρίκων', 'of rings'),
        ('καρπό', 'wrist'),
        ('παραμάνας', 'of pin'),
        ('κύκλων', 'of circles'),
        ('Μήκος', 'Length'),
    )
    for greek, english in fixed_translations:
        dim = dim.replace(greek, english)

    dim = translate_name(dim)

    # Space out the 'x' separator only inside tokens that carry a digit.
    # Replacing every 'x' in the string would also split translated words
    # that merely contain the letter (e.g. 'Onyx' -> 'Ony x ').
    dim = ' '.join(
        token.replace('x', ' x ') if any(char.isdigit() for char in token) else token
        for token in dim.split(' ')
    )

    # Colons were stripped above, so this only affects a colon that a glossary
    # translation brought back.
    dim = dim.replace(':', ': ')

    return dim

In [76]:
# Extend the glossary with the non-ASCII words found in the dimensions column.
# Missing values are removed explicitly: slicing unique()[1:] only works when
# NaN happens to be the first distinct value, and would otherwise drop a real
# entry and leave NaN in the loop.
# NOTE(review): words are collected from the raw text, so a token written with
# a trailing ':' or containing ',' is stored with that punctuation - confirm
# this matches the lookups done after process_dimensions strips it.
all_dimensions = raw_data['dimensions'].dropna().unique()

for dimensions in all_dimensions:
    for word in dimensions.split(' '):
        if word not in all_words and not is_ascii(word):
            all_words[word] = trans.translate(word, src='el').text

In [77]:
# Clean and translate each dimensions value; NaN cells are handled inside the function.
raw_data['dimensions'] = raw_data['dimensions'].apply(process_dimensions)

In [78]:
# Spot-check the normalised and translated dimensions column.
raw_data['dimensions']

0                  NaN
1                  NaN
2                  NaN
3                  NaN
4           Length 6cm
             ...      
8609         Width 5mm
8610               NaN
8611         Width 3mm
8612       Width 4.5mm
8613    25.6 x 18.9 mm
Name: dimensions, Length: 8613, dtype: object

In [71]:
# Extend the glossary with the non-ASCII words found in the rock_details column.
# Missing values are removed explicitly: slicing unique()[1:] only works when
# NaN happens to be the first distinct value.
all_rock_details = raw_data['rock_details'].dropna().unique()

for rock_details in all_rock_details:
    for word in rock_details.split(' '):
        if word in all_words or is_ascii(word):
            continue
        try:
            all_words[word] = trans.translate(word, src='el').text
        except IndexError:
            # Best effort: a word the translator call fails on with IndexError
            # is left out of the glossary and stays untranslated.
            continue
        # Print each newly added word so its translation can be reviewed by hand.
        print(word)

Ιολίτης
Μαργαριταριού
λευκά


In [80]:
# Hand-picked translations that take precedence over whatever the glossary holds.
all_words.update({
    'Ιολίτης': 'Iolite',
    'Μαργαριταριού': 'Pearl',
    'λευκά': 'white',
    'διαμάντια': 'diamonds',
})

In [81]:
# The stone details go through the same clean-up and translation as the dimensions.
raw_data['rock_details'] = raw_data['rock_details'].apply(process_dimensions)

In [82]:
# Persist the accumulated glossary as JSON.
with open('all_words.json', 'w') as glossary_file:
    json.dump(all_words, glossary_file)

In [83]:
# Export the cleaned, translated table; index=False leaves the row labels out of the file.
raw_data.to_csv('jewelry.csv', index=False)