# Manipulate the scraped data

## Loading part

In [1]:
# Import relevant libraries
import shutil
from pathlib import Path

import pandas as pd
import requests
from IPython.display import display, HTML
# Inject a CSS rule that sets the width of the `.container` element to 78%
display(HTML("<style>.container { width:78% !important; }</style>"))

In [2]:
# Load the scraped data from the CSV file into `df` and preview the first rows
df = pd.read_csv('baklava_scrape.csv')
df.head()

Unnamed: 0,baklava_name,baklava_size,baklava_price,baklava_image_url
0,Pistachio-Walnut Baklava Assortment,XL Metal Box),"\n€51,71\n",https://online.hafizmustafa.com/Uploads/UrunRe...
1,Premium Baklava Assortment,L Metal Box ),"\n€45,96\n",https://online.hafizmustafa.com/Uploads/UrunRe...
2,Pistachio-Walnut Baklava Assortment,L Metal Box),"\n€42,52\n",https://online.hafizmustafa.com/Uploads/UrunRe...
3,Pistachio Baklava Assortment,XL Metal Box),"\n€57,45\n",https://online.hafizmustafa.com/Uploads/UrunRe...
4,Pistachio-Walnut Baklava Assortment,S Metal Box),"\n€21,83\n",https://online.hafizmustafa.com/Uploads/UrunRe...


## Download the images from their URLs into the repository

In [3]:
# Download every product image into ./imagefolder/. Each file is named after
# the part of its URL that follows "thumb/".
image_dir = Path('./imagefolder')
# Create the target folder up front so the first write cannot fail when the
# folder does not exist yet.
image_dir.mkdir(parents=True, exist_ok=True)

for url in df['baklava_image_url']:
    image_name = url.split("thumb/")[1]
    # A timeout keeps one unresponsive request from hanging the whole cell.
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of saving an error page as an image.
    response.raise_for_status()
    # Write straight to the destination rather than creating the file in the
    # working directory and moving it while its handle is still open.
    (image_dir / image_name).write_bytes(response.content)
    

## Add a feature containing the path to each image in the repository

In [4]:
# Build the image path for every row: the fixed folder prefix followed by the
# file name, i.e. the part of the image URL after "thumb/".
# NOTE(review): `base` starts with "/", whereas the download cell saves the
# images under "./imagefolder/" - confirm the consumer of this column expects
# a root-relative path.
base = '/imagefolder/'
baklava_path = [
    base + image_url.split('thumb/')[1]
    for image_url in df['baklava_image_url']
]

# Store the paths as a new feature next to the original URL
df['baklava_image_path'] = baklava_path

In [5]:
# Preview the first two rows to check the new `baklava_image_path` column
df.head(2)

Unnamed: 0,baklava_name,baklava_size,baklava_price,baklava_image_url,baklava_image_path
0,Pistachio-Walnut Baklava Assortment,XL Metal Box),"\n€51,71\n",https://online.hafizmustafa.com/Uploads/UrunRe...,/imagefolder/fistikli-cevizli-karisik-baklava-...
1,Premium Baklava Assortment,L Metal Box ),"\n€45,96\n",https://online.hafizmustafa.com/Uploads/UrunRe...,/imagefolder/premium-karisik-baklava-l-kutu-47...


## Cleaning and adding relevant features

### Adding premium feature

In [143]:
# Checkpoint: clean a copy, so this section can be re-run from `df` without
# restarting the entire notebook
df_clean = df.copy()

# Some of the baklava are Premium; flag the rows whose name mentions it
is_premium = df_clean['baklava_name'].str.contains('Premium')
indexes_premium = is_premium[is_premium].index
indexes_not_premium = is_premium[~is_premium].index

# Add the new feature with the labels Premium / Not Premium
df_clean.loc[indexes_premium, 'premium'] = 'Premium'
df_clean.loc[indexes_not_premium, 'premium'] = 'Not Premium'

# The information now lives in its own column, so remove "Premium " from the names
df_clean['baklava_name'] = df_clean['baklava_name'].str.replace('Premium ', '')

### Cleaning baklava_size feature and handling missing values

In [144]:
# Many size labels contain a stray ")". Remove it as a literal character:
# regex=False is passed explicitly because ")" on its own is not a valid
# regular expression, and relying on the default `regex` value makes pandas
# emit a FutureWarning for single-character patterns.
df_clean['baklava_size'] = df_clean['baklava_size'].str.replace(')', '', regex=False)

# Show the observations where the size is missing. display() is needed
# because only the last expression of a cell is rendered automatically.
display(df_clean[df_clean['baklava_size'].isna()])

# For these products the size appears in the name, so copy it over
df_clean.loc[df_clean['baklava_name'] == 'Ankara Walnut Baklava S Box', 'baklava_size'] = 'S Box'
df_clean.loc[df_clean['baklava_name'] == 'Baklava, Halep Kadayif with Pistachio L Box', 'baklava_size'] = 'L Box'

# Now the size can be removed from the name (literal match; the leftover
# trailing space is not removed here)
df_clean['baklava_name'] = df_clean['baklava_name'].str.replace('S Box', '', regex=False)
df_clean['baklava_name'] = df_clean['baklava_name'].str.replace('L Box', '', regex=False)


  df_clean['baklava_size'] = df_clean['baklava_size'].str.replace(')','')


### Adding the tin feature and removing whitespace

In [145]:
# Find the rows packaged in a tin: either the name is exactly the HM1864 tin
# product (note the trailing space in the scraped name) or the size label
# mentions "Tin".
tin_index_in_name = df_clean[df_clean['baklava_name'] == 'HM1864 Mixed Special Metal Tin Box '].index
# na=False treats a missing size as "no match"; without it a NaN ends up in
# the boolean mask and the row selection raises.
tin_index_in_size = df_clean[df_clean['baklava_size'].str.contains('Tin', na=False, regex=False)].index
all_tin_indices = tin_index_in_name.union(tin_index_in_size)

# Every remaining row is not packaged in a tin
all_index = df_clean.index
no_tin_indeces = all_index.difference(all_tin_indices)

# Fill the new 'tin' feature with the corresponding package material
df_clean.loc[all_tin_indices, 'tin'] = 'Tin'
df_clean.loc[no_tin_indeces, 'tin'] = 'Not Tin'

# The material now has its own feature, so remove it from name and size
# (literal matches, hence regex=False)
df_clean['baklava_name'] = df_clean['baklava_name'].str.replace(' Metal Tin Box ', '', regex=False)
df_clean['baklava_size'] = df_clean['baklava_size'].str.replace(' - Tin Box', '', regex=False)

# Remove spaces at the left and right ends of all strings
df_clean['baklava_name'] = df_clean['baklava_name'].str.strip()
df_clean['baklava_size'] = df_clean['baklava_size'].str.strip()


### baklava_price

In [146]:
# Turn the scraped price text into a plain decimal string in one chain:
# drop the newlines, drop the euro sign, and use "." as the decimal separator.
price_text = (
    df_clean['baklava_price']
    .replace('\n', '', regex=True)
    .replace('€', '', regex=True)
    .replace(',', '.', regex=True)
)

# The currency moves from the values into the feature name
df_clean = df_clean.rename(columns={'baklava_price': 'baklava_price_euro'})

# Store the prices as numbers; values that cannot be parsed become NaN
df_clean['baklava_price_euro'] = pd.to_numeric(price_text, errors='coerce')

### Harmonising the different namings of the size

In [147]:
# The counts printed below show that the same size is spelled in several ways,
# sometimes in Turkish ("Kutu"). All images show the same metal packaging, so
# "Metal" is not kept in the label, and tin packaging already has its own
# feature. The spellings are therefore only renamed to one label per size.
print(df_clean['baklava_size'].value_counts())

# Whole-value mapping: a label is replaced only when the entire value equals a
# key. A substring replacement of 'S' would also hit the 'S' inside labels that
# are already 'S Box' and produce 'S Box Box'.
# Spellings that are not listed here are left unchanged.
size_aliases = {
    'XL Metal Box': 'XL Box',
    'L Metal Box': 'L Box',
    'M Metal Box': 'M Box',
    'S Metal Box': 'S Box',
    'XL Metal': 'XL Box',
    'L Kutu': 'L Box',
    'S Metal': 'S Box',
    'S Kutu': 'S Box',
    'Large Box': 'L Box',
    'S': 'S Box',
}
df_clean['baklava_size'] = df_clean['baklava_size'].replace(size_aliases)

S Box           20
XL Metal Box    11
L Metal Box      6
XL Box           5
M Metal Box      4
L Box            4
S Metal Box      3
XL Metal         3
L Kutu           2
S Metal          1
S Kutu           1
Large Box        1
S                1
500 Gr           1
1 Kg             1
Name: baklava_size, dtype: int64


In [148]:
# Collapse a doubled suffix ("S Box Box") back to "S Box" using a literal
# match, then list the remaining size labels with their counts
df_clean['baklava_size'] = df_clean['baklava_size'].str.replace(
    'S Box Box', 'S Box', regex=False
)
df_clean['baklava_size'].value_counts()

S Box     26
XL Box    19
L Box     13
M Box      4
500 Gr     1
1 Kg       1
Name: baklava_size, dtype: int64

## Data visualization

In [149]:
# Display the cleaned dataframe
df_clean

Unnamed: 0,baklava_name,baklava_size,baklava_price_euro,baklava_image_url,baklava_image_path,premium,tin
0,Pistachio-Walnut Baklava Assortment,XL Box,51.71,https://online.hafizmustafa.com/Uploads/UrunRe...,/imagefolder/fistikli-cevizli-karisik-baklava-...,Not Premium,Not Tin
1,Baklava Assortment,L Box,45.96,https://online.hafizmustafa.com/Uploads/UrunRe...,/imagefolder/premium-karisik-baklava-l-kutu-47...,Premium,Not Tin
2,Pistachio-Walnut Baklava Assortment,L Box,42.52,https://online.hafizmustafa.com/Uploads/UrunRe...,/imagefolder/fistikli-cevizli-karisik-baklava-...,Not Premium,Not Tin
3,Pistachio Baklava Assortment,XL Box,57.45,https://online.hafizmustafa.com/Uploads/UrunRe...,/imagefolder/fistikli-karisik-baklava-xl-kutu-...,Not Premium,Not Tin
4,Pistachio-Walnut Baklava Assortment,S Box,21.83,https://online.hafizmustafa.com/Uploads/UrunRe...,/imagefolder/fistikli-cevizli-karisik-baklava-...,Not Premium,Not Tin
...,...,...,...,...,...,...,...
59,Mixed Ottoman Kadayif,500 Gr,14.94,https://online.hafizmustafa.com/Uploads/UrunRe...,/imagefolder/karisik-osmanli-kadayifi-500-gr-9...,Not Premium,Not Tin
60,Fıstıklı Osmanlı Kadayıfı,S Box,32.17,https://online.hafizmustafa.com/Uploads/UrunRe...,/imagefolder/fistikli-osmanli-kadayifi-s-kutu-...,Not Premium,Not Tin
61,Pistachio Halep Kadayif,1 Kg,22.98,https://online.hafizmustafa.com/Uploads/UrunRe...,/imagefolder/fistikli-halep-kadayifi-s-kutu-2d...,Not Premium,Not Tin
62,Halep Turkish Kadayif With Pistachio,L Box,40.22,https://online.hafizmustafa.com/Uploads/UrunRe...,/imagefolder/fistikli-halep-l-kutu--c66e-4.jpg,Not Premium,Not Tin
