# Manipulate the scraped data

## Loading part

In [1]:
# Import relevant libraries
import pandas as pd
import requests
import shutil

from IPython.display import display, HTML
display(HTML("<style>.container { width:78% !important; }</style>"))

In [2]:
# Load scraped data
df = pd.read_csv('baklava_scrape.csv')
df.head()

Unnamed: 0,baklava_name,baklava_size,baklava_price,baklava_image_url
0,Pistachio-Walnut Baklava Assortment,XL Metal Box),"\n€51,71\n",https://online.hafizmustafa.com/Uploads/UrunRe...
1,Premium Baklava Assortment,L Metal Box ),"\n€45,96\n",https://online.hafizmustafa.com/Uploads/UrunRe...
2,Pistachio-Walnut Baklava Assortment,L Metal Box),"\n€42,52\n",https://online.hafizmustafa.com/Uploads/UrunRe...
3,Pistachio Baklava Assortment,XL Metal Box),"\n€57,45\n",https://online.hafizmustafa.com/Uploads/UrunRe...
4,Pistachio-Walnut Baklava Assortment,S Metal Box),"\n€21,83\n",https://online.hafizmustafa.com/Uploads/UrunRe...


## Convert from url to jpg image on the repository

In [3]:
# Convert from url to jpg image on computer
for url in df['baklava_image_url']:
    img_data = requests.get(url).content
    image_name = url.split("thumb/")[1]
    with open(image_name, 'wb') as handler:
        handler.write(img_data)
        current_dir = image_name
        #move_to = f'/Users/oguzhangul/Desktop/image_folder/{image_name}'
        move_to = f'./imagefolder/{image_name}'
        shutil.move(current_dir,move_to)
    

## Add another feature containing the path to images on repository

In [4]:
# Add all path for the image on the repository
base = '/imagefolder/'
baklava_path = []
for url_jpg in df['baklava_image_url']:
    baklava_path.append(base + url_jpg.split('thumb/')[1])

df['baklava_image_path'] = baklava_path

In [5]:
# See the new dataframe
df.head(2)

Unnamed: 0,baklava_name,baklava_size,baklava_price,baklava_image_url,baklava_image_path
0,Pistachio-Walnut Baklava Assortment,XL Metal Box),"\n€51,71\n",https://online.hafizmustafa.com/Uploads/UrunRe...,/imagefolder/fistikli-cevizli-karisik-baklava-...
1,Premium Baklava Assortment,L Metal Box ),"\n€45,96\n",https://online.hafizmustafa.com/Uploads/UrunRe...,/imagefolder/premium-karisik-baklava-l-kutu-47...


## Cleaning and adding relevant features

### Adding premium feature

In [8]:
df

Unnamed: 0,baklava_name,baklava_size,baklava_price,baklava_image_url,baklava_image_path
0,Pistachio-Walnut Baklava Assortment,XL Metal Box),"\n€51,71\n",https://online.hafizmustafa.com/Uploads/UrunRe...,/imagefolder/fistikli-cevizli-karisik-baklava-...
1,Premium Baklava Assortment,L Metal Box ),"\n€45,96\n",https://online.hafizmustafa.com/Uploads/UrunRe...,/imagefolder/premium-karisik-baklava-l-kutu-47...
2,Pistachio-Walnut Baklava Assortment,L Metal Box),"\n€42,52\n",https://online.hafizmustafa.com/Uploads/UrunRe...,/imagefolder/fistikli-cevizli-karisik-baklava-...
3,Pistachio Baklava Assortment,XL Metal Box),"\n€57,45\n",https://online.hafizmustafa.com/Uploads/UrunRe...,/imagefolder/fistikli-karisik-baklava-xl-kutu-...
4,Pistachio-Walnut Baklava Assortment,S Metal Box),"\n€21,83\n",https://online.hafizmustafa.com/Uploads/UrunRe...,/imagefolder/fistikli-cevizli-karisik-baklava-...
...,...,...,...,...,...
59,Mixed Ottoman Kadayif,500 Gr),"\n€14,94\n",https://online.hafizmustafa.com/Uploads/UrunRe...,/imagefolder/karisik-osmanli-kadayifi-500-gr-9...
60,Fıstıklı Osmanlı Kadayıfı,S Box),"\n€32,17\n",https://online.hafizmustafa.com/Uploads/UrunRe...,/imagefolder/fistikli-osmanli-kadayifi-s-kutu-...
61,Pistachio Halep Kadayif,1 Kg),"\n€22,98\n",https://online.hafizmustafa.com/Uploads/UrunRe...,/imagefolder/fistikli-halep-kadayifi-s-kutu-2d...
62,Halep Turkish Kadayif With Pistachio,L Box),"\n€40,22\n",https://online.hafizmustafa.com/Uploads/UrunRe...,/imagefolder/fistikli-halep-l-kutu--c66e-4.jpg


In [7]:
# Checkpoint for not restarting entire notebook
df_clean = df.copy()

# Some of the baklava are Premium. Those can have their own feature called premium 
indexes_premium = df_clean[df_clean['baklava_name'].str.contains('Premium')].index
indexes_not_premium = df_clean[~df_clean['baklava_name'].str.contains('Premium')].index

#Adding the new feauture with Premium/ Not Premium
df_clean.loc[indexes_premium,'premium'] = 'Premium'
df_clean.loc[indexes_not_premium,'premium'] = 'Not Premium'

#Deleting all the places where Premium appears in name
df_clean["baklava_name"] = df_clean["baklava_name"].str.replace("Premium ", "")

### Cleaning baklava_size feature and handling missing values

In [10]:
# We can see many ")" appears in this feature which easily can be 
# removed by following line of code 
df_clean['baklava_size'] = df_clean['baklava_size'].str.replace(')','')

# Here we would like see the observations nan appears 
df_clean[df_clean['baklava_size'].isna()]

# We actually see that the size appears in the name
df_clean.loc[df_clean['baklava_name'] == 'Ankara Walnut Baklava S Box','baklava_size'] = 'S Box'
df_clean.loc[df_clean['baklava_name'] == 'Baklava, Halep Kadayif with Pistachio L Box','baklava_size'] = 'L Box'

# Now we can remove the size from its name
df_clean['baklava_name'] = df_clean['baklava_name'].str.replace('S Box','')
df_clean['baklava_name'] = df_clean['baklava_name'].str.replace('L Box','')


  df_clean['baklava_size'] = df_clean['baklava_size'].str.replace(')','')


### Adding Tin feature

In [48]:
#Checkpoint
df1 = df_clean.copy()

tin_index_in_name = df1[df1['baklava_name'] == 'HM1864 Mixed Special Metal Tin Box '].index
tin_index_in_size = df1[df1['baklava_size'].str.contains('Tin')].index
all_tin_indices = tin_index_in_name.union(tin_index_in_size)

all_index = df1.index
no_tin_indeces = all_index.difference(all_tin_indices)



Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  9, 12, 13, 14, 15, 16, 17, 18, 19,
            20, 21, 22, 23, 24, 25, 26, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38,
            39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
            56, 57, 58, 59, 60, 61, 62, 63],
           dtype='int64')

In [27]:
#Checkpoint
df1 = df_clean.copy()

# It appears to the packages are made of Metal and a few are made of TIN
# This could also get it own feature 
df1

tin_index_in_name = df1[df1['baklava_name'] == 'HM1864 Mixed Special Metal Tin Box '].index
print(tin_index_in_name)
tin_index_in_size = df1[df1['baklava_size'].str.contains('Tin')].index
print(tin_index_in_size)
all_tin_indices = tin_index_in_name.union(tin_index_in_size)
print(all_tin_indices)

#NOT tin
not_tin_index_in_name = df1[~df1['baklava_name'].str.contains('HM1864 Mixed Special Metal Tin Box ')].index
print(not_tin_index_in_name)
not_tin_index_in_size = df1[~df1['baklava_size'].str.contains('Tin')].index
print(not_tin_index_in_size)
not_all_tin_indices = not_tin_index_in_name.union(not_tin_index_in_size)
print(not_all_tin_indices)

#We have found the indices above, and we are creating the feature here
df1.loc[all_tin_indices,'tin'] = 'Tin'
df1.loc[not_all_tin_indices,'tin'] = 'Not Tin'

# Now we would like to remove Tin from all places
df1['baklava_name'] = df1['baklava_name'].str.replace(' Metal Tin Box ','')


Int64Index([27], dtype='int64')
Int64Index([8, 10, 11, 29], dtype='int64')
Int64Index([8, 10, 11, 27, 29], dtype='int64')
Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34,
            35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
            52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63],
           dtype='int64')
Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  9, 12, 13, 14, 15, 16, 17, 18, 19,
            20, 21, 22, 23, 24, 25, 26, 27, 28, 30, 31, 32, 33, 34, 35, 36, 37,
            38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
            55, 56, 57, 58, 59, 60, 61, 62, 63],
           dtype='int64')
Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
            34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 

In [None]:
df1['tin'].value_counts()

In [None]:
df[df['baklava_name'] == 'HM1864 Mixed Special']

In [None]:
df['baklava_size'][:40]

### baklava_price