# Web Scraping for Keyboards

## I will scrape the Skroutz.gr website. It is a Greek online shop, similar to Amazon. I have chosen this website because it's pretty.

### Import libraries for HTTP requests and BeautifulSoup

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
import io

### To scrape a page we need the URL of the page. In order to automate the scraping script I created a function that takes as arguments the website URL, the starting URL within the site and a file to write the results into.

In [2]:
def scrape_page(website_url, starting_url, file):
    """Scrape one keyboard listing page and recurse into the following pages.

    Fetches ``website_url + starting_url``, writes one CSV row per product
    card to ``file`` and follows the link in the paginator's last entry until
    that entry no longer contains a link.

    Args:
        website_url: Site prefix; concatenated with ``starting_url`` as-is.
        starting_url: Address of the page to scrape, relative to
            ``website_url``.
        file: Writable text file that receives the rows. It is closed once
            the last page has been written.

    Returns:
        0 from the call that handles the last page, otherwise None.
    """
    # Local import so the function only relies on names it brings in itself.
    import csv

    # Create a request to the starting url page. Save the response.
    # The context manager closes the connection even if reading fails.
    with urlopen(website_url + starting_url) as client:
        page_html = client.read()

    # Use BeautifulSoup to parse the html of the response.
    page_soup = soup(page_html, 'html.parser')

    # Get all the containers. Some have class='cf card' and some have
    # class='cf card with-skus-slider'.
    containers = (
        page_soup.findAll('li', {'class': 'cf card'})
        + page_soup.findAll('li', {'class': 'cf card with-skus-slider'})
    )

    # csv.writer quotes fields that contain a comma or a quote, so such a
    # product name can no longer shift the columns of its row.
    writer = csv.writer(file, lineterminator='\n')

    # For all containers get the text from the appropriate html components.
    for container in containers:
        price_box = container.findAll('div', {'class': 'price react-component'})[0]
        rating_link = container.findAll('a', {'class': 'rating stars'})[0]
        specs = container.findAll('p', {'class': 'specs'})[0]
        color_links = container.findAll('a', {'class': 'more-colors-available'})

        name = container.div.h2.a.text
        # Swap the decimal comma for a dot.
        price = price_box.div.a.text.replace(',', '.')
        stores = container.findAll('span', {'class': 'shop-count'})[0].text
        rating = rating_link.span.text.replace(',', '.')
        votes = rating_link.div.div.text
        # The first <b> of the specs paragraph is used as the type and the
        # last one as the connection.
        type_ = specs.b.text
        connection = specs.findAll('b')[-1].text
        colors = color_links[0].text if color_links else '0'

        # Write the row into the file that was passed in (not a global).
        writer.writerow([name, price, stores, rating, votes, type_, connection, colors])

    # To go to the next page of keyboards get the url of the next page from
    # the last paginator entry. If it holds no link there is no next page.
    next_page_arrow = page_soup.findAll('ol', {'class': 'react-component paginator cf'})[-1].findAll('li')[-1]
    if next_page_arrow.a is None:
        file.close()
        return 0

    # Recursively call the scraping function.
    next_url = next_page_arrow.a['href']
    scrape_page(website_url, next_url, file)

### Create a .csv file with a header of all the columns. Then call the scraping function for the starting URL.

In [3]:
filename = 'keyboards_raw.csv'
headers = 'Name,Price,Stores,Rating,Votes,Type,Connection,Colors\n'

# The context manager guarantees the file is closed even when the scrape
# fails half-way. scrape_page closing it itself after the last page is
# harmless, because closing an already closed file does nothing.
with io.open(filename, "w", encoding="utf-8") as f:
    f.write(headers)
    scrape_page('https://www.skroutz.gr/', 'c/68/keyboards.html', f)

### To make a dataset out of this I will clean the data a bit.

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load the scraped rows back from disk and preview the first rows.
keyboards = pd.read_csv('keyboards_raw.csv')
keyboards.head()

Unnamed: 0,Name,Price,Stores,Rating,Votes,Type,Connection,Colors
0,Philips K264,7.50 €,σε 41 καταστήματα,4.7,6.0,Keyboard Only,USB,0
1,Philips SPT6324 Black,9.99 €,σε 20 καταστήματα,4.1,25.0,Keyboard & Mouse Set,USB,0
2,Philips SPT6224 Black,8.93 €,σε 37 καταστήματα,4.7,3.0,Keyboard & Mouse Set,USB,0
3,Logitech MX Keys,106.44 €,σε 8 καταστήματα,4.5,8.0,Keyboard Only,Bluetooth / USB,0
4,Powertech PT-677,3.99 €,σε 68 καταστήματα,2.7,7.0,Keyboard Only,USB,0


### Turn the strings into numbers, extract information with regular expressions and format some values.

In [6]:
import re  # not needed by this cell any more; kept because the gaming clean-up cell uses it

# pd.to_numeric returns a new Series, so the result has to be assigned back
# for the conversion to have any effect on the frame.
keyboards['Colors'] = pd.to_numeric(keyboards['Colors'])
keyboards['Rating'] = pd.to_numeric(keyboards['Rating'], downcast='float')

# Products without votes come back as NaN; count them as 0 votes.
keyboards['Votes'] = keyboards['Votes'].fillna(0).astype('int')

# Keep only the part before the euro sign, e.g. '7.50 €' -> 7.5.
keyboards['Price'] = pd.to_numeric(
    keyboards['Price'].str.split('€').str[0].str.strip()
)

# Take the first run of digits, e.g. 'σε 41 καταστήματα' -> 41. Rows without
# any digits count as 0 stores, and the column ends up as integers throughout
# instead of a mix of digit strings and the integer 0.
keyboards['Stores'] = (
    pd.to_numeric(keyboards['Stores'].str.extract(r'(\d+)', expand=False))
    .fillna(0)
    .astype('int')
)

In [7]:
# List the distinct values found in the Type column.
keyboards['Type'].unique()

array(['Keyboard Only', 'Keyboard & Mouse Set', 'Keyboard with Touchpad',
       '0'], dtype=object)

In [8]:
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: that chained form is deprecated in pandas and does not
# update the frame once copy-on-write is active.
# The placeholder is matched both as the string '0' (the form unique() shows
# for this object column) and as the integer 0 the earlier code looked for.
keyboards['Type'] = (
    keyboards['Type']
    .replace('Keyboard & Mouse Set', 'Keyboard and Mouse Set')
    .replace(['0', 0], 'No Info')
)

### Save the clean data in a .csv file.

In [9]:
# Write the cleaned frame to keyboards.csv (index=False is not passed, so the
# row index is written as the first column).
keyboards.to_csv('keyboards.csv')

## Apparently gaming keyboards are in a different category. I will do the same thing for gaming keyboards.

In [10]:
def scrape_page_gaming(website_url, starting_url, file):
    """Scrape one gaming-keyboard listing page and recurse into the next ones.

    Fetches ``website_url + starting_url``, writes one CSV row per product
    card to ``file`` and follows the link in the paginator's last entry until
    that entry no longer contains a link.

    Args:
        website_url: Site prefix; concatenated with ``starting_url`` as-is.
        starting_url: Address of the page to scrape, relative to
            ``website_url``.
        file: Writable text file that receives the rows. It is closed once
            the last page has been written.

    Returns:
        0 from the call that handles the last page, otherwise None.
    """
    # Local import so the function only relies on names it brings in itself.
    import csv

    # The context manager closes the connection even if reading fails.
    with urlopen(website_url + starting_url) as client:
        page_html = client.read()

    page_soup = soup(page_html, 'html.parser')

    # Product cards come with two different class attributes.
    containers = (
        page_soup.findAll('li', {'class': 'cf card'})
        + page_soup.findAll('li', {'class': 'cf card with-skus-slider'})
    )

    # csv.writer quotes fields that contain a comma or a quote, so such a
    # product name can no longer shift the columns of its row.
    writer = csv.writer(file, lineterminator='\n')

    for container in containers:
        price_box = container.findAll('div', {'class': 'price react-component'})[0]
        rating_link = container.findAll('a', {'class': 'rating stars'})[0]
        spec_labels = container.findAll('p', {'class': 'specs'})[0].findAll('b')
        color_links = container.findAll('a', {'class': 'more-colors-available'})

        name = container.div.h2.a.text
        # Swap the decimal comma for a dot.
        price = price_box.div.a.text.replace(',', '.')
        # Here the store count is read from the last <span> of the price box.
        stores = price_box.div.findAll('span')[-1].text
        rating = rating_link.span.text.replace(',', '.')
        votes = rating_link.div.div.text
        # The last <b> of the specs paragraph is used as the switch type;
        # '-' marks cards that have none.
        switches = spec_labels[-1].text if spec_labels else '-'
        colors = color_links[0].text if color_links else '0'

        # Write the row into the file that was passed in (not a global).
        writer.writerow([name, price, stores, rating, votes, switches, colors])

    # The last paginator entry holds the link to the next page, if any.
    next_page_arrow = page_soup.findAll('ol', {'class': 'react-component paginator cf'})[-1].findAll('li')[-1]
    if next_page_arrow.a is None:
        file.close()
        return 0

    next_url = next_page_arrow.a['href']
    scrape_page_gaming(website_url, next_url, file)

In [11]:
filename = 'gaming_keyboards_raw.csv'
headers = 'Name,Price,Stores,Rating,Votes,Switches,Colors\n'

# The context manager guarantees the file is closed even when the scrape
# fails half-way. scrape_page_gaming closing it itself after the last page is
# harmless, because closing an already closed file does nothing.
with io.open(filename, "w", encoding="utf-8") as f:
    f.write(headers)
    scrape_page_gaming('https://www.skroutz.gr/', 'c/1866/Gaming_pliktrologia.html', f)

In [12]:
# Load the scraped gaming rows back from disk and preview the first rows.
gaming = pd.read_csv('gaming_keyboards_raw.csv')
gaming.head()

Unnamed: 0,Name,Price,Stores,Rating,Votes,Switches,Colors
0,Zeroground KB-2800G Satomi,53.99 €,σε 59 καταστήματα,4.6,27.0,Outemu Red,0
1,Razer Cynosa Lite Chroma,38.88 €,σε 40 καταστήματα,4.2,44.0,Μεμβράνης,0
2,Zeroground KB-2700G Sakimo,43.99 €,σε 64 καταστήματα,4.6,8.0,Outemu Blue,0
3,Zeroground KB-2400G Taigen v2.0,43.80 €,σε 69 καταστήματα,4.4,38.0,Custom Mechanical,0
4,Razer Blackwidow Chroma Elite (Razer Green),134.47 €,σε 35 καταστήματα,4.7,69.0,Razer Green,0


In [13]:
# pd.to_numeric returns a new Series, so the result has to be assigned back
# for the conversion to have any effect on the frame.
gaming['Colors'] = pd.to_numeric(gaming['Colors'])
gaming['Rating'] = pd.to_numeric(gaming['Rating'], downcast='float')

# Products without votes come back as NaN; count them as 0 votes.
gaming['Votes'] = gaming['Votes'].fillna(0).astype('int')

# Keep only the part before the euro sign, e.g. '53.99 €' -> 53.99.
gaming['Price'] = pd.to_numeric(
    gaming['Price'].str.split('€').str[0].str.strip()
)

# Take the first run of digits, e.g. 'σε 59 καταστήματα' -> 59. Rows without
# any digits count as 0 stores, and the column ends up as integers throughout
# instead of a mix of digit strings and the integer 0.
gaming['Stores'] = (
    pd.to_numeric(gaming['Stores'].str.extract(r'(\d+)', expand=False))
    .fillna(0)
    .astype('int')
)

In [14]:
# List the distinct values found in the Switches column.
gaming['Switches'].unique()

array(['Outemu Red', 'Μεμβράνης', 'Outemu Blue', 'Custom Mechanical',
       'Razer Green', 'Custom Blue', 'Razer Linear', 'Cherry MX Red',
       'Φωτιζόμενα Πλήκτρα', 'Razer Yellow', 'HyperX Red', 'Custom Brown',
       'Outemu Black', 'Razer Clicky', 'GX Blue', 'Kailh Red',
       'Kailh Speed Silver', 'Xinda Blue', 'Cherry MX Brown',
       'Romer-G Tactile', 'Razer Orange', 'Gateron Brown',
       'SteelSeries OmniPoint', 'Outemu Brown', 'Kailh Blue',
       'Kailh Brown', 'SteelSeries Hybrid Mechanical', 'GTX Red',
       'LK Libra Brown', 'Gateron Red', 'Cherry MX Red Low Profile',
       'Cherry MX Speed', 'Kailh Box White', 'Razer Analog Optical',
       'Titan Tactile', 'Cherry MX Blue', 'HyperX Aqua', 'Custom Silver',
       'LK Blue', '-', 'Σετ Πληκτρολόγιο & Ποντίκι', 'GL Clicky',
       'LK Libra Orange', 'Press', 'GL Linear', 'Kailh Short',
       'Full Size', 'Gamdias', 'Gateron Blue', 'Custom Red',
       'Cherry MX White', 'SteelSeries Red', 'GL Tactile',
       'Cher

In [15]:
# Translate the Greek labels and the '-' placeholder in one pass. The result
# is assigned back instead of calling replace(..., inplace=True) on the column
# selection: that chained form is deprecated in pandas and does not update the
# frame once copy-on-write is active.
gaming['Switches'] = gaming['Switches'].replace({
    'Μεμβράνης': 'Membrane',
    'Φωτιζόμενα Πλήκτρα': 'Key Lights',
    'Σετ Πληκτρολόγιο & Ποντίκι': 'Keyboard and Mouse Set',
    '-': 'No Info',
})

In [16]:
# Write the cleaned gaming frame to gaming_keyboards.csv (index=False is not
# passed, so the row index is written as the first column).
gaming.to_csv('gaming_keyboards.csv')

### Now I'm going to merge the two dataframes.

In [17]:
# Stack the two frames row-wise and renumber the rows. Columns that exist in
# only one frame (Type/Connection vs. Switches) are left empty for the rows of
# the other one.
all_keyboards = pd.concat([keyboards, gaming], ignore_index=True)

In [18]:
# Preview the first rows of the merged frame.
all_keyboards.head()

Unnamed: 0,Name,Price,Stores,Rating,Votes,Type,Connection,Colors,Switches
0,Philips K264,7.5,41,4.7,6,Keyboard Only,USB,0,
1,Philips SPT6324 Black,9.99,20,4.1,25,Keyboard and Mouse Set,USB,0,
2,Philips SPT6224 Black,8.93,37,4.7,3,Keyboard and Mouse Set,USB,0,
3,Logitech MX Keys,106.44,8,4.5,8,Keyboard Only,Bluetooth / USB,0,
4,Powertech PT-677,3.99,68,2.7,7,Keyboard Only,USB,0,


### Save the dataframe into a .csv file.

In [19]:
# Write the merged frame to all_keyboards.csv (index=False is not passed, so
# the row index is written as the first column).
all_keyboards.to_csv('all_keyboards.csv')