##### Accessing the Digital Store web page

In [1]:
import numpy as np
import pandas as pd
import requests

In [2]:
from bs4 import BeautifulSoup

In [3]:
# Landing page of the store; the category tiles scraped below come from this page.
digital_store_url = 'https://www.digitalstore.co.ke/'

In [4]:
# Download the landing page. The timeout (seconds) stops a stalled connection
# from blocking the kernel forever; requests applies no timeout by default.
site_response = requests.get(digital_store_url, timeout=30)

In [5]:
# HTTP status of the download; 200 means the page was retrieved successfully.
site_response.status_code

200

In [6]:
# Body of the response decoded to text; this is what gets parsed next.
site_contents = site_response.text

In [7]:
# Parse the page text with the 'html.parser' backend so tags can be searched.
site_doc = BeautifulSoup(site_contents, 'html.parser')

##### Let's fetch the category names

In [8]:
# Category titles live in <span> tags carrying exactly this class string.
span_tags = site_doc.find_all('span', attrs={'class': "collection-item__title text--strong"})

In [9]:
# More descriptive alias; both names refer to the same result list.
category_tags = span_tags

In [10]:
# Spot-check one entry; the raw text still carries trailing whitespace and newlines.
category_tags[3].text

'Photography \n\n'

##### Let's fetch the category links

In [11]:
# Values (0 through 10) of the 'data-collection-index' attribute carried by the category <a> tags.
collection_index = tuple(range(11))
link_tags = site_doc.find_all('a', attrs={'data-collection-index': collection_index})

In [12]:
# Spot-check one link; the href is a site-relative path (no scheme or host).
link_tags[0]['href']

'/collections/phones-tablets'

##### List of category titles

In [13]:
# Clean titles: each tag's text with the surrounding whitespace/newlines removed.
category_titles = [title_tag.text.strip() for title_tag in category_tags]
print(category_titles)

['Smartphones', 'Computers', 'Computer Accessories & Components', 'Photography', 'Printers & Inks', 'Home Entertainment', 'Networking & Telecommunications', 'Electricals & Appliances', 'Conferencing Equipment', 'Software', "Collection's name"]


##### List of category links

In [14]:
base_link = 'https://www.digitalstore.co.ke'
# The hrefs are site-relative, so prefix each one with the site root to get an absolute URL.
category_links = [base_link + link_tag['href'] for link_tag in link_tags]
print(category_links)

['https://www.digitalstore.co.ke/collections/phones-tablets', 'https://www.digitalstore.co.ke/collections/desktops-monitors', 'https://www.digitalstore.co.ke/collections/computer-accessories-components', 'https://www.digitalstore.co.ke/collections/cameras', 'https://www.digitalstore.co.ke/collections/https-digital-store-kenya-myshopify-com-admin-products-query-printer', 'https://www.digitalstore.co.ke/collections/tvs', 'https://www.digitalstore.co.ke/collections/networking', 'https://www.digitalstore.co.ke/collections/electricals', 'https://www.digitalstore.co.ke/collections/conferencing-equipments', 'https://www.digitalstore.co.ke/collections/software', 'https://www.digitalstore.co.ke#']


##### Let's create a DataFrame

In [15]:
# Column name -> column values; the two lists are paired by position.
categories_dict = dict(name=category_titles, link=category_links)

In [16]:
categories_df = pd.DataFrame(categories_dict)
categories_df.head(2)  # Preview the first two rows of categories_df.

Unnamed: 0,name,link
0,Smartphones,https://www.digitalstore.co.ke/collections/pho...
1,Computers,https://www.digitalstore.co.ke/collections/des...


##### Create a csv from the dataframe

In [17]:
# index=False keeps the DataFrame's row index out of the file, so reading it
# back with read_csv yields only the 'name' and 'link' columns.
categories_df.to_csv('categories_table', index=False)

##### Accessing the categories table

In [18]:
# Load the categories back from the 'categories_table' file written above.
category_urls = pd.read_csv('categories_table')
category_urls

Unnamed: 0,name,link
0,Smartphones,https://www.digitalstore.co.ke/collections/pho...
1,Computers,https://www.digitalstore.co.ke/collections/des...
2,Computer Accessories & Components,https://www.digitalstore.co.ke/collections/com...
3,Photography,https://www.digitalstore.co.ke/collections/cam...
4,Printers & Inks,https://www.digitalstore.co.ke/collections/htt...
5,Home Entertainment,https://www.digitalstore.co.ke/collections/tvs
6,Networking & Telecommunications,https://www.digitalstore.co.ke/collections/net...
7,Electricals & Appliances,https://www.digitalstore.co.ke/collections/ele...
8,Conferencing Equipment,https://www.digitalstore.co.ke/collections/con...
9,Software,https://www.digitalstore.co.ke/collections/sof...


#### Now let's fetch data from the first category (Smartphones)

In [19]:
# Rebuild category_links from the saved table; this rebinds the name used earlier for the scraped list.
category_links = category_urls['link'].tolist()  # Column -> plain Python list, so entries can be picked by position.
category_links[:3]

['https://www.digitalstore.co.ke/collections/phones-tablets',
 'https://www.digitalstore.co.ke/collections/desktops-monitors',
 'https://www.digitalstore.co.ke/collections/computer-accessories-components']

We will develop a function named `smartphone_csv` that takes a category link and the number of pages in that category as input parameters. It collects the vendor name, the product name and description, and the product price from every page, saves them to a table on disk, and returns the same data as a DataFrame.

In [None]:
def _category_page_urls(url, pages):
    """Return ``(page_number, page_url)`` pairs for every page of a category."""
    # The notebook's usage notes pass None / 'None' for single-page categories,
    # so treat those (and an empty string) as "one page" instead of crashing
    # on the str/int comparison.
    if pages is None or (isinstance(pages, str) and pages.strip().lower() in ('', 'none')):
        page_count = 1
    else:
        page_count = int(pages)

    if page_count > 1:
        return [(number, url + '?page=' + str(number)) for number in range(1, page_count + 1)]
    # A single-page category is fetched from the bare category URL.
    return [(1, url)]


def smartphone_csv(url, pages, output_path='smartphones_table'):
    """Scrape vendor, product name and price from every page of a store category.

    Args:
        url: Link of the category to scrape (for example ``category_links[0]``).
        pages: Number of pages in the category. ``None``, ``'None'``, ``1`` or
            any value below 2 means the category has a single page.
        output_path: File the resulting table is written to as CSV. Defaults
            to ``'smartphones_table'``.

    Returns:
        pandas.DataFrame with the columns ``'vendor'``,
        ``'product name and description'`` and ``'price'``, one row per
        product found across all pages.

    Raises:
        ValueError: If ``pages`` cannot be converted to an integer, or if the
            three collected lists end up with different lengths (raised by
            ``pd.DataFrame``).
        requests.exceptions.RequestException: If a page cannot be downloaded
            within the timeout.
    """
    # Lists to store the collected data from all pages:
    final_vendors_names = []
    final_products_names = []
    final_products_prices = []

    # Iterating through all the pages in the category to get the page contents:
    for page_number, page_url in _category_page_urls(url, pages):
        print('Page ' + str(page_number))           # Tells us which page is being processed.
        # The timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(page_url, timeout=30)
        print(response.status_code)                 # 200 means the page was retrieved successfully.

        doc = BeautifulSoup(response.text, 'html.parser')

        # Vendor names:
        vendors_names = [
            vendor_tag.text
            for vendor_tag in doc.find_all('a', class_="product-item__vendor link")
        ]

        # Product names and descriptions:
        products_names = [
            name_tag.text
            for name_tag in doc.find_all('a', class_="product-item__title text--strong link")
        ]

        # Product prices; .strip() removes the leading and trailing whitespace around the text.
        # NOTE(review): only spans with the exact class "price price--highlight" are matched;
        # confirm that every product on the page carries it.
        products_price = [
            price_tag.text.strip()
            for price_tag in doc.find_all('span', {'class': "price price--highlight"})
        ]

        # Combine this page's results with those from the previous pages.
        final_vendors_names.extend(vendors_names)
        final_products_prices.extend(products_price)
        final_products_names.extend(products_names)

        # Equal lengths are required to build the table; a mismatch points to
        # a missing value on this page.
        if len(vendors_names) == len(products_price) == len(products_names):
            message_1 = 'The lists in this page have equal lengths. Proceed to create a table!!!!'
        else:
            # Report the page that is actually being processed.
            message_1 = 'Oops, check the length of the elements in page ' + str(page_number) + '!'
        print(message_1)
        print("\n")

    # Printing the combined lists from all pages:
    print("\n", 'Printing the final vendors list!!')
    print("\n", final_vendors_names)
    print("\n", 'Printing the final products list!!')
    print("\n", final_products_names)
    print("\n", 'Printing the final prices list!!')
    print("\n", final_products_prices)

    # Creating a dictionary containing all the final lists (only the announcement is printed):
    print("\n", 'Printing the final dictionary!!')
    smartphones_dict = {
        'vendor': final_vendors_names,
        'product name and description': final_products_names,
        'price': final_products_prices
    }
    smartphones_df = pd.DataFrame(smartphones_dict)          # Converts the dictionary into a DataFrame.

    # Creating the final table; index=False keeps the row index out of the file.
    smartphones_df.to_csv(output_path, index=False)

    return smartphones_df               # Returns the data scraped from the category.

In [21]:
# Scrape 13 pages of the first category link; the returned DataFrame is displayed and the 'smartphones_table' file is written as a side effect.
smartphone_csv(category_links[0], 13)

Page 1
200
The lists in this page have equal lengths. Proceed to create a table!!!!


Page 2
200
The lists in this page have equal lengths. Proceed to create a table!!!!


Page 3
200
The lists in this page have equal lengths. Proceed to create a table!!!!


Page 4
200
The lists in this page have equal lengths. Proceed to create a table!!!!


Page 5
200
The lists in this page have equal lengths. Proceed to create a table!!!!


Page 6
200
The lists in this page have equal lengths. Proceed to create a table!!!!


Page 7
200
The lists in this page have equal lengths. Proceed to create a table!!!!


Page 8
200
The lists in this page have equal lengths. Proceed to create a table!!!!


Page 9
200
The lists in this page have equal lengths. Proceed to create a table!!!!


Page 10
200
The lists in this page have equal lengths. Proceed to create a table!!!!


Page 11
200
The lists in this page have equal lengths. Proceed to create a table!!!!


Page 12
200
The lists in this page have equal length

Unnamed: 0,vendor,product name and description,price
0,Apple,"Apple iPhone 14 Pro Max Smartphone - 6GB RAM, ...","Sale priceKSh189,999.00"
1,BNSL,"BSNL Penta P40 Pro Tablet - Dual SIM, 4GB RAM ...","Sale priceKSh12,999.00"
2,Xiaomi,Xiaomi Redmi Note 11 Smart Phone 4GB 128GB 6.4...,"Sale priceKSh20,499.00"
3,Tecno,Tecno spark 3 pro KB8 Smartphone- 2GB RAM + 32...,"Sale priceKSh11,499.00"
4,Tecno,"Tecno Spark 7 Pro Smartphone 4GB RAM, 128 GB S...","Sale priceKSh14,499.00"
...,...,...,...
291,Samsung,Samsung Galaxy A30s Smartphone-64GB ROM + 4GB ...,"Sale priceKSh23,999.00"
292,Samsung,"SAMSUNG GALAXY A20s Smartphone- 32GB + 3GB, 4G...","Sale priceKSh17,499.00"
293,Huawei,"Huawei CP84 Type C 40W Charger, Fast charging","Sale priceKSh2,599.00"
294,Huawei,"Huawei MediaPad T3 10 Tablet: 9.6"" inch","Sale priceKSh22,499.00"


The function call above can receive any category link from the `category_links` list by indexing it (`[ int ]`).
When a category only has one page, you should use 'None' as the number of pages, for example:

smartphone_csv(category_links[ int ], 'None')

#### Accessing the smartphones_table

In [22]:
# Load the scraped products back from the 'smartphones_table' file written by smartphone_csv.
smartphones_df = pd.read_csv('smartphones_table')
smartphones_df

Unnamed: 0,vendor,product name and description,price
0,Apple,"Apple iPhone 14 Pro Max Smartphone - 6GB RAM, ...","Sale priceKSh189,999.00"
1,BNSL,"BSNL Penta P40 Pro Tablet - Dual SIM, 4GB RAM ...","Sale priceKSh12,999.00"
2,Xiaomi,Xiaomi Redmi Note 11 Smart Phone 4GB 128GB 6.4...,"Sale priceKSh20,499.00"
3,Tecno,Tecno spark 3 pro KB8 Smartphone- 2GB RAM + 32...,"Sale priceKSh11,499.00"
4,Tecno,"Tecno Spark 7 Pro Smartphone 4GB RAM, 128 GB S...","Sale priceKSh14,499.00"
...,...,...,...
291,Samsung,Samsung Galaxy A30s Smartphone-64GB ROM + 4GB ...,"Sale priceKSh23,999.00"
292,Samsung,"SAMSUNG GALAXY A20s Smartphone- 32GB + 3GB, 4G...","Sale priceKSh17,499.00"
293,Huawei,"Huawei CP84 Type C 40W Charger, Fast charging","Sale priceKSh2,599.00"
294,Huawei,"Huawei MediaPad T3 10 Tablet: 9.6"" inch","Sale priceKSh22,499.00"
