In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from user_agent import generate_user_agent

import re
import pandas as pd
import time 
from typing import *
from tqdm import tqdm
import os
import datetime

In [None]:
class GSMARENAScraper:
    '''
    Selenium-based scraper that collects phone specifications from gsmarena.com.

    Creating an instance launches Chrome, loads the makers page and stores the
    brand index in ``brandINFO``. Scraped phones accumulate in ``dataset``.
    The instance can be used as a context manager so the browser is closed
    when the work is finished.
    '''

    # Column order of ``dataset`` and of every CSV file written by the scraper.
    COLUMNS = ("manufacturer", "phonename", "releasedate", "os", "batsize", "battype",
               "scrsize", "scrtype", "nettech", "chipset", "cpu", "gpu", "internal",
               "maincammodule", "maincamvid", "selfcammodule", "selfcamvid", "price")

    # Placeholder stored when a specification cannot be read from the page.
    MISSING = "Na"

    # (dataset column, XPath relative to the phone specification box).
    _SPEC_XPATHS = (
        ("phonename", './/h1[@class="specs-phone-name-title"]'),
        ("releasedate", './/span[@data-spec="released-hl"]'),
        ("os", './/span[@data-spec="os-hl"]'),
        # battery
        ("batsize", './/span[@data-spec="batsize-hl"]'),
        ("battype", './/div[@data-spec="battype-hl"]'),
        # screen
        ("scrsize", './/div[@data-spec="displayres-hl"]'),
        ("scrtype", './/td[@data-spec="displaytype"]'),
        ("nettech", './/a[@data-spec="nettech"]'),
        # platform
        ("chipset", './/td[@data-spec="chipset"]'),
        ("cpu", './/td[@data-spec="cpu"]'),
        ("gpu", './/td[@data-spec="gpu"]'),
        ("internal", './/td[@data-spec="internalmemory"]'),
        # main camera
        ("maincammodule", './/td[@data-spec="cam1modules"]'),
        ("maincamvid", './/td[@data-spec="cam1video"]'),
        # selfie camera
        ("selfcammodule", './/td[@data-spec="cam2modules"]'),
        ("selfcamvid", './/td[@data-spec="cam2video"]'),
        # price
        ("price", './/td[@data-spec="price"]'),
    )

    def __init__(self, RATE_LIMIT:int = 20, autosave: bool = False, save_interval:int =20):
        '''
        Initializes the GSMARENAScraper with the specified rate limit, autosave option, and save interval.

        PARAMS
        -----
        RATE_LIMIT: int
            Number of seconds to sleep before each phone page is requested.
        autosave: bool
            Whether to automatically save the dataset at regular intervals.
        save_interval: int
            The number of phone pages to process between two autosaves.

        RAISES
        -----
        ValueError
            If autosave is enabled and save_interval is missing or not positive.
        '''
        self.dataset = pd.DataFrame({column: [] for column in self.COLUMNS})
        self.rate_limit = RATE_LIMIT
        # Overwritten by brand_scrape(); the default keeps getphonespec() usable on its own.
        self.brandName = self.MISSING

        # autosaver
        if autosave and not save_interval:
            raise ValueError("If autosave is enabled, save_interval must be specified")
        if autosave and save_interval <= 0:
            raise ValueError("If autosave is enabled, save_interval must be greater than 0")
        self.autosave_check = autosave
        self.save_interval = save_interval
        self.index = 0
        self.timestart = datetime.datetime.now().strftime("%Y-%m-%d-%H%M")

        # initializes useragent
        user_agent = generate_user_agent(device_type="desktop", os="win", navigator="chrome", platform="win")
        options = Options()
        options.add_argument(f'user-agent={user_agent}')
        # The options must be handed to the driver, otherwise the user agent is never applied.
        self.driver = webdriver.Chrome(options=options)
        try:
            self.driver.get("https://www.gsmarena.com/makers.php3")
            self.brandINFO = self._read_brand_index()
        except BaseException:
            # Do not leave an orphaned browser behind when start-up fails.
            self.driver.quit()
            raise

    def __enter__(self):
        '''Returns the scraper itself so it can be used in a ``with`` statement.'''
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        '''Closes the browser when the ``with`` block ends; exceptions are not suppressed.'''
        self.close()

    def close(self):
        '''
        Quits the browser started by the constructor, if there is one.
        '''
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.quit()

    def _read_brand_index(self):
        '''
        Reads brand name, device count and brand URL from the makers page currently loaded.
        '''
        brand_rows = []
        for cell in self.driver.find_elements(By.XPATH, './/div[@class="st-text"]//tbody/tr//td'):
            anchor = cell.find_element(By.XPATH, './/a')
            brand_link = anchor.get_attribute('href')
            # The anchor text holds the brand name and the device count on separate lines.
            brandName, tot_devices = anchor.text.split("\n")
            brand_rows.append([brandName, int(re.sub(r'\D', '', tot_devices)), brand_link])
        return pd.DataFrame(brand_rows, columns=["manufacturer", "total_devices", "link"])

    def _spec_text(self, container, xpath: str) -> str:
        '''
        Returns the text of the element matching xpath inside container, or MISSING when the lookup fails.
        '''
        try:
            return container.find_element(By.XPATH, xpath).text
        except Exception:
            # Best effort: a missing spec must not abort the scrape, but unlike a
            # bare ``except`` this still lets KeyboardInterrupt through.
            return self.MISSING

    @staticmethod
    def _brand_from_heading(heading: str, fallback: str) -> str:
        '''
        Extracts the brand name from a brand-page heading, returning fallback when nothing usable is found.
        '''
        # assumes the heading reads "<Brand> phones" — TODO confirm against the live page
        words = heading.split(" ")
        if "phones" in words:
            # Keep every word before "phones" so multi-word brands are not truncated.
            name = " ".join(words[:words.index("phones")])
        else:
            name = words[0]
        return name or fallback

    def autosave(self):
        '''
        Saves the dataset to TEMP/<start time>/<index>.csv whenever index is a multiple of save_interval.
        '''
        if self.index % self.save_interval != 0:
            return
        folder = f"TEMP/{self.timestart}"
        os.makedirs(folder, exist_ok=True)
        self.dataset.to_csv(f"{folder}/{self.index}.csv", index=False)

    def scrape_content(self):
        '''
        Scrapes the URLs from whatever page you're showing.
        then scrapes the content from those URLs.
        '''
        listing = self.driver.find_elements(By.XPATH, './/div[@id="review-body"]/div[@class="makers"]/ul/li')
        # Collect every URL first: navigating away invalidates the listing elements.
        content_URLs = [item.find_element(By.XPATH, './/a').get_attribute('href') for item in listing]
        for url in tqdm(content_URLs, desc="Scraping phone"):
            time.sleep(self.rate_limit)
            self.driver.get(url)
            self.getphonespec()

            if self.autosave_check:
                self.index += 1
                self.autosave()

    def getphonespec(self):
        '''
        Scrapes the specifications of a phone from its detail page and appends them to the dataset.

        Specs that cannot be found are stored as MISSING. The lookup of the
        specification box itself is not guarded, so its exception propagates.
        '''
        phone_spec_box = self.driver.find_element(By.XPATH, './/div[@id="body"]/div[1]')  # Get phone specifications box

        row = {"manufacturer": self.brandName}
        for column, xpath in self._SPEC_XPATHS:
            row[column] = self._spec_text(phone_spec_box, xpath)

        # Remove the fixed wording only. str.strip() would treat the argument as a
        # character set and could eat legitimate leading or trailing characters.
        row["releasedate"] = row["releasedate"].removeprefix("Released ")
        row["scrsize"] = row["scrsize"].removesuffix(" pixels")
        row["price"] = row["price"].removeprefix("About ")

        self.dataset = pd.concat([self.dataset, pd.DataFrame([row], columns=list(self.COLUMNS))],
                                 ignore_index=True)

    def brand_scrape(self, brandName):
        '''
        Scrapes all phone models for a given brand and writes them to OUTPUT/<brandName>.csv.

        PARAMS
        -----
        brandName: str
            A value from brandINFO["manufacturer"].

        RAISES
        -----
        IndexError
            If brandName is not listed in brandINFO.
        '''
        links = self.brandINFO.loc[self.brandINFO["manufacturer"] == brandName, "link"]
        if links.empty:
            raise IndexError(f"brand {brandName!r} is not listed in brandINFO")
        URL = links.values[0]
        # Rows appended from here on belong to this brand only.
        first_row = len(self.dataset)
        self.driver.get(URL)

        try:
            # This checks if the page navigation element is present. If there's one page on the brand.
            page_nav = self.driver.find_element(By.XPATH, './/div[@class="review-nav-v2"]//div[@class="nav-pages"]')
            temp = re.findall(r'\d+', page_nav.text)
            current_index, last_index = int(temp[0]), int(temp[-1])

            page_urlStructure = page_nav.find_elements(By.XPATH, './/a')[-1].get_attribute('href')
            PAGE_URLS = [re.sub(r'p\d+', f'p{i}', page_urlStructure)
                         for i in range(current_index + 1, last_index + 1)]
        except Exception:
            # No usable pagination block: treat the brand as a single page.
            PAGE_URLS = []

        try:
            heading = self.driver.find_element(By.XPATH, './/h1[@class="article-info-name"]').text
        except Exception:
            heading = ""
        self.brandName = self._brand_from_heading(heading, brandName)

        total_pages = len(PAGE_URLS) + 1
        print(f"PAGE 1/{total_pages} for brand {brandName}")
        self.scrape_content()
        for page_number, page_url in enumerate(PAGE_URLS, start=2):
            print(f"PAGE {page_number}/{total_pages} for brand {brandName}")
            self.driver.get(page_url)
            self.scrape_content()

        os.makedirs("OUTPUT", exist_ok=True)
        # Save only this brand's rows; the dataset also holds brands scraped earlier.
        self.dataset.iloc[first_row:].to_csv(f"OUTPUT/{brandName}.csv", index=False)

    def scrapeALL(self):
        '''
        Scrapes all phone models for all brands and writes the combined dataset to OUTPUT/!GSMARENA-DATASET.csv.
        '''
        for brand in tqdm(self.brandINFO["manufacturer"], desc="Scraping all brands"):
            self.brand_scrape(brand)
        # The directory is normally created by brand_scrape, but not when no brand was listed.
        os.makedirs("OUTPUT", exist_ok=True)
        self.dataset.to_csv(f"OUTPUT/!GSMARENA-DATASET.csv", index=False)


In [34]:
# Launches Chrome and loads the GSMArena makers page to build session.brandINFO.
session = GSMARENAScraper()

  return generate_navigator(os=os, navigator=navigator,


In [35]:
# Brand names read from the makers page; brand_scrape() looks its argument up in this column.
session.brandINFO["manufacturer"].values

array(['ACER', 'ALCATEL', 'ALLVIEW', 'AMAZON', 'AMOI', 'APPLE', 'ARCHOS',
       'ASUS', 'AT&T', 'BENEFON', 'BENQ', 'BENQ-SIEMENS', 'BIRD',
       'BLACKBERRY', 'BLACKVIEW', 'BLU', 'BOSCH', 'BQ', 'CASIO', 'CAT',
       'CELKON', 'CHEA', 'COOLPAD', 'CUBOT', 'DELL', 'DOOGEE', 'EMPORIA',
       'ENERGIZER', 'ERICSSON', 'ETEN', 'FAIRPHONE', 'FUJITSU SIEMENS',
       'GARMIN-ASUS', 'GIGABYTE', 'GIONEE', 'GOOGLE', 'HAIER', 'HMD',
       'HONOR', 'HP', 'HTC', 'HUAWEI', 'I-MATE', 'I-MOBILE', 'ICEMOBILE',
       'INFINIX', 'INNOSTREAM', 'INQ', 'INTEX', 'ITEL', 'JOLLA',
       'KARBONN', 'KYOCERA', 'LAVA', 'LEECO', 'LENOVO', 'LG', 'MAXON',
       'MAXWEST', 'MEIZU', 'MICROMAX', 'MICROSOFT', 'MITAC', 'MITSUBISHI',
       'MODU', 'MOTOROLA', 'MWG', 'NEC', 'NEONODE', 'NIU', 'NOKIA',
       'NOTHING', 'NVIDIA', 'O2', 'ONEPLUS', 'OPPO', 'ORANGE', 'OSCAL',
       'OUKITEL', 'PALM', 'PANASONIC', 'PANTECH', 'PARLA', 'PHILIPS',
       'PLUM', 'POSH', 'PRESTIGIO', 'QMOBILE', 'QTEK', 'RAZER', 'REALME',
   

In [36]:
# Scrape every MEIZU listing page and write OUTPUT/MEIZU.csv.
# With the default RATE_LIMIT the scraper sleeps 20 s before each phone page.
session.brand_scrape("MEIZU")

PAGE 1/2 for brand MEIZU


Scraping phone: 100%|██████████| 50/50 [18:09<00:00, 21.79s/it]


PAGE 2/2 for brand MEIZU


Scraping phone: 100%|██████████| 36/36 [13:05<00:00, 21.82s/it]
