In [2]:
import requests as rq
from urllib.parse import quote
import pandas as pd
import re
import time
import random
import json


In [3]:
# NOTE: the original scraping approach was scrapped -- the website blocked it.

In [4]:
class PhoneSpecsHandler:
    """
    Retrieves phone specifications from an HTTP API and caches them in a JSON file.

    The catalogue kept in ``self.data`` is a nested dict
    ``{brand_name: {phone_slug: phone_payload}}``. Each ``phone_payload`` is the
    ``data`` object of the API's phone-detail response with ``specifications``
    flattened to ``{section_title: {spec_key: spec_value}}``.

    Attributes:
        api (str): The base URL of the API.
        headers (dict): The headers to be used for the requests.
        filename (str): The JSON file the catalogue is loaded from and saved to.
        data (dict): The catalogue described above.
        MAX_RETRIES (int): Failed attempts tolerated per URL before giving up.
        RETRY_DELAY (float): Seconds slept after each failed attempt.
        MIN_REQUEST_INTERVAL (float): Every request is padded to at least this
            many seconds to throttle the scraping.

    Methods:
        get_phone_brands: Retrieves the available phone brands from the API.
        get_phone_models: Retrieves every page of phone models for one brand.
        get_phone_specs: Retrieves the specifications of a specific phone.
        get_result: Retrieves the search results for a phone name.
        update_all_data: Fetches the specifications of all phones not cached yet.
        count_data: Counts the phones currently held in the catalogue.
        save_data: Normalizes new/changed entries and writes the catalogue to disk.
        parse_date: Turns an "Announced" text into a ``(year, month)`` pair.
    """

    # Retry / throttling knobs (previously hard-coded inside the methods).
    MAX_RETRIES = 5
    RETRY_DELAY = 5
    MIN_REQUEST_INTERVAL = 2

    # (section, key, label used in error messages) of the comma-separated specs
    # that save_data() splits into lists.
    _LIST_SPECS = (
        ('Misc', 'Models', 'models'),
        ('Display', 'Size', 'display size'),
        ('Battery', 'Type', 'battery type'),
    )

    def __init__(self, filename='phone_data.json'):
        """
        Loads the cached catalogue from ``filename`` or starts an empty one.

        Args:
            filename (str): Path of the JSON cache file.
        """
        self.api = 'https://phone-specs-api.vercel.app'
        self.headers = {'Content-Type': 'application/json',
                        'User-Agent': 'Mozilla/5.0'}
        self.filename = filename

        try:
            with open(self.filename, 'r') as f:
                self.data = json.load(f)
        except (OSError, ValueError) as e:
            # Missing or unreadable/corrupt file: start from scratch.
            print(f'[Error: {e}]: Creating new file...')
            self.data = dict()

            # save the file (save_data tolerates the missing previous file)
            self.save_data()

    def __get(self, url):
        """
        Retrieves the JSON response from the API.

        Args:
            url (str): The URL to be requested.

        Returns:
            dict: The JSON response from the API.
        """
        # Throttle the requests to avoid getting blocked: every call is padded
        # so that it lasts MIN_REQUEST_INTERVAL seconds plus up to 0.5s of jitter.
        start = time.time()
        response = rq.get(url, headers=self.headers, timeout=10).json()
        duration = time.time() - start

        if duration < self.MIN_REQUEST_INTERVAL:
            d = int((self.MIN_REQUEST_INTERVAL - duration) * 1000)
            time.sleep(random.randint(d, d + 500) / 1000)

        return response

    def __try_get(self, url):
        """
        Requests ``url`` with bounded retries.

        An attempt fails when the request raises, or when the response's
        ``status`` is not ``True``. Every failure (whatever its cause) counts
        towards ``MAX_RETRIES``, so the method always terminates.

        Args:
            url (str): The URL to be requested.

        Returns:
            dict | None: The successful JSON response, or None once the retries
            are exhausted.
        """
        last_error = None
        for _ in range(self.MAX_RETRIES):
            try:
                result = self.__get(url)
                if str(result['status']) == 'True':
                    return result
                last_error = result.get('error', 'unknown error')
            except Exception as e:
                # Network errors, invalid JSON, unexpected payload shapes, ...
                last_error = e

            print(f'[Error: {last_error}]: Retrying in {self.RETRY_DELAY}s...')
            time.sleep(self.RETRY_DELAY)

        print(f'[Error: {last_error}]: Retries exceeded. Skipping...')
        return None

    def get_phone_brands(self):
        """
        Retrieves the available phone brands from the API.

        Returns:
            dict | None: The JSON response containing the available phone
            brands, or None when the request kept failing.
        """
        url = self.api + '/brands'
        response = self.__try_get(url)

        return response

    def get_phone_models(self, brand_json):
        """
        Retrieves all pages of phone models for a given brand.

        Args:
            brand_json (dict): One brand entry of the brands response. The keys
                ``detail`` (URL of the first page), ``brand_slug`` and
                ``brand_name`` are read.

        Returns:
            dict | None: The first page's response with the phones of all
            further pages appended to ``['data']['phones']`` and the paging keys
            removed, or None when the first page could not be retrieved.
        """
        phones = self.__try_get(brand_json['detail'])
        if not phones:
            return None

        last_page = int(phones['data']['last_page'])

        # Split the slug at its first "-<digits>" group: the text before it and
        # the number are the two variable parts of the paged URL.
        match = re.search(r"(.*?)-(\d+)", brand_json['brand_slug'])

        if match:
            first_part = match.group(1)
            number = int(match.group(2))

            print(f'Page 1 out of {last_page} for {brand_json["brand_name"]} is processing.')

            for page in range(2, last_page + 1):
                url_page = self.api + f'/brands/{quote(first_part)}-f-{number}-0-p{page}'
                response_page = self.__try_get(url_page)

                if response_page:
                    phones['data']['phones'].extend(response_page['data']['phones'])
                    print(f'Page {page} out of {last_page} done.')
                else:
                    print(f'Page {page} out of {last_page} failed to process. Skipping...')
        elif last_page > 1:
            print(f'[Warning]: Cannot build page URLs from slug {brand_json["brand_slug"]!r}. Only page 1 was retrieved.')

        phones['data'].pop('current_page', None)
        phones['data'].pop('last_page', None)

        return phones

    def get_phone_specs(self, phone_name):
        """
        Retrieves the specifications of a specific phone from the API.

        Args:
            phone_name (str): The name of the phone.

        Returns:
            dict | None: The JSON response containing the specifications of the
            specified phone, or None when the request kept failing.
        """
        url = self.api + f'/{quote(phone_name)}'
        response = self.__try_get(url)

        return response

    def get_result(self, phone_name):
        """
        Retrieves the search results for a specific phone from the API.

        Args:
            phone_name (str): The name of the phone to search for.

        Returns:
            dict | None: The JSON response containing the search results, or
            None when the request kept failing.
        """
        url = self.api + f'/search?query={quote(phone_name)}'
        response = self.__try_get(url)

        return response

    @staticmethod
    def _percent(done, total):
        """Formats ``done / total`` as the percentage text used in progress lines."""
        share = (done / total) * 100 if total else 100.0
        return f'{round(share, 4): 0.4f}'

    @staticmethod
    def _is_coming_soon(phone_payload):
        """Tells whether a cached phone's Launch -> Status mentions "coming soon"."""
        try:
            status = phone_payload['specifications']['Launch']['Status'][0]
        except (KeyError, IndexError, TypeError):
            return False
        return 'coming soon' in str(status).lower()

    def update_all_data(self, forced=False):
        """
        Fetches the specifications of every phone that is not cached yet.

        Phones already in the catalogue are skipped, except the ones whose
        status is "coming soon", which are fetched again. The catalogue is only
        updated in memory; call ``save_data`` to persist it.

        Args:
            forced (bool): When True, the catalogue is emptied first so that
                everything is fetched again.

        Returns:
            dict: The (updated) catalogue, i.e. ``self.data``.
        """
        print('Phone specs scraping at 2s delay per request. Please wait...')

        if forced:
            self.data = dict()

        brands_response = self.get_phone_brands()
        if not brands_response:
            print('[Error]: Could not retrieve the brand list. Nothing was updated.')
            return self.data
        brands = brands_response['data']

        phone_sum = sum(int(brand['device_count']) for brand in brands)
        old_count = self.count_data()

        print(f'There are {phone_sum - old_count} phones to be processed.\nNew phones catalog count: {phone_sum}\nOld phones catalog count: {old_count}')

        if phone_sum - old_count == 0:
            print(f'[100% done]: No new phones to be processed. Done processing.')
            return self.data

        brands_len = len(brands)

        current_phone = 0   # phones handled so far (fetched, skipped or failed)
        total_time = 0      # seconds spent on successful fetches
        fetched = 0         # number of successful fetches, for the ETA average

        for i, models in enumerate(brands):
            brand_name = models['brand_name']
            brand_data = self.data.setdefault(brand_name, {})
            brands_left = brands_len - i - 1

            if int(models['device_count']) == len(brand_data):
                current_phone += int(models['device_count'])
                print(f'[{self._percent(current_phone, phone_sum)}% done; {current_phone}/{phone_sum} phones]: No new {brand_name} models to be processed. {brand_name} done processing, {brands_left} brands left.')
                continue

            models_response = self.get_phone_models(models)
            if not models_response:
                print(f'[Error]: Could not retrieve the {brand_name} models. Skipping, {brands_left} brands left.')
                continue

            phones = models_response['data']['phones']
            phones_len = len(phones)

            for j, phone in enumerate(phones):
                slug = phone['slug']

                if slug in brand_data:
                    if self._is_coming_soon(brand_data[slug]):
                        print(f'[COMING SOON]: Trying to update {phone["phone_name"]}...')
                    else:
                        current_phone += 1
                        print(f'[{self._percent(current_phone, phone_sum)}% done; {j+1}/{phones_len} {brand_name} phones; {current_phone}/{phone_sum} phones]: {phone["phone_name"]} already exists. Skipping...')
                        continue

                # initial time
                start = time.time()

                specs = self.__try_get(phone['detail'])
                current_phone += 1

                if specs:
                    payload = specs['data']
                    # flatten the specs: [{title, specs: [{key, val}]}] -> {title: {key: val}}
                    payload['specifications'] = {
                        section['title']: {spec['key']: spec['val'] for spec in section['specs']}
                        for section in payload['specifications']
                    }
                    brand_data[slug] = payload

                    # end time
                    end = time.time()
                    total_time += end - start
                    fetched += 1

                    # ETA = average duration of a successful fetch * phones left overall
                    remaining_time = (total_time / fetched) * max(phone_sum - current_phone, 0)
                    eta = time.time() + remaining_time

                    print(f'[{self._percent(current_phone, phone_sum)}% done; ETA: {time.strftime("%I:%M:%S %p", time.localtime(eta))}, Remaining Time: {time.strftime("%H:%M:%S", time.gmtime(remaining_time))}; {j+1}/{phones_len} {brand_name} phones; {end-start:.3f}s; {current_phone}/{phone_sum} phones]: {phone["phone_name"]} done processing.')
                else:
                    print(f'[{self._percent(current_phone, phone_sum)}% done; {j+1}/{phones_len} {brand_name} phones; {current_phone}/{phone_sum} phones]: {phone["phone_name"]} failed to process. Skipping...')

            print(f'[{self._percent(current_phone, phone_sum)}% done; {phones_len}/{phones_len} {brand_name} phones; {current_phone}/{phone_sum} phones]: {brand_name} done processing, {brands_left} brands left.')

        print(f'[100% done; {current_phone}/{phone_sum} phones]: Done processing.')

        return self.data

    def count_data(self):
        """Returns the number of phones in the catalogue, summed over all brands."""
        return sum(len(brand_phones) for brand_phones in self.data.values())

    @staticmethod
    def _as_list(values):
        """Wraps a lone string into a list; any other iterable is copied to a list."""
        if isinstance(values, str):
            return [values]
        return list(values)

    def _normalize_phone(self, brand, slug, phone):
        """
        Normalizes one phone payload in place.

        * Launch -> Announced becomes ``[year, month]`` (or ``[]``).
        * Misc -> Models, Display -> Size and Battery -> Type become lists split
          on ``', '``; a missing section/key becomes ``['Unknown']``.

        Running it again on an already normalized payload changes nothing.
        """
        specs = phone.get('specifications') if isinstance(phone, dict) else None
        if not isinstance(specs, dict):
            print(f'[Error: no specifications]: {brand} {slug} normalization failed.')
            return

        # Normalize the announced date. Joining first makes this safe to repeat:
        # ['2019', 'December'] is parsed back to the very same pair.
        launch = specs.get('Launch')
        if isinstance(launch, dict) and 'Announced' in launch:
            try:
                announced = ', '.join(str(part) for part in self._as_list(launch['Announced']))
                launch['Announced'] = list(self.parse_date(announced))
            except Exception as e:
                print(f'[Error: {e}]: {brand} {slug} announced date normalization failed.')

        # Normalize the comma-separated specs; each one is handled independently
        # so that a problem with one of them does not skip the others.
        for section, key, label in self._LIST_SPECS:
            try:
                section_specs = specs.setdefault(section, {})
                values = section_specs.get(key)
                if not values:
                    section_specs[key] = ['Unknown']
                else:
                    section_specs[key] = [part
                                          for value in self._as_list(values)
                                          for part in str(value).split(', ')]
            except Exception as e:
                print(f'[Error: {e}]: {brand} {slug} {label} normalization failed.')

    def save_data(self, filename=None, forced_normalize=False):
        """
        Normalizes the catalogue and writes it to a JSON file.

        Only the phones that are new or differ from what ``self.filename``
        currently holds are normalized, unless ``forced_normalize`` is set.
        After writing, the catalogue is reloaded from the written file.

        Args:
            filename (str | None): File to write to. Defaults to
                ``self.filename`` (``'phone_data.json'`` for a default handler).
            forced_normalize (bool): Normalize every phone, not only the
                new/changed ones.
        """
        if filename is None:
            filename = self.filename

        # open the previous file; a missing or unreadable file means "nothing saved yet"
        print('Loading the previous data...')
        try:
            with open(self.filename, 'r') as f:
                previous = json.load(f)
        except (OSError, ValueError):
            previous = dict()
        if not isinstance(previous, dict):
            previous = dict()

        print('Processing the data...')
        for brand, brand_phones in self.data.items():
            known = previous.get(brand)
            if not isinstance(known, dict):
                known = dict()

            for slug, phone in brand_phones.items():
                unchanged = slug in known and known[slug] == phone
                if unchanged and not forced_normalize:
                    continue
                self._normalize_phone(brand, slug, phone)

        print('Saving the data...')
        with open(filename, 'w') as f:
            json.dump(self.data, f, indent=1, sort_keys=True)

        # reload the file so that the in-memory data matches the JSON form exactly
        with open(filename, 'r') as f:
            self.data = json.load(f)

    # tools
    @staticmethod
    def parse_date(line):
        """
        Extracts ``(year, month)`` from an announcement text.

        Recognized forms, tried in this order: ``"<year>, <month name>..."``,
        ``"<year>, Q<n>..."`` (mapped to the first month of the quarter) and a
        bare four-digit year (mapped to January).

        Args:
            line (str): The text to parse.

        Returns:
            tuple | list: ``(year, month)`` as strings, or ``[]`` when no
            four-digit year is found.
        """
        # Define separate regex patterns for different formats
        regex_month = r"(\d{4}), (\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\b)(?:.*)"
        regex_quarter = r"(\d{4}), Q(\d)(?:.*)"
        regex_year_only = r"(\d{4})(?:.*)"

        # Match based on month presence
        if match_month := re.search(regex_month, line):
            year, month = match_month.groups()
            return year, month
        # Match based on quarter presence
        elif match_quarter := re.search(regex_quarter, line):
            year, quarter = match_quarter.groups()
            month_map = {1: "January", 2: "April", 3: "July", 4: "October"}
            month = month_map[int(quarter)]
            return year, month
        # Match based on year only
        elif match_year := re.search(regex_year_only, line):
            year = match_year.group(1)
            return year, "January"
        else:
            return []



In [5]:
# Build the handler once for the whole notebook; its constructor reads the
# cached catalogue from the default JSON file.
print("Initialising the phone data...")
handler = PhoneSpecsHandler()
print("Phone data initialised.")

Initialising the phone data...
Phone data initialised.


In [386]:
%%script echo skipping

# Disabled maintenance step: the %%script magic above hands this cell to `echo`
# instead of running it as Python. Remove the magic line to re-normalize every
# phone in the catalogue (not only the new ones) and rewrite the JSON file.
handler.save_data(forced_normalize=True)

Loading the previous data...
Processing the data...
Saving the data...


In [6]:
# Number of phones currently held in the handler's catalogue, summed over all brands.
handler.count_data()

12715

In [None]:
# Fetch the specifications of every phone that is not in the catalogue yet.
# Long-running: the handler throttles each request to roughly 2 seconds.
data = handler.update_all_data()

# update_all_data() only changes the in-memory catalogue; persist it to the JSON file
handler.save_data()
print('Done.')

In [230]:
%%script echo skipping

# remove phones with Display Size lower than 2.5 inches (watch )

for brand in handler.data:
    for phone in handler.data[brand]:
        # check if Misc is present
        if 'Battery' not in handler.data[brand][phone]['specifications']:
            handler.data[brand][phone]['specifications']['Battery'] = {'Type': ['Unknown']}
            continue

        if 'Type' not in handler.data[brand][phone]['specifications']['Display']:
            handler.data[brand][phone]['specifications']['Battery']['Type'] = ['Unknown']
            continue

        handler.data[brand][phone]['specifications']['Battery']['Type'] = handler.data[brand][phone]['specifications']['Battery']['Type'][0].split(', ')

handler.save_data()

In [7]:
# Keep only the phones announced in 2019 or later. Phones whose announcement
# date could not be parsed (an empty list) are left out.
latest_phones = {}
for brand_name, brand_phones in handler.data.items():
    for slug, phone_info in brand_phones.items():
        announced = phone_info['specifications']['Launch']['Announced']
        if not announced or int(announced[0]) < 2019:
            continue
        latest_phones.setdefault(brand_name, {})[slug] = phone_info

sum_of_phones = sum(len(kept) for kept in latest_phones.values())
sum_of_phones

3246

In [8]:
# Remove everything that is not a released phone from latest_phones, in three
# passes: by display size, by name, and by "coming soon" status. Each pass keeps
# what it removed in its own {brand: {slug: phone}} dict.

MIN_PHONE_INCHES = 2.25      # smaller displays without USB are treated as non-phones
MIN_TABLET_INCHES = 9        # displays this large are treated as non-phones
NON_PHONE_NAME_MARKERS = ('tab', 'watch', 'fire_hd')


def pop_matching(catalog, predicate):
    """Remove and return, as {brand: {slug: phone}}, every phone for which predicate(slug, phone) is true."""
    matched = {}
    for brand, brand_phones in catalog.items():
        hits = {slug: phone for slug, phone in brand_phones.items() if predicate(slug, phone)}
        if hits:
            matched[brand] = hits

    # remove in a second step so that no dict is changed while it is iterated
    for brand, hits in matched.items():
        for slug in hits:
            catalog[brand].pop(slug)

    return matched


def display_inches(phone):
    """Return the leading number of the first Display -> Size entry, or None if it is not a number (e.g. 'Unknown')."""
    try:
        return float(phone['specifications']['Display']['Size'][0].split()[0])
    except (KeyError, IndexError, ValueError, AttributeError, TypeError):
        return None


def has_no_usb(phone):
    """Tell whether the first Comms -> USB entry is exactly 'No'; missing data counts as "has USB"."""
    try:
        return phone['specifications']['Comms']['USB'][0] == 'No'
    except (KeyError, IndexError, TypeError):
        return False


def is_non_phone_by_size(slug, phone):
    """Very small displays without USB, or very large displays."""
    inches = display_inches(phone)
    if inches is None:
        # size unknown: cannot be judged by size, keep it
        return False
    return (inches < MIN_PHONE_INCHES and has_no_usb(phone)) or inches >= MIN_TABLET_INCHES


def is_non_phone_by_name(slug, phone):
    """Slugs containing pad (but not coolpad), tab, watch or fire_hd."""
    name = slug.lower()
    if 'pad' in name and 'coolpad' not in name:
        return True
    return any(marker in name for marker in NON_PHONE_NAME_MARKERS)


def is_coming_soon(slug, phone):
    """Phones whose first Launch -> Status entry mentions "coming soon"."""
    return 'coming soon' in phone['specifications']['Launch']['Status'][0].lower()


def count_phones(catalog):
    """Number of phones in a {brand: {slug: phone}} dict."""
    return sum(len(brand_phones) for brand_phones in catalog.values())


# the order matters: each pass only sees what the previous ones left behind
non_phone = pop_matching(latest_phones, is_non_phone_by_size)
non_phone_2 = pop_matching(latest_phones, is_non_phone_by_name)
coming_phones = pop_matching(latest_phones, is_coming_soon)


with open('latest_phones.json', 'w') as f:
    json.dump(latest_phones, f, indent=1, sort_keys=True)

# reload so that latest_phones is exactly what the file contains
with open('latest_phones.json', 'r') as f:
    latest_phones = json.load(f)

count_phones(latest_phones), count_phones(non_phone_2), count_phones(non_phone), count_phones(coming_phones)

(2855, 39, 344, 8)

In [9]:
import bs4
import datetime

device_list = r"https://phonedb.net/sitemap/"

# Only device pages modified after this moment (2017-12-30 22:00 UTC) are kept.
cutoff = datetime.datetime(2017, 12, 30, 22, 0, 0, 0, datetime.timezone.utc)
device_prefix = "https://phonedb.net/index.php?m=device&id="


# get the xml file from the sitemap; fail fast on a hanging connection or an
# HTTP error instead of parsing an error page
sitemap = rq.get(device_list, timeout=30)
sitemap.raise_for_status()
soup = bs4.BeautifulSoup(sitemap.text, "xml")

locs = []
for url in soup.find_all("url"):
    # parse the lastmod tag (e.g. 2017-12-30T22:10:18+01:00) into an aware datetime
    date = datetime.datetime.strptime(url.lastmod.text, "%Y-%m-%dT%H:%M:%S%z")
    if date > cutoff and url.loc.text.startswith(device_prefix):
        locs.append(url.loc.text)


# remove the first 2 urls: per the original note they are not devices
# NOTE(review): both already passed the device-URL prefix check -- confirm they really are placeholders
locs.pop(0)
locs.pop(0)

print("There are " + str(len(locs)) + " devices in phonedb.net")

There are 10078 devices in phonedb.net


In [72]:
# Look up every phone of latest_phones among the phonedb.net device URLs.
# A URL matches when it contains the brand, the phone name and -- unless the
# phone's model list contains 'Unknown' -- one of the phone's model names.
import re

phones = {}
for brand, brand_phones in latest_phones.items():
    brand_token = f'c={brand}'.lower()

    for phone, phone_info in brand_phones.items():
        models = phone_info['specifications']['Misc']['Models']
        model_unknown = 'Unknown' in models

        for model in models:
            phone_name = phone_info['phone_name'].replace(' ', '_')
            name_token = f'_{phone_name}_'.lower()
            model_token = f'_{model}'.lower()

            for loc in locs:
                if brand_token not in loc or name_token not in loc:
                    continue
                if model_unknown or model_token in loc:
                    phones.setdefault(brand, {}).setdefault(phone, []).append(loc)


# number of phones with at least one matching URL
sum(len(matched) for matched in phones.values())

924

In [73]:
# Show the whole brand -> phone slug -> matched phonedb.net URLs mapping (large output).
phones

{'Allview': {'allview_p10_life-9621': ['https://phonedb.net/index.php?m=device&id=16531&c=allview_p10_life_dual_sim_td-lte_emea'],
  'allview_p10_max-9687': ['https://phonedb.net/index.php?m=device&id=16542&c=allview_p10_max_dual_sim_td-lte_emea'],
  'allview_p10_mini-9688': ['https://phonedb.net/index.php?m=device&id=16535&c=allview_p10_mini_dual_sim_td-lte_emea'],
  'allview_p10_pro-9686': ['https://phonedb.net/index.php?m=device&id=16543&c=allview_p10_pro_dual_sim_td-lte_emea'],
  'allview_v4_viper-9908': ['https://phonedb.net/index.php?m=device&id=16525&c=allview_v4_viper_pro_dual_sim_td-lte_emea',
   'https://phonedb.net/index.php?m=device&id=16523&c=allview_v4_viper_dual_sim_td-lte_emea'],
  'allview_v4_viper_pro-10039': ['https://phonedb.net/index.php?m=device&id=16525&c=allview_v4_viper_pro_dual_sim_td-lte_emea']},
 'Apple': {'apple_iphone_11-9848': ['https://phonedb.net/index.php?m=device&id=16214&c=apple_iphone_11_a2221_dual_sim_td-lte_jp_256gb__apple_iphone_12,1',
   'https: