In [5]:
import requests as rq
from urllib.parse import quote
import pandas as pd
import re
import time
import random
import json


In [6]:
# SCRAPING THE IDEA, BLOCKED BY THE WEBSITE

In [10]:
class PhoneSpecsHandler:
    """
    A class that handles the retrieval of phone specifications using an API.

    Attributes:
        api (str): The URL of the API.
        headers (dict): The headers to be used for the requests.
        filename (str): The name of the file to save the retrieved data.

    Methods:
        get_phone_brands: Retrieves the available phone brands from the API.
        get_phone_models: Retrieves the phone models for a specific brand from the API.
        get_phone_specs: Retrieves the specifications of a specific phone from the API.
        update_all_data: Retrieves the specifications of all phones from the API.
        save_data: Saves the retrieved phone specifications to a JSON file.
    """

    def __init__(self, filename='phone_data.json'):
        self.api = 'https://phone-specs-api.vercel.app'
        self.headers = {'Content-Type': 'application/json',
                        'User-Agent': 'Mozilla/5.0'}
        self.filename = filename

        try:
            with open(self.filename, 'r') as f:
                self.data = json.load(f)
        except Exception as e:
            print(f'[Error: {e}]: Creating new file...')
            self.data = dict()

            # save the file
            self.save_data()


        

    def __get(self, url):
        """
        Retrieves the JSON response from the API.

        Args:
            url (str): The URL to be requested.

        Returns:
            dict: The JSON response from the API.
        """

        # throttle the requests to avoid getting blocked
        # add random delay between 0.5 and 2 seconds
        start = time.time()
        response = rq.get(url, headers=self.headers, timeout=10).json()
        end = time.time()

        duration = end - start
        if duration < 2:
            d = int((2 - duration)*1000)
            time.sleep(random.randint(d, d + 500)/1000)

        return response
    
    def __try_get(self, url):
        retries = 0
        while True:
            if retries == 5:
                print(f'[Error: {result["error"]}]: Retries exceeded. Skipping...')
                return None
            
            try:
                result = self.__get(url)

                if str(result['status']) == 'True':
                    return result
                elif 'Please provide a valid phone slug!' in result['error']:                
                    print(f'[Error: {result["error"]}]: Retrying in 5s...')
                    retries += 1
                    time.sleep(5)

            except Exception as e:
                print(f'[Error: {e.args[0]}]: Retrying in 5s...')
                retries += 1
                time.sleep(5)                


    def get_phone_brands(self):
        """
        Retrieves the available phone brands from the API.

        Returns:
            dict: The JSON response containing the available phone brands.
        """
        url = self.api + '/brands'
        response = self.__try_get(url)
        
        return response

    def get_phone_models(self, brand_name, brand_id):
        """
        Retrieves the phone models for a specific brand from the API.

        Args:
            brand_name (str): The name of the phone brand.
            brand_id (int): The ID of the phone brand.

        Returns:
            dict: The JSON response containing the phone models for the specified brand.
        """
        url = self.api + f'/brands/{quote(brand_name.lower().replace(" ", "_"))}-phones-{brand_id}'
        phones = self.__try_get(url)
        
        print(phones, url, sep='\n')

        last_page = int(phones['data']['last_page']) 

        print(f'last page: {last_page}')
        if last_page == 1:
            return phones

        for page in range(1, last_page):
            url_page = self.api + f'/brands/{quote(brand_name.lower().replace(" ", "_"))}-phones-f-{brand_id}-0-p{page+1}'
            response_page = self.__try_get(url_page)

            print(phones)

            phones['data']['phones'].extend(response_page['data']['phones'])

        phones['data'].pop('current_page')
        phones['data'].pop('last_page')

        return phones
    
    def get_phone_models(self, brand_json):
            """
            Retrieves the phone models for a given brand from the GSMArena API.

            Args:
                brand_json (dict): The JSON response containing the brand information.

            Returns:
                dict: The updated brand JSON response with the phone models.

            """

            phones = self.__try_get(brand_json['detail'])          

            # Use re.search to find the first match
            match = re.search(r"(.*?)-(\d+)", brand_json['brand_slug'])

            last_page = int(phones['data']['last_page']) 

            if match:
                # Extract the groups from the match
                first_part = match.group(1)  # Text before the last hyphen
                number = int(match.group(2))  # Number after the last hyphen

                print(f'Page 1 out of {last_page} for {brand_json["brand_name"]} is processing.')

                for page in range(1, last_page):
                    url_page = self.api + f'/brands/{quote(first_part)}-f-{number}-0-p{page+1}'
                    response_page = self.__try_get(url_page)

                    if response_page:
                        phones['data']['phones'].extend(response_page['data']['phones'])
                        print(f'Page {page+1} out of {last_page} done.')
                    else:
                        print(f'Page {page+1} out of {last_page} failed to process. Skipping...')


            phones['data'].pop('current_page')
            phones['data'].pop('last_page')

            return phones
    
    def get_phone_specs(self, phone_name):
        """
        Retrieves the specifications of a specific phone from the API.

        Args:
            phone_name (str): The name of the phone.

        Returns:
            dict: The JSON response containing the specifications of the specified phone.
        """
        url = self.api + f'/{quote(phone_name)}'
        response = self.__try_get(url)
        
        return response
    
    def get_result(self, phone_name):
        """
        Retrieves the search results for a specific phone from the API.

        Args:
            phone_name (str): The name of the phone to search for.

        Returns:
            dict: The JSON response containing the search results for the specified phone.
        """
        url = self.api + f'/search?query={quote(phone_name)}'
        response = self.__try_get(url)

        return response
    
    def update_all_data(self, forced=False):
        print('Phone specs scraping at 2s delay per request. Please wait...')

        if forced:
            self.data = dict()

        brands = self.get_phone_brands()['data'][8:9]


        phone_sum =  sum([int(x) for x in [brand['device_count'] for brand in brands]])

        print(f'There are {phone_sum} phones to be processed.')

        brands_len = len(brands)

        current_phone = 0
        total_time = 0
        
        for i, models in enumerate(brands):
            print(f'{models["brand_name"]} is now processing.')

            if self.data.get(models["brand_name"]) is None:
                self.data[models["brand_name"]] = {}

            phones = self.get_phone_models(models)['data']['phones']
            phones_len = len(phones)
            

            for j, phone in enumerate(phones):
                if phone['slug'] in self.data[models["brand_name"]].keys():
                    print(f'[{round(((current_phone := current_phone + 1)/phone_sum)*100, 4): 0.4f}% done; {j+1}/{phones_len} {models["brand_name"]} phones; {current_phone}/{phone_sum} phones]: {phone["phone_name"]} already exists. Skipping...')
                    continue

                # initial time
                start = time.time()
                
                specs = self.__try_get(phone['detail'])     
                if specs:        
                    print(specs['data'])
                    self.data[models["brand_name"]][phone['slug']] = specs['data']
                    # clean all phones' specs data
                    self.data[models["brand_name"]][phone['slug']]['specifications'] = {j['title']: {k['key']: k['val'] for k in j['specs']} for j in  specs['data']['specifications']}

                    # end time
                    end = time.time()
                    remaining_time = ((total_time := total_time + end - start) / (j + 1)) * (phone_sum - (j + 1))
                    eta = time.time() + remaining_time

                    # convert hours to hours and minutes and seconds
                    print(f'[{round(((current_phone := current_phone + 1)/phone_sum)*100, 4): 0.4f}% done; ETA: {time.strftime("%I:%M:%S %p", time.localtime(eta))}, Remaining Time: {time.strftime("%H:%M:%S", time.gmtime(remaining_time))}; {j+1}/{phones_len} {models["brand_name"]} phones; {end-start:.3f}s; {current_phone}/{phone_sum} phones]: {phone["phone_name"]} done processing.')                
                else:
                    print(f'[{round(((current_phone := current_phone + 1)/phone_sum)*100, 4): 0.4f}% done; {j+1}/{phones_len} {models["brand_name"]} phones; {current_phone}/{phone_sum} phones]: {phone["phone_name"]} failed to process. Skipping...')


            print(f'[{round(((current_phone := current_phone + 1)/phone_sum)*100, 4): 0.4f}% done; {j+1}/{phones_len} {models["brand_name"]} phones; {current_phone}/{phone_sum} phones]: {models["brand_name"]} done processing, {brands_len - i} brands left.')

            # process/ normalize the data


        return self.data
    

    def count_data(self):
        return sum([len(self.data[brand]) for brand in self.data])
            

    def save_data(self):
        with open(self.filename, 'w') as f:
            json.dump(self.data, f, indent=1, sort_keys=True)


In [11]:
print("Initialising the phone data...")

handler = PhoneSpecsHandler()
brands = handler.get_phone_brands()

print("Phone data initialised.")

Initialising the phone data...
Phone data initialised.


In [12]:
handler.count_data()

2

In [13]:
data = handler.update_all_data()


Phone specs scraping at 2s delay per request. Please wait...
There are 4 phones to be processed.
AT&T is now processing.
Page 1 out of 1 for AT&T is processing.
{'brand': 'AT&T', 'phone_name': 'Quickfire', 'thumbnail': 'https://fdn2.gsmarena.com/vv/bigpic/quickfire.jpg', 'phone_images': ['https://fdn2.gsmarena.com/vv/pics/att/quickfire-1.jpg', 'https://fdn2.gsmarena.com/vv/pics/att/quickfire-2.jpg'], 'release_date': 'Released 2008, November', 'dimension': '136g, 18mm thickness', 'os': 'Feature phone', 'storage': '29MB storage, microSD slot', 'specifications': [{'title': 'Network', 'specs': [{'key': 'Technology', 'val': ['GSM / HSPA']}, {'key': '2G bands', 'val': ['GSM 850 / 900 / 1800 / 1900 ']}, {'key': '3G bands', 'val': ['HSDPA 850 / 1900 / 2100 ']}, {'key': 'Speed', 'val': ['HSPA 3.6/0.384 Mbps']}]}, {'title': 'Launch', 'specs': [{'key': 'Announced', 'val': ['2008, November. Released 2008, November']}, {'key': 'Status', 'val': ['Discontinued']}]}, {'title': 'Body', 'specs': [{'key'

{'AT&T': {'at&t_8525-2601': {'brand': 'AT&T',
   'dimension': '176g, 22mm thickness',
   'os': 'Microsoft Windows Mobile 6 Professional',
   'phone_images': ['https://fdn2.gsmarena.com/vv/pics/att/att-8525-1.jpg',
    'https://fdn2.gsmarena.com/vv/pics/att/att-8525-2.jpg'],
   'phone_name': '8525',
   'release_date': 'Released 2006, June',
   'specifications': {'Battery': {'Stand-by': ['Up to 250 h'],
     'Talk time': ['Up to 5 h'],
     'Type': ['Removable Li-Po 1350 mAh battery']},
    'Body': {'Dimensions': ['113 x 58 x 22 mm (4.45 x 2.28 x 0.87 in)'],
     'Keyboard': ['QWERTY'],
     'SIM': ['Mini-SIM'],
     'Weight': ['176 g (6.21 oz)']},
    'Comms': {'Bluetooth': ['2.0, A2DP'],
     'Infrared port': ['Yes'],
     'Positioning': ['No'],
     'Radio': ['No'],
     'USB': ['1.1'],
     'WLAN': ['Wi-Fi 802.11b/g']},
    'Display': {'Other': ['Handwriting recognition\n'],
     'Resolution': ['240 x 320 pixels, 4:3 ratio (~143 ppi density)'],
     'Size': ['2.8 inches, 42 x 57 mm, 

In [14]:
# save the data to a JSON file
handler.save_data()
print('Done.')
    

Done.


In [62]:
[[[z['key'] for z in y['specs']] for y in x['specifications']] for x in data['Samsung']]

[[['Technology', '2G bands', '3G bands', '4G bands', '5G bands', 'Speed'],
  ['Announced', 'Status'],
  ['Dimensions', 'Weight', 'Build', 'SIM', 'Other'],
  ['Type', 'Size', 'Resolution', 'Protection', 'Other'],
  ['OS', 'Chipset', 'CPU', 'GPU'],
  ['Card slot', 'Internal', 'Other'],
  ['Quad', 'Features', 'Video'],
  ['Single', 'Features', 'Video'],
  ['Loudspeaker', '3.5mm jack', 'Other'],
  ['WLAN', 'Bluetooth', 'Positioning', 'NFC', 'Radio', 'USB'],
  ['Sensors', 'Other'],
  ['Type', 'Charging'],
  ['Colors', 'Models', 'SAR', 'SAR EU', 'Price']],
 [['Technology', '2G bands', '3G bands', '4G bands', '5G bands', 'Speed'],
  ['Announced', 'Status'],
  ['Dimensions', 'Weight', 'Build', 'SIM', 'Other'],
  ['Type', 'Size', 'Resolution', 'Protection', 'Other'],
  ['OS', 'Chipset', 'CPU', 'GPU'],
  ['Card slot', 'Internal', 'Other'],
  ['Triple', 'Features', 'Video'],
  ['Single', 'Features', 'Video'],
  ['Loudspeaker', '3.5mm jack', 'Other'],
  ['WLAN', 'Bluetooth', 'Positioning', 'NFC', 

In [15]:
p = [handler.get_phone_models(v) for v in handler.get_phone_brands()['data']]

Page 1 out of 4 for Acer is processing.
Page 2 out of 4 done.


KeyboardInterrupt: 

In [10]:
print(sum([len(x) for x in handler.data.values()]), sum([len(x['data']['phones']) for x in p]))


12712 10127


In [16]:
[[(handler.data[y][x]['specifications']['Launch']['Announced']) for x in handler.data[y]] for y in handler.data]

[[['2006, June'],
  ['2007, October. Released 2007, October'],
  ['2008, November. Released 2008, November'],
  ['2007, November']]]

2007, October. Released 2007, October -> 2007, October
2004, Q1 -> 2004, January
2022 -> 2022, January


In [12]:
a = p[0]['data']['phones']


# enumerate handler.data.items() and p
b = handler.data.keys()


In [13]:
len([x['data']['phones'] for x in p]), len(handler.data.values())

(122, 122)

In [118]:
handler.save_data()

In [25]:
import re

def parse_date(line):
    # Define separate regex patterns for different formats
    regex_month = r"(\d{4}), (\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\b)(?:.*)"
    regex_quarter = r"(\d{4}), Q(\d)(?:.*)"
    regex_year_only = r"(\d{4})(?:.*)"

    # Match based on month presence
    if match_month := re.search(regex_month, line):
        year, month = match_month.groups()
        return year, month
    # Match based on quarter presence
    elif match_quarter := re.search(regex_quarter, line):
        year, quarter = match_quarter.groups()
        month_map = {1: "January", 2: "April", 3: "July", 4: "October"}
        month = month_map[int(quarter)]
        return year, month
    # Match based on year only
    elif match_year := re.search(regex_year_only, line):
        year = match_year.group(1)
        return year, "January"
    else:
        print(f"No valid format found in: {line}")
        return False



('2014', 'June')

In [16]:
for i, (v1, v2) in enumerate(zip([x['data']['phones'] for x in p], handler.data.values())):
    print(i, len(v1), len(v2))

0 100 4
1 300 100
2 157 157
3 25 25
4 47 47
5 118 118
6 43 43
7 202 202
8 4 369
9 9 20
10 35 35
11 28 28
12 61 9
13 92 61
14 89 92
15 300 89
16 10 10
17 20 5
18 5 22
19 22 229
20 229 12
21 12 46
22 46 57
23 57 20
24 20 66
25 66 15
26 15 69
27 69 40
28 40 22
29 22 4
30 4 2
31 2 5
32 5 63
33 63 95
34 95 27
35 27 41
36 59 287
37 206 59
38 41 206
39 287 441
40 300 61
41 34 125
42 37 18
43 61 15
44 125 3
45 18 60
46 5 24
47 15 660
48 32 145
49 3 9
50 60 246
51 24 5
52 145 31
53 9 41
54 246 72
55 300 289
56 31 32
57 41 12
58 72 25
59 289 8
60 32 601
61 12 73
62 25 30
63 8 3
64 300 576
65 5 2
66 73 3
67 3 45
68 30 66
69 300 294
70 2 19
71 3 32
72 45 17
73 66 123
74 294 72
75 19 10
76 32 229
77 17 113
78 123 30
79 72 56
80 10 90
81 229 21
82 113 2
83 30 184
84 56 120
85 90 1381
86 21 19
87 2 25
88 184 72
89 120 94
90 300 18
91 19 158
92 25 188
93 72 120
94 94 64
95 18 61
96 158 132
97 188 7
98 120 30
99 64 1
100 61 35
101 132 86
102 7 30
103 30 31
104 1 17
105 35 87
106 86 5
107 30 99
108 17 4