In [5]:
import pandas as pd
from bs4 import BeautifulSoup

import string

import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s :: %(levelname)s :: %(message)s')

import time

from common import webscraping_tools

from abc import ABC

In [23]:
from abc import abstractmethod


class AptoideScrapeTemplate(ABC):

    """
    Base class that downloads one Aptoide app page and locates its key HTML containers.

    The constructor calls :meth:`run` immediately, and ``run`` reads ``self.url``,
    so a subclass must assign ``self.url`` *before* calling ``super().__init__``.

    After a successful ``run`` the following attributes are set:
        - ``app_page``: the parsed page
        - ``div_container_main_html``: the page header container
        - ``app_version_container``: the version container
        - ``stats_container``: the stats row container
        - ``desc_container``: the description paragraph

    The container class names are class-level constants so that a subclass can
    override them if the site's markup changes.
    """

    # Risk here using exact class strings as these may change server-side.
    MAIN_CONTAINER_CLASS = 'header-desktop__HeaderContainer-xc5gow-0 eBfMrO'
    VERSION_CONTAINER_CLASS = 'mini-versions__Version-sc-19sko2j-4 ikysfs'
    STATS_CONTAINER_CLASS = 'mini-stats__Row-sc-188veh1-2 kSzdYC'
    DESCRIPTION_CLASS = 'description__Paragraph-sc-45j1b1-1 daWyZe'

    def __init__(self, log: bool):
        """
        Store the logging flag and scrape the page straight away.

        :param log: when True, progress messages are emitted through ``logging.info``.
        :raises ValueError: propagated from :meth:`run`.
        """
        self.log = log
        self.run()

    def run(self):
        """
        Download ``self.url``, parse it with bs4 and extract every container.

        :raises ValueError: if the page cannot be fetched/parsed, or if any of the
            expected containers is missing from the parsed page.
        """
        # Extract the raw page HTML and parse with bs4
        if self.log:
            logging.info(f"Extracting raw page info from URL: {self.url}.")

        try:
            # presumably user_agent_randomiser returns a requests-style response; only .content is used
            self.app_page = BeautifulSoup(webscraping_tools.user_agent_randomiser(self.url).content, 'html.parser')
            if self.log:
                # NOTE(review): the one-second pause only happens when logging is enabled;
                # confirm whether it is meant as a politeness delay for every request.
                time.sleep(1)
                logging.info(f"Raw page info for {self.url} successfully extracted.")
        except Exception as exc:
            # Catch Exception (not a bare except) so Ctrl-C / SystemExit still propagate,
            # and chain the original error so the root cause stays visible in the traceback.
            raise ValueError("Supplied URL returned no response") from exc

        self._extract_main_div_html(div_name=self.MAIN_CONTAINER_CLASS)
        self._app_version_div_container(div_name=self.VERSION_CONTAINER_CLASS)
        self._stats_div_container(div_name=self.STATS_CONTAINER_CLASS)
        self._description_container(div_name=self.DESCRIPTION_CLASS)

    @staticmethod
    def _check_response(div_name: str, method: object) -> None:
        """
        Static helper method that checks if the HTML response is None (or otherwise falsy).

        :param div_name: class name that was searched for; only used in the error message.
        :param method: the search result to validate.
        :raises ValueError: if ``method`` is falsy.
        """
        if not method:
            raise ValueError(f"Main container could not be extracted using supplied div-class name: {div_name}.")

    def _extract_main_div_html(self, div_name: str) -> None:
        """
        Store the first ``<div>`` whose class matches ``div_name`` as ``div_container_main_html``.

        :raises ValueError: if no such element exists.
        """
        # Filter the raw HTML to include just the main div container
        # Risk here using div name as this may change server-side
        self.div_container_main_html = self.app_page.find('div', {'class': div_name})
        self._check_response(div_name=div_name, method=self.div_container_main_html)

    def _app_version_div_container(self, div_name: str) -> None:
        """
        Store the first ``<div>`` whose class matches ``div_name`` as ``app_version_container``.

        :raises ValueError: if no such element exists.
        """
        self.app_version_container = self.app_page.find('div', {'class': div_name})
        self._check_response(div_name=div_name, method=self.app_version_container)

    def _stats_div_container(self, div_name: str) -> None:
        """
        Store the first ``<div>`` whose class matches ``div_name`` as ``stats_container``.

        :raises ValueError: if no such element exists.
        """
        self.stats_container = self.app_page.find('div', {'class': div_name})
        self._check_response(div_name=div_name, method=self.stats_container)

    def _description_container(self, div_name: str) -> None:
        """
        Store the first ``<p>`` whose class matches ``div_name`` as ``desc_container``.

        :raises ValueError: if no such element exists.
        """
        self.desc_container = self.app_page.find('p', {'class': div_name})
        self._check_response(div_name=div_name, method=self.desc_container)

In [24]:

class GetPageInfo(AptoideScrapeTemplate):

    """
    Scrapes a single Aptoide app page and exposes its details as read-only properties.

    Constructing an instance with the app's URL downloads and parses the page
    (handled by the parent class). Each property below then reads its value
    from the parsed HTML every time it is accessed:
        - app_name
        - app_version
        - app_release_date
        - app_downloads
        - app_size
        - app_requirements
        - app_description
        - app_display_photo_url
    """

    def __init__(self, url: str, log=False):
        # The URL has to be in place first: the parent initialiser runs the scrape,
        # and the scrape reads self.url.
        self.url = url
        super().__init__(log=log)

    @property
    def app_name(self) -> str:
        """Text of the first <h1> inside the main header container."""
        heading = self.div_container_main_html.find("h1")
        return heading.text

    @property
    def app_version(self) -> str:
        """Text of the first <div> nested in the version container."""
        version_divs = self.app_version_container.find_all('div')
        return version_divs[0].text

    @property
    def app_release_date(self) -> str:
        """Second <div> of the version container, reduced to digits and dashes."""
        raw_date = self.app_version_container.find_all('div')[1].text
        allowed = string.digits + "-"
        return ''.join([symbol for symbol in raw_date if symbol in allowed])

    @property
    def app_downloads(self) -> str:
        """Leading space-separated token of the first <div> in the stats container."""
        downloads_text = self.stats_container.find_all('div')[0].text
        return downloads_text.split(' ')[0]

    @property
    def app_size(self) -> str:
        """Leading space-separated token of the third <div> in the stats container."""
        size_text = self.stats_container.find_all('div')[2].text
        return size_text.split(' ')[0]

    @property
    def app_requirements(self) -> str:
        """Full text of the fifth <div> in the stats container."""
        stat_divs = self.stats_container.find_all('div')
        return stat_divs[4].text

    @property
    def app_description(self) -> str:
        """Children of the description paragraph with length above 2, joined by newlines."""
        kept = [child.text for child in self.desc_container if len(child) > 2]
        return '\n'.join(kept)

    @property
    def app_display_photo_url(self):
        """The ``src`` attribute of the first <img> inside the main header container."""
        icon = self.div_container_main_html.find('img')
        return icon['src']


In [25]:
# Constructing GetPageInfo fetches and parses the page immediately,
# because the base-class __init__ calls run().
s = GetPageInfo(url='https://instagram.en.aptoide.com/app')


In [None]:
class ScrapeAptoidWebsite(ScrapeAppPage):

    """
    Placeholder for a site-wide scraper built on top of ScrapeAppPage.

    The stated intent is to scrape every app on the Aptoide website and return
    a data frame with all relevant information. So far the class only declares
    HOME_URL and forwards construction to its parent.

    NOTE(review): ScrapeAppPage is not defined in the cells shown in this
    notebook - confirm where it comes from before running this cell.
    """

    # presumably the starting page for the site-wide crawl - TODO confirm
    HOME_URL = 'https://en.aptoide.com/group/applications'

    def __init__(self):
        super().__init__()

    

    

In [27]:
url = 'https://linkedin-android.en.aptoide.com/app'

# Extract the raw page HTML and parse with bs4
logging.info(f"Extracting raw page info from URL: {url}.")

try:
    app_page = BeautifulSoup(webscraping_tools.user_agent_randomiser(url).content, 'html.parser')
    time.sleep(1)
    logging.info(f"Raw page info for {url} successfully extracted.")
except Exception as exc:
    # Catch Exception rather than using a bare except so that interrupting the
    # kernel (KeyboardInterrupt) is not converted into a ValueError, and chain
    # the original error so the real cause is shown in the traceback.
    raise ValueError("Supplied URL returned no response") from exc


2022-08-29 15:57:04,067 :: INFO :: Extracting raw page info from URL: https://linkedin-android.en.aptoide.com/app.
2022-08-29 15:57:06,440 :: INFO :: Raw page info for https://linkedin-android.en.aptoide.com/app successfully extracted.


In [28]:
# Filter the raw HTML to include just the main div container
# Risk here using div name as this may change server-side

div_container_main_str = 'header-desktop__HeaderContainer-xc5gow-0 eBfMrO'

div_container_main_html = app_page.find('div', {'class': div_container_main_str})
if not div_container_main_html:
    # Fail here with a clear message. A debug-level log is hidden by the INFO
    # level configured for this notebook, and later cells would otherwise
    # crash with an AttributeError on None.
    raise ValueError(f"Main container could not be extracted using supplied div-class name: {div_container_main_str}.")

In [29]:




# The app name is the text of the <h1> found inside the main header container.
app_name = div_container_main_html.find("h1").text

In [30]:
# Exact class string of the version block; this may change server-side.
VERSION_CONTAINER_CLS_NAME = "mini-versions__Version-sc-19sko2j-4 ikysfs"

# Locate the <div> carrying that class on the parsed page.
version_container = app_page.find('div', attrs={'class': VERSION_CONTAINER_CLS_NAME})

In [31]:
# The first nested <div> is read as the version string.
app_version = version_container.find_all('div')[0].text
# The second nested <div> is read as the release date; keep only digits and dashes.
app_version_release = ''.join(
    filter((string.digits + "-").__contains__, version_container.find_all('div')[1].text)
)

In [32]:
app_version_release

'04-08-2022'

In [33]:
# Exact class string of the stats row; this may change server-side.
STATS_CONTAINER_CLS_NAME = 'mini-stats__Row-sc-188veh1-2 kSzdYC'

stats_container = app_page.find('div', attrs={'class': STATS_CONTAINER_CLS_NAME})

# Values are picked out of the nested <div>s by position (0, 2 and 4).
# For downloads and size only the text before the first space is kept.
app_downloads = stats_container.find_all('div')[0].text.partition(' ')[0]

app_size = stats_container.find_all('div')[2].text.partition(' ')[0]

android_requirements = stats_container.find_all('div')[4].text

In [34]:
android_requirements

'8.1.0+'

In [35]:
DESC_CONTAINER_CLS_NAME = 'description__Paragraph-sc-45j1b1-1 daWyZe'

# Keep the description paragraph under its own name. Re-using `stats_container`
# here would silently overwrite the stats row that other cells read from.
desc_container = app_page.find('p', {'class': DESC_CONTAINER_CLS_NAME})
if not desc_container:
    raise ValueError(f"Description could not be extracted using supplied class name: {DESC_CONTAINER_CLS_NAME}.")

# Build the final string in one step so `app_description` is always a str.
# Only children with length above 2 are kept, as before.
app_description = '\n\n'.join(
    child.text for child in desc_container.contents if len(child) > 2
)

In [36]:
# Joining is only valid while app_description is still a list of paragraphs.
# Calling str.join on an already-joined string would insert a blank line
# between every character, so the guard keeps this cell safe to re-run.
if not isinstance(app_description, str):
    app_description = '\n\n'.join(app_description)

In [40]:
div_container_main_html.find('img')['src']

'https://cdn6.aptoide.com/imgs/9/d/4/9d42391daecb952cdffaad185d9aae4b_icon.png?w=160'

In [38]:
stats_container

<img alt="Games" class="side-feature__FeatureIcon-zdzgtp-2 gFRWaT" src="https://cdn-mobile.aptoide.com/static/imgs/games-ic.svg"/>