<a href="https://colab.research.google.com/github/TK-Problem/Python-mokymai/blob/master/Scripts/autoplius.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Importuoti paketus

# playwright biblioteka naudojama importuoti html kodą
!pip install playwright==1.25.00
!playwright install-deps
!playwright install webkit
!pip install nest_asyncio

# playwright veikia TIK asyncio režimu
import nest_asyncio
nest_asyncio.apply()
import asyncio

# importuoti playwright versiją
from playwright.async_api import async_playwright

# atsisiųsti html kodą paprastai
import requests

# kartais reikia palaikyti kurį laiką programą veikiančią
import time

# bs4 naudojama iš HTML ištraukti reikiamą informaciją
from bs4 import BeautifulSoup

# paketai dirbti su skaičiais ir duomenimis
import pandas as pd
import numpy as np

# stebėti programos progresą
from tqdm import tqdm

# skirta interaktyvumui
import ipywidgets as widgets
from IPython.display import display

# clear output komanda naudojama išvalyti informacijai
from IPython.display import clear_output
clear_output()

## Visi skelbimai

Sugeneruoti sąrašą skelbimų, kurie šiuo metu yra rinkoje.

In [2]:
#@title Autoplius skelbimų funkcija
async def get_autoplius_ads():
    """
    Collect the current autoplius ad counts per car manufacturer and model.

    Opens the autoplius manufacturer picker (search from the 2000s) in a
    WebKit browser, accepts the cookie banner and then visits the page of
    every manufacturer that has at least 50 ads. On each of those pages the
    links inside the first <section class="panel"> are read; the first link
    is skipped (presumably a "all models" entry - TODO confirm).

    Side effects:
      writes a debugging screenshot to "page_status.png"

    Output:
      returns pandas DataFrame with the columns
      ['Manufacturer', 'CarMake', 'AdCount', 'SearchHref'], one row per
      link that was read
    """
    # start URL (search from 2000s)
    url = "https://m.autoplius.lt/paieska/reiksmes-parinkimas/naudoti-automobiliai?field_name=make_id&title_page=1&qt=&qt_autocomplete=&category_id=2&make_date_from=2000"

    # user agent sent by the browser page
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:95.0) Gecko/20100101 Firefox/95.0'

    # minimum number of ads a manufacturer needs to be visited
    min_ads = 50

    # collected rows: [manufacturer, model/search title, ad count, search href]
    rows = []

    async with async_playwright() as p:

        # create webdriver/webkit
        browser = await p.webkit.launch()

        try:
            # create new page, i.e. new tab in the browser
            page = await browser.new_page(user_agent=user_agent)

            # visit page
            await page.goto(url, timeout=60000)

            # give the page time to settle; asyncio.sleep is used instead of
            # time.sleep so the event loop (and Playwright) keeps running
            await asyncio.sleep(2)

            # save image to your environment (for debugging);
            # this line can be commented out when it is not needed
            await page.screenshot(path="page_status.png")

            # click cookie button
            await page.click("//button[@id='onetrust-accept-btn-handler']")

            # convert the page html to a bs4 object
            soup = BeautifulSoup(await page.content(), "lxml")

            # find all available search objects, i.e. car manufacturers
            car_manu = soup.find_all("a", {'class': 'search-dropdown'})

            # iterate over car manufacturers
            for car_element in tqdm(car_manu):
                # the ad count is the text of the link's first <span>
                ad_cnt = int(car_element.span.text)

                # skip makers with too few ads
                if ad_cnt < min_ads:
                    continue

                # get car maker
                name_manu = car_element['title']

                # visit the manufacturer page
                await page.goto(car_element['href'], timeout=60000)

                # convert the page html to a bs4 object
                manu_soup = BeautifulSoup(await page.content(), "lxml")

                # get available ads
                car_types = manu_soup.find("section", {"class": 'panel'}).find_all('a')

                # iterate over car search types (first link is skipped)
                for ad_element in car_types[1:]:
                    rows.append([
                        name_manu,
                        ad_element['title'],
                        int(ad_element.span.text),
                        ad_element['href'],
                    ])
        finally:
            # close webkit even if scraping failed half-way
            await browser.close()

    # return results
    return pd.DataFrame(rows, columns=['Manufacturer', 'CarMake', 'AdCount', 'SearchHref'])

In [3]:
# Run the get_autoplius_ads coroutine to completion and keep its result:
# all available ads with counts.
df_ads = asyncio.run(get_autoplius_ads())

100%|██████████| 80/80 [00:35<00:00,  2.27it/s]


In [4]:
#@title Skelbimų statistika
# uncomment the line below if you want to analyze the data manually
# CTRL + /
# df_ads

## Detalūs skelbimai

Nuskaityti duomenis iš visų detalių skelbimų.

In [5]:
#@title Pasirinkti skelbimą

# keep only the search terms that have at least 20 live ads
_df = df_ads[df_ads["AdCount"] >= 20].copy()

# label shown in the dropdown: "<manufacturer> <model> (<ad count>)"
_ad_count_text = _df["AdCount"].astype(str)
_df["Search"] = _df["Manufacturer"] + " " + _df["CarMake"] + " (" + _ad_count_text + ")"

# dropdown that lists every label, with the first one pre-selected
_search_labels = _df["Search"].values.tolist()
w = widgets.Dropdown(
    options=_search_labels,
    value=_search_labels[0],
    description='Skelbimas:',
)

def on_change(change):
    """
    Print which entry was selected when the observed value changes.

    Input:
      change - notification dict; the keys 'type', 'name' and 'new' are
               read. Only notifications with type 'change' and name 'value'
               produce output.
    Output:
      returns None; prints a one-line message
    """
    if change['type'] != 'change' or change['name'] != 'value':
        return

    # labels end with " (<ad count>)"; cut at the LAST "(" so that names
    # which themselves contain parentheses are not truncated
    selected = change['new'].rsplit('(', 1)[0]
    print(f"Pasirinktas {selected}modelis paieškai.")

# register on_change as the widget's observer so changes are reported
w.observe(on_change)

# display widget
display(w)

Dropdown(description='Skelbimas:', options=('Alfa Romeo 159 (30)', 'Audi A1 (31)', 'Audi A3 (327)', 'Audi A4 (…

In [6]:
#@title Skelbimų nuskaitymo funkcija

# function code
async def search_autoplius_ads(url):
    """
    Open an autoplius search page and submit its search form.

    The page is loaded in a WebKit browser, the cookie banner is accepted and
    the submit button is clicked. No ad data is extracted yet: the only
    results are two debugging screenshots.

    Input:
      url - address of the search page to open
    Output:
      returns None; writes "page_status.png" (after loading the page) and
      "page_status_start.png" (after submitting the search)
    """
    # user agent sent by the browser page
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:95.0) Gecko/20100101 Firefox/95.0'

    async with async_playwright() as p:

        # create webdriver/webkit
        browser = await p.webkit.launch()

        try:
            # create new page, i.e. new tab in the browser
            page = await browser.new_page(user_agent=user_agent)

            # visit page
            await page.goto(url, timeout=60000)

            # fixed waits use asyncio.sleep instead of time.sleep so the
            # event loop (and Playwright) keeps running while we wait
            await asyncio.sleep(2)

            # save image to your environment (for debugging);
            # this line can be commented out when it is not needed
            await page.screenshot(path="page_status.png")

            # click cookie button
            await page.click("//button[@id='onetrust-accept-btn-handler']")

            # wait before interacting with the form
            await asyncio.sleep(2)

            # click submit search button
            await page.click("//button[@type='submit']")

            # wait for the search results to load
            await asyncio.sleep(5)

            # save image to your environment (for debugging);
            # this line can be commented out when it is not needed
            await page.screenshot(path="page_status_start.png")
        finally:
            # close webkit even if one of the steps above failed
            await browser.close()

    # nothing is returned yet
    return

In [7]:
# look up the search link that belongs to the label picked in the dropdown
# (the last match is used)
_selected_rows = _df[_df["Search"] == w.value]
_search_url = _selected_rows["SearchHref"].values[-1]

# run the search scraper for that link and keep whatever it returns
_search_coro = search_autoplius_ads(_search_url)
df_ads_all = asyncio.run(_search_coro)

In [8]:
# show the search URL that was selected above
_search_url

'https://m.autoplius.lt/naudoti-automobiliai?category_id=2&field_name=make_id&make_date_from=2000&make_id%5B103%5D=1400&title_page=1#quick-search'