In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
import time
from datetime import datetime, timedelta
from kafka import KafkaProducer
import json
producer = KafkaProducer(
        bootstrap_servers=['localhost:9092'],
        value_serializer=lambda x: json.dumps(x).encode('utf-8')
    )
class WebCrawler:
    def __init__(self, url):
        self.url = url
        self.driver = None
        
    def initialize_driver(self):
        try:
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            
            service = Service(ChromeDriverManager().install())
            self.driver = webdriver.Chrome(service=service, options=chrome_options)
            return True
        except Exception as e:
            print(f"Failed to initialize driver: {str(e)}")
            return False

    def start_crawling(self):
        if not self.initialize_driver():
            return None
            
        try:
            
            max_retries = 3
            for attempt in range(max_retries):
                try:
                    self.driver.get(self.url)
                    break
                except WebDriverException as e:
                    if attempt == max_retries - 1:
                        raise e
                    time.sleep(2)

            today = datetime.now()
            all_data = {}
            print(today)
            for i in range(2):  
                current_date = today + timedelta(days=i)
                date_str = current_date.strftime('%Y-%m-%d')
                print(f"\nProcessing date: {date_str}")

                date_input = WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.NAME, "flight_date"))
                )
                self.driver.execute_script(f"arguments[0].value = '{date_str}';", date_input)
                time.sleep(1)  
                search_button = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.btn-filter[type="submit"]'))
                )
                
                search_button.click()
                time.sleep(10)
                try:
                    WebDriverWait(self.driver, 10).until(
                        EC.presence_of_element_located((By.TAG_NAME, 'tbody'))
                    )
                    
                    data = self.get_crawl_data()
                    for flight in data['data']:
                        information = {
                            'date':date_str,# ngày
                            'scheduled_time':flight[0],# Giờ kế hoạch ví dụ: 16:35
                            'updated_time':flight[1],# Giờ cập nhật ví dụ: 17:40
                            'route':flight[2],# Chặng bay ví dụ: DAD-HAN
                            'flight_id':flight[4],# Mã chuyến bay ví dụ: VJ528
                            'counter':flight[5],# Quầy ví dụ: 21-28
                            'gate':flight[6],# Cổng ví dụ: 7
                            'status':flight[8]# Trạng thái: OPN/CLS
                        }
                        producer.send('flights',value=information)
                    if data:
                        all_data[date_str] = data
                        print(data)
                        print(f"Successfully collected data for {date_str}")
                    
                except TimeoutException:
                    print(f"No data found for date {date_str}")
                    continue

                time.sleep(2)  

            return all_data
            
        except Exception as e:
            print(f"An error occurred: {str(e)}")
            return None
        finally:
            self.cleanup()
    
    def get_crawl_data(self):
        max_retries = 3
        for attempt in range(max_retries):
            try:
                print('Starting data extraction...')
                
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'table.table.table-striped'))
                )
                
                headers = []
                header_elements = WebDriverWait(self.driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'table.table.table-striped thead tr th'))
                )
                for header in header_elements:
                    try:
                        headers.append(WebDriverWait(self.driver, 5).until(
                            EC.visibility_of(header)
                        ).text.strip())
                    except:
                        continue
                
                data_rows = []
                row_elements = WebDriverWait(self.driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'table.table.table-striped tbody tr'))
                )
                
                for row in row_elements:
                    try:
                        cells = WebDriverWait(row, 5).until(
                            EC.presence_of_all_elements_located((By.TAG_NAME, 'td'))
                        )
                        
                        row_data = []
                        for cell in cells:
                            try:
                                cell_text = WebDriverWait(self.driver, 5).until(
                                    EC.visibility_of(cell)
                                ).text.strip()
                                row_data.append(cell_text)
                            except:
                                row_data.append("")  
                        if row_data and any(row_data):  
                            data_rows.append(row_data)
                            
                    except Exception as row_error:
                        print(f"Error processing row: {str(row_error)}")
                        continue
                
                if headers and data_rows:  
                    return {
                        'headers': headers,
                        'data': data_rows
                    }
                else:
                    raise Exception("No data found in table")
                    
            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt == max_retries - 1:
                    print("Max retries reached, returning None")
                    return None
                time.sleep(2)  
            return None
    
    def cleanup(self):
        try:
            if self.driver:
                self.driver.quit()
        except Exception as e:
            print(f"Error during cleanup: {str(e)}")

url = "https://vietnamairport.vn/thong-tin-lich-bay"
crawler = WebCrawler(url)
result = crawler.start_crawling()


if result:
    print(result)
    


2024-12-27 21:03:35.887331

Processing date: 2024-12-27
Starting data extraction...
{'headers': ['Giờ kế hoạch\nScheduled Time', 'Giờ cập nhật\nUpdated Time', 'Chặng bay\nRoute', 'Hãng hàng không\nAirlines', 'Chuyến bay\nFlight', 'Quầy\nCounter', 'Cổng\nGate', 'Sảnh\nTerminal', 'Tình trạng\nStatus'], 'data': [['13:15', '--:--', 'DAD-BKK', '', 'VZ961', '37-39', '2', '', 'CLS'], ['18:10', '--:--', 'DAD-BKK', '', 'VZ963', '37-39', '7', '', 'CLS'], ['10:40', '--:--', 'DAD-BKK', '', 'VZ965', '37-39', '6', '', 'CLS'], ['02:30', '--:--', 'DAD-CGK', '', 'QG8731', '9-12', '3', '', 'CLS'], ['02:15', '--:--', 'DAD-CJJ', '', 'RF532', '49-52', '9', '', 'CLS'], ['01:10', '--:--', 'DAD-CJJ', '', 'TW182', '34-35', '4', '', 'CLS'], ['19:10', '--:--', 'DAD-DLI', '', 'VN1955', '', '', '', 'OPN'], ['09:40', '--:--', 'DAD-DMK', '', 'FD635', '48-50', '7', '', 'CLS'], ['12:40', '--:--', 'DAD-DMK', '', 'FD637', '49-51', '3', '', 'CLS'], ['18:50', '--:--', 'DAD-DMK', '', 'FD639', '46-48', '5', '', 'CLS'], ['09

In [3]:
import json
with open('iata.json') as f:
    iata = json.load(f)
    
iata.items()

dict_items([('AAN', {'region_name': 'Abu Zaby', 'airport': 'Al Ain International Airport'}), ('AUH', {'region_name': 'Abu Zaby', 'airport': 'Abu Dhabi International Airport'}), ('AYM', {'region_name': 'Abu Zaby', 'airport': 'Yas Island Seaplane Base'}), ('AZI', {'region_name': 'Abu Zaby', 'airport': 'Al Bateen Executive Airport'}), ('DHF', {'region_name': 'Abu Zaby', 'airport': 'Al Dhafra Air Base'}), ('XSB', {'region_name': 'Abu Zaby', 'airport': 'Sir Bani Yas Airport'}), ('ZDY', {'region_name': 'Abu Zaby', 'airport': 'Dalma Airport'}), ('FJR', {'region_name': 'Al Fujayrah', 'airport': 'Fujairah International Airport'}), ('SHJ', {'region_name': 'Ash Shariqah', 'airport': 'Sharjah International Airport'}), ('DCG', {'region_name': 'Dubayy', 'airport': 'Dubai Creek Seaplane Base'}), ('DJH', {'region_name': 'Dubayy', 'airport': 'Jebel Ali Seaplane Base'}), ('DWC', {'region_name': 'Dubayy', 'airport': 'Al Maktoum International Airport'}), ('DXB', {'region_name': 'Dubayy', 'airport': 'Dubai

In [5]:
for i in iata.items():
    print(i)
    break

('AAN', {'region_name': 'Abu Zaby', 'airport': 'Al Ain International Airport'})


In [7]:
from kafka import KafkaProducer
import json
import time

producer = KafkaProducer(
    bootstrap_servers=['localhost:9092'],
    value_serializer=lambda v: json.dumps(v).encode('utf-8')
)

# Test data
test_message = {
    "date": "2024-03-26",
    "Scheduled_Time": "10:00",
    "Updated_Time": "10:30",
    "Route": "HAN-SGN",
    "Flight": "VN123",
    "Counter": "A1",
    "Gate": "G1",
    "Status": "Delayed"
}

# Gửi message
producer.send('flights', test_message)
producer.flush()
print("Message sent successfully")

Message sent successfully
