In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
import time
from datetime import datetime,timedelta
import db4
from lxml import html
import requests

def len_of_dict(dict):
    """Return the combined length of all values stored in ``dict``.

    Every value must support ``len()``; for a mapping of mappings this is
    the total number of inner entries.
    """
    count = 0
    for group in dict.values():
        count += len(group)
    return count

def clean_dict(dict):
    """Normalise every value of ``dict`` in place and return the same mapping.

    Each value is cut at its first '.', then leading/trailing whitespace is
    dropped and inner runs of whitespace are collapsed to single spaces.
    """
    for name in dict:
        head, _, _ = dict[name].partition('.')
        # split() without arguments already discards surrounding whitespace.
        dict[name] = ' '.join(head.split())
    return dict



def find_streams(date):
    """Return the active mail.ru streams that have no schedule after ``date``.

    Args:
        date: Lower bound (exclusive) for ``tv_schedules.start_time``; it is
            formatted into the SQL text with ``str()``.

    Returns:
        A ``(streams, missing_stream_ids)`` tuple: the rows of
        ``stream_scraping_ids`` whose ``stream_id`` has no ``tv_schedules``
        row after ``date``, and the set of those stream ids. The two values
        always describe the same streams, including when no schedule exists
        at all (then every active stream is missing).
    """
    stream_query = "select * from stream_scraping_ids where is_mailru_id_active is True"
    print(stream_query)

    streams = db4.select(stream_query)
    if streams is None:
        # Treat "no result" as "no active streams" instead of failing below.
        streams = []

    # NOTE(review): ``date`` is interpolated into the SQL text. It is produced
    # internally from ``datetime`` by the caller in this file; switch to a
    # bound parameter if db4.select supports one.
    tv_schedules_query = f"select * from tv_schedules where start_time > '{date}'"
    print(tv_schedules_query)

    tv_sch = db4.select(tv_schedules_query) or []

    scheduled_ids = {schedule['stream_id'] for schedule in tv_sch}
    missing_stream_ids = {stream['stream_id'] for stream in streams} - scheduled_ids

    streams = [stream for stream in streams if stream['stream_id'] in missing_stream_ids]

    return streams, missing_stream_ids



def _new_headless_chrome():
    """Start a headless Chrome WebDriver using webdriver_manager's driver binary."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)


def _scrape_todays_programs(driver, stream):
    """Scrape today's program list for one stream with an already open driver.

    Returns:
        ``None`` when the stream must be skipped (page failed to load, today's
        date tab is missing or could not be clicked, or no program items were
        found); otherwise a ``{time_text: program_name_text}`` dict, which is
        empty when reading the items raised.
    """
    stream_name = stream['stream_name']

    try:
        url = f"https://tv.mail.ru/baku/channel/{stream['mailru_id']}/"
        driver.get(url)
        time.sleep(5)  # fixed wait for the page to render
    except Exception as e:
        print(f"Error loading URL for {stream_name}: {e}")
        return None

    target_date = datetime.now().strftime("%Y-%m-%d")
    try:
        spans = driver.find_elements(By.CSS_SELECTOR, "div.filter__list.js-more__list span.filter__item.js-group_date.js-more__item")
        for span in spans:
            if span.get_attribute("data-value") == target_date:
                ActionChains(driver).move_to_element(span).click(span).perform()
                time.sleep(2)  # fixed wait for the schedule to refresh
                break
        else:
            # No date tab carries today's date.
            print(f"No data found for {stream_name} on {target_date}. Moving to the next stream.")
            return None
    except Exception as e:
        print(f"Error interacting with date selection for {stream_name}: {e}")
        return None

    try:
        times = driver.find_elements(By.CLASS_NAME, "p-programms__item__time")
        names = driver.find_elements(By.CLASS_NAME, "p-programms__item__name")

        if not times or not names:
            print(f"No program data available for {stream_name} on {target_date}.")
            return None

        # zip() pairs items positionally and stops at the shorter list.
        return {t.text: n.text for t, n in zip(times, names)}
    except Exception as e:
        print(f"Error extracting program data for {stream_name}: {e}")
        return {}


def get_daily_program(streams):
    """Scrape today's schedule from tv.mail.ru for every stream in ``streams``.

    Args:
        streams: Iterable of rows providing ``stream_name``, ``mailru_id`` and
            ``stream_id``.

    Returns:
        ``{str(stream_id): {time_text: program_name_text}}`` containing only
        the streams for which at least one program was scraped.

    A fresh headless Chrome instance is used per stream and is always closed
    exactly once, whatever happens while scraping. Failures are printed and
    the stream is skipped; they are never raised to the caller.
    """
    daily_programs = {}

    for stream in streams:
        print(f"{stream['stream_name']} started in {datetime.now()}")

        try:
            driver = _new_headless_chrome()
        except Exception as e:
            print(f"Error initializing WebDriver for {stream['stream_name']}: {e}")
            continue

        try:
            programs = _scrape_todays_programs(driver, stream)
        finally:
            # Single cleanup point: runs on success, on skip and on errors.
            driver.quit()

        if programs is None:
            continue

        if programs:
            daily_programs[f"{stream['stream_id']}"] = programs

        print(f"{stream['stream_name']} finished in {datetime.now()}")

    return daily_programs

def scraping_cbc_sport(date):
    """Scrape the CBC Sport schedule from cbcsport.az unless it already exists.

    Args:
        date: Lower bound (exclusive) for ``tv_schedules.start_time``; it is
            formatted into the SQL text with ``str()``.

    Returns:
        ``{time_text: program_text}`` read from the first swiper slide of the
        home page. The dict is empty when stream 36 already has schedules
        after ``date`` or when the page could not be downloaded.
    """
    stream_query = f"select * from tv_schedules where stream_id=36 and start_time> '{date}'"
    cbc = db4.select(stream_query)
    programs = {}
    if cbc:
        print('There are tv_schedules with this stream')
        return programs

    print(f"Cbc sport started in {datetime.now()}")

    url = "https://www.cbcsport.az/"
    try:
        # Without a timeout requests may wait indefinitely on a stalled server.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        # The caller treats an empty result as "scraping failed", so report
        # the error and let the remaining streams be processed.
        print(f"Error loading {url}: {e}")
        return programs

    tree = html.fromstring(response.content)
    elements = tree.xpath("(//div[contains(@class, 'swiper-slide')][1]//ul/li)")

    for li in elements:
        time_element = li.xpath(".//span[contains(@class, 'd-flex align-items-center')]/text()")
        if not time_element:
            # An entry without a time cannot be turned into a schedule row.
            print("Skipping Cbc sport entry without a start time")
            continue
        start_time = time_element[0].strip()
        content_element = li.xpath(".//p/text()")
        content = content_element[0].strip() if content_element else "No content found"
        programs[start_time] = content
    print(f"Cbc sport finished in {datetime.now()}")

    return programs


def insert_code_to_programs(daily_programs):
    """Insert program names that are not yet stored for their stream.

    Args:
        daily_programs: ``{stream_id: {start_time: program_name}}``.

    Returns:
        ``{stream_id: set_of_missing_program_names}`` for every stream whose
        existing programs could be read. Streams whose lookup failed are
        reported and left out.
    """
    missing_dict = {}
    insert_programs_query = "INSERT INTO programs (name, stream_id) VALUES (%s, %s)"
    successful_inserts = 0

    for stream, programs in daily_programs.items():
        try:
            # NOTE(review): ``stream`` is interpolated into the SQL text; it
            # comes from the scraper's own dict keys. Bind it as a parameter
            # if db4.select supports that.
            select_programs_query = f"""SELECT * FROM programs WHERE stream_id={stream}"""
            known_names = {record['name'] for record in db4.select(select_programs_query)}
            missing = set(programs.values()) - known_names
            missing_dict[stream] = missing
        except Exception as e:
            print(f"Error processing stream {stream}: {e}")
            continue

        for program_name in missing:
            try:
                db4.insert(insert_programs_query, (program_name, stream))
                successful_inserts += 1  # Increment only on successful insert
            except Exception as e:
                print(f"Error inserting program '{program_name}' for stream {stream}: {e}")

    # Computed after the loop so it is defined even when nothing was processed.
    total_missing = sum(len(names) for names in missing_dict.values())

    if successful_inserts == total_missing:
        print("All programs were successfully inserted.")
    else:
        print(f"Some inserts failed. Expected: {total_missing}, Successful: {successful_inserts}")

    return missing_dict



def insert_code_to_tv_schedules(daily_programs):
    """Insert today's schedule rows for every scraped program.

    Args:
        daily_programs: ``{stream_id: {"HH:MM": program_name}}``. Each time is
            combined with today's date to build ``start_time``.

    Returns:
        None. Progress and failures are printed.

    Entries whose time is not in ``HH:MM`` form are reported and skipped
    instead of aborting the remaining inserts.
    """
    todays_date = datetime.now().date()
    target_date = todays_date.strftime('%Y-%m-%d')

    # The program is looked up by name within the same stream, so a program
    # with the same name on another stream is never picked.
    insert_schedules = """
        INSERT INTO tv_schedules (start_time, program_id, stream_id)
        SELECT
            %s AS start_time, p.id AS program_id, %s AS stream_id
        FROM programs p
        WHERE p.name = %s AND p.stream_id = %s
        LIMIT 1;
    """

    successful_inserts = 0
    for stream, programs in daily_programs.items():
        for start_time, program_name in programs.items():
            try:
                start_datetime = datetime.strptime(f'{target_date} {start_time}', '%Y-%m-%d %H:%M')
            except ValueError as e:
                print(f"Skipping program '{program_name}' for stream {stream}: invalid start time '{start_time}': {e}")
                continue

            values = (start_datetime, stream, program_name, stream)
            try:
                db4.insert(insert_schedules, values)
                # NOTE(review): an INSERT ... SELECT that matches no program
                # inserts zero rows without raising, and is still counted
                # here — confirm whether db4.insert reports the row count.
                successful_inserts += 1
            except Exception as e:
                print(f"Error inserting program '{program_name}' for stream {stream} at {start_time}: {e}")

    total_missing = sum(len(programs) for programs in daily_programs.values())

    if successful_inserts == total_missing:
        print("All programs were successfully inserted.")
        print(f"Length of Total daily programs in {todays_date} is ", total_missing)
    else:
        print(f"Some inserts failed. Expected: {total_missing}, Successful: {successful_inserts}")

    return






In [2]:
# Pipeline driver: find streams without a schedule for today, scrape them,
# normalise the program names, then store programs and schedule rows.

# timedelta(days=0) leaves today's date unchanged; raise the number of days
# to move the lower bound used by the schedule queries into the past.
todays_date = datetime.now().date()-timedelta(days=0)

# Active mail.ru streams that have no tv_schedules row after todays_date.
streams,missing_streams=find_streams(todays_date)

# {str(stream_id): {time_text: program_name}} scraped from tv.mail.ru.
daily_programs=get_daily_program(streams)

# CBC Sport is scraped from its own site rather than tv.mail.ru.
programs=scraping_cbc_sport(todays_date)
if programs:
    # Same stream id (36) that scraping_cbc_sport uses in its query.
    cbc_sport_stream_id='36'
    
    daily_programs[cbc_sport_stream_id]=programs
else:
    # NOTE(review): this branch is also reached when scraping_cbc_sport
    # returns an empty dict because schedules already exist, not only when
    # scraping failed.
    print('Cbc sport scraping is not succesful')



# Normalise program names in place (cut at the first '.', collapse spaces).
try:
    for i in daily_programs:
        daily_programs[i]= clean_dict(daily_programs[i])
# NOTE(review): only ValueError is handled here; a non-string program value
# would raise AttributeError inside clean_dict instead — confirm intent.
except ValueError as e:
    print(e)
    
# Store unseen program names first so the schedule insert can look up
# program ids by name.
missing_dict=insert_code_to_programs(daily_programs) 

insert_code_to_tv_schedules(daily_programs)
    

select * from stream_scraping_ids where is_mailru_id_active is True
select * from tv_schedules where start_time > '2024-12-04'
AzTV started in 2024-12-04 17:39:24.634533
AzTV finished in 2024-12-04 17:39:42.819814
İctimai TV started in 2024-12-04 17:39:42.819814
İctimai TV finished in 2024-12-04 17:40:04.567383
Mədəniyyət TV started in 2024-12-04 17:40:04.567383
Mədəniyyət TV finished in 2024-12-04 17:40:25.436049
Xəzər TV started in 2024-12-04 17:40:25.436049
Xəzər TV finished in 2024-12-04 17:40:46.988929
CBC started in 2024-12-04 17:40:46.988929
CBC finished in 2024-12-04 17:41:08.265676
Idman TV started in 2024-12-04 17:41:08.265676
No data found for Idman TV on 2024-12-04. Moving to the next stream.
Cbc sport started in 2024-12-04 17:41:28.200298
Cbc sport finished in 2024-12-04 17:41:28.366578
All programs were successfully inserted.
Connection to the PostgreSQL database
Connection to the PostgreSQL database
Connection to the PostgreSQL database
Connection to the PostgreSQL datab