In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
import time
from datetime import datetime,timedelta
import db4
from lxml import html
import requests

def len_of_dict(dict):
    total_elements = sum(len(elements) for elements in dict.values())
    return total_elements

def clean_dict(dict):
    for key, value in dict.items():
        cleaned_value = value.split('.', 1)[0].strip()  
        cleaned_value = ' '.join(cleaned_value.split())  
        dict[key] = cleaned_value
    return dict



def find_streams(date):
    stream_query = "select * from stream_scraping_ids where is_mailru_id_active is True"
    print(stream_query)

    streams = db4.select(stream_query)

    tv_schedules_query = f"select * from tv_schedules where start_time > '{date}'"
    print(tv_schedules_query)
    
    tv_sch = db4.select(tv_schedules_query)

    # Initialize missing_stream_ids to an empty set
    missing_stream_ids = set()

    if tv_sch:
        streams_ids = {stream['stream_id'] for stream in streams}
        tv_sch_ids = {schedule['stream_id'] for schedule in tv_sch}

        missing_stream_ids = streams_ids - tv_sch_ids

        streams = [stream for stream in streams if stream['stream_id'] in missing_stream_ids]

    return streams, missing_stream_ids



def get_daily_program(streams):
    daily_programs = {}

    for i in streams:
        print(f"{i['stream_name']} started in {datetime.now()}")

        # Initialize driver and handle any setup errors
        try:
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--disable-gpu")
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
        except Exception as e:
            print(f"Error initializing WebDriver for {i['stream_name']}: {e}")
            continue

        try:
            stream_scrape_id = i['mailru_id']
            url = f"https://tv.mail.ru/baku/channel/{stream_scrape_id}/"
            driver.get(url)
            time.sleep(5)
        except Exception as e:
            print(f"Error loading URL for {i['stream_name']}: {e}")
            driver.quit()
            continue

        try:
            spans = driver.find_elements(By.CSS_SELECTOR, "div.filter__list.js-more__list span.filter__item.js-group_date.js-more__item")
            target_date = datetime.now().strftime("%Y-%m-%d")

            date_found = False
            for span in spans:
                if span.get_attribute("data-value") == target_date:
                    ActionChains(driver).move_to_element(span).click(span).perform()
                    time.sleep(2)
                    date_found = True
                    break

            if not date_found:
                print(f"No data found for {i['stream_name']} on {target_date}. Moving to the next stream.")
                driver.quit()
                continue
        except Exception as e:
            print(f"Error interacting with date selection for {i['stream_name']}: {e}")
            driver.quit()
            continue

        try:
            times = driver.find_elements(By.CLASS_NAME, "p-programms__item__time")
            names = driver.find_elements(By.CLASS_NAME, "p-programms__item__name")

            if not times or not names:
                print(f"No program data available for {i['stream_name']} on {target_date}.")
                driver.quit()
                continue

            programs = {t.text: n.text for t, n in zip(times, names)}
            if programs:
                daily_programs[f"{i['stream_id']}"] = programs
        except Exception as e:
            print(f"Error extracting program data for {i['stream_name']}: {e}")
        finally:
            driver.quit()

        print(f"{i['stream_name']} finished in {datetime.now()}")

    return daily_programs

def scraping_cbc_sport(date):
    stream_query = f"select * from tv_schedules where stream_id=36 and start_time> '{date}'"
    cbc=db4.select(stream_query)
    programs={}
    if cbc:
        print('There are tv_schedules with this stream')
        return programs
    else:
    
        print(f"Cbc sport started in {datetime.now()}")
        
        url = "https://www.cbcsport.az/"
        response = requests.get(url)
        response.raise_for_status()
        page_content = response.content
        tree = html.fromstring(page_content)
        elements = tree.xpath("(//div[contains(@class, 'swiper-slide')][1]//ul/li)")

        
        for li in elements:
            time_element = li.xpath(".//span[contains(@class, 'd-flex align-items-center')]/text()")
            time = time_element[0].strip() if time_element else "No time found"
            content_element = li.xpath(".//p/text()")
            content = content_element[0].strip() if content_element else "No content found"
            # print(f"Time: {time}, Content: {content}")
            programs[time]=content
        print(f"Cbc sport finished in {datetime.now()}")
        
        return programs


def insert_code_to_programs(daily_programs):
    missing_dict = {}
    insert_programs_query = "INSERT INTO programs (name, stream_id) VALUES (%s, %s)"
    successful_inserts=0

    for stream in daily_programs:
        try:
            missing = set()
            select_programs_query = f"""SELECT * FROM programs WHERE stream_id={stream}"""
            df = db4.select(select_programs_query)

            for _, program_name in daily_programs[stream].items():
                if not any(program_name == record['name'] for record in df):
                    missing.add(program_name)

            missing_dict[stream] = missing
        except Exception as e:
            print(f"Error processing stream {stream}: {e}")
            continue
        total_missing=len_of_dict(missing_dict)

        for program_name in missing:
            try:
                db4.insert(insert_programs_query, (program_name, stream))
                successful_inserts += 1  # Increment only on successful insert
            except Exception as e:
                print(f"Error inserting program '{program_name}' for stream {stream}: {e}")
                
    if successful_inserts == total_missing:
        print("All programs were successfully inserted.")
    else:
        print(f"Some inserts failed. Expected: {total_missing}, Successful: {successful_inserts}")


    return missing_dict



def insert_code_to_tv_schedules(daily_programs):
    todays_date = datetime.now().date()
    target_date = todays_date.strftime('%Y-%m-%d')
    
    successful_inserts =0
    for stream in daily_programs:

        for start_time, program_name in daily_programs[stream].items():
            
            start_datetime = datetime.strptime(f'{target_date} {start_time}', '%Y-%m-%d %H:%M')
        
            insert_schedules = """
                INSERT INTO tv_schedules (start_time, program_id, stream_id)
                SELECT 
                    %s AS start_time, p.id AS program_id, %s AS stream_id
                FROM programs p
                WHERE p.name = %s
                LIMIT 1;
            """

            values = (start_datetime, stream, program_name)
            try:
                print(start_time,program_name,end='/')
                db4.insert(insert_schedules, values)
                successful_inserts += 1
            except Exception as e:
                print(f"Error inserting program '{program_name}' for stream {stream} at {start_time}: {e}")
    total_missing=len_of_dict(daily_programs)
    
    if successful_inserts == total_missing:
        print("All programs were successfully inserted.")
        print(f"Length of Total daily programs in {todays_date} is ",total_missing)
    else:
        print(f"Some inserts failed. Expected: {total_missing}, Successful: {successful_inserts}")
        
    return






In [28]:
# todays_date = datetime.now().date()-timedelta(days=0)

# streams,missing_streams=find_streams(todays_date)

# print('  ')

# daily_programs=get_daily_program(streams)

# print('  ')

# programs=scraping_cbc_sport(todays_date)
# if programs:
#     cbc_sport_stream_id='36'
    
#     daily_programs[cbc_sport_stream_id]=programs
# else:
#     print('Cbc sport scraping is not succesful')

# print('  ')

try:
    for i in daily_programs:
        daily_programs[i]= clean_dict(daily_programs[i])
except ValueError as e:
    print(e)
    
# missing_dict=insert_code_to_programs(daily_programs) 

# insert_code_to_tv_schedules(daily_programs)
    

In [1]:
#-------------------------------------------------

In [3]:
query="select * from stream_scraping_ids where is_mailru_id_active is True"
streams=db4.select(query)

In [5]:
daily_programs=get_daily_program(streams)

AzTV started in 2024-12-06 11:07:53.739671
AzTV finished in 2024-12-06 11:08:08.326176
İctimai TV started in 2024-12-06 11:08:08.326176
İctimai TV finished in 2024-12-06 11:08:32.439794
Mədəniyyət TV started in 2024-12-06 11:08:32.439794
Mədəniyyət TV finished in 2024-12-06 11:08:57.785718
Xəzər TV started in 2024-12-06 11:08:57.785718
Xəzər TV finished in 2024-12-06 11:09:20.066114
CBC started in 2024-12-06 11:09:20.066114
CBC finished in 2024-12-06 11:09:46.778630
Idman TV started in 2024-12-06 11:09:46.778630
No data found for Idman TV on 2024-12-06. Moving to the next stream.


In [9]:
programs={}
url = "https://www.cbcsport.az/"
response = requests.get(url)
response.raise_for_status()
page_content = response.content
tree = html.fromstring(page_content)
elements = tree.xpath("(//div[contains(@class, 'swiper-slide')][1]//ul/li)")


for li in elements:
    time_element = li.xpath(".//span[contains(@class, 'd-flex align-items-center')]/text()")
    time = time_element[0].strip() if time_element else "No time found"
    content_element = li.xpath(".//p/text()")
    content = content_element[0].strip() if content_element else "No content found"
    # print(f"Time: {time}, Content: {content}")
    programs[time]=content
print(f"Cbc sport finished in {datetime.now()}")

cbc_sport_stream_id='36'

daily_programs[cbc_sport_stream_id]=programs

Cbc sport finished in 2024-12-06 11:12:38.790458


In [29]:
len(df)

158

In [27]:
daily_programs

{'25': {'05:00': 'AzTV Xəbər',
  '05:30': 'Səlahəddin Əyyubi',
  '06:30': 'Bəşəri nailiyyətlər',
  '07:00': 'Telesəhər',
  '10:00': 'AzTV Xəbər',
  '10:30': 'Bəşəri nailiyyətlər',
  '11:00': 'Səlahəddin Əyyubi',
  '12:00': 'AzTV Xəbər',
  '12:30': 'Günə davam',
  '14:00': 'AzTV Xəbər',
  '15:00': 'Paytaxt: Əbdülhəmid',
  '16:00': 'AzTV Xəbər',
  '17:00': 'Hədəf',
  '18:00': 'AzTV Xəbər',
  '19:00': 'Region xəbərləri',
  '19:30': 'İqtisadiyyatın yekunu',
  '20:00': 'AzTV Xəbər',
  '21:10': 'Hədəf',
  '23:00': 'Səlahəddin Əyyubi',
  '00:00': 'AzTV Xəbər',
  '00:30': 'Bədii film',
  '02:00': 'Xəbərlər',
  '02:15': 'Bədii film',
  '02:45': 'Sənədli film',
  '03:15': 'Bədii film'},
 '28': {'06:05': 'Bəraət - № 1113. Hüseyn Cavid',
  '07:00': 'İTV Xəbər',
  '07:10': 'Sabahın xeyir, Azərbaycan!',
  '10:00': 'İTV Xəbər',
  '10:20': 'Velvet',
  '12:00': 'İTV Xəbər',
  '12:20': 'Xəbərimiz var',
  '14:00': 'İTV Xəbər',
  '14:20': 'XX Yüzillik. Faktlar, hadisələr və insanlar',
  '15:00': 'Din və c

[]

In [None]:
for i in df:
    name=i['name']
    time=i['start_time']

Bəşəri nailiyyətlər
2024-12-05 06:30:00
Bəşəri nailiyyətlər
2024-12-05 10:30:00
Günə davam
2024-12-05 12:30:00
Paytaxt: Əbdülhəmid
2024-12-05 15:00:00
AzTV Xəbər
2024-12-05 18:00:00
AzTV Xəbər
2024-12-05 20:00:00
AzTV Xəbər
2024-12-05 05:00:00
Telesəhər
2024-12-05 07:00:00
Quruluş Osman
2024-12-05 11:00:00
AzTV Xəbər
2024-12-05 14:00:00
AzTV Xəbər
2024-12-05 16:00:00
Region xəbərləri
2024-12-05 19:00:00
Medialab
2024-12-05 21:00:00
Səlahəddin Əyyubi
2024-12-05 23:00:00
Bədii film
2024-12-05 00:30:00
Bədii film
2024-12-05 02:15:00
Harlemin xaç atası
2024-12-05 06:05:00
Sabahın xeyir, Azərbaycan!
2024-12-05 07:10:00
Velvet
2024-12-05 10:20:00
Xəbərimiz var
2024-12-05 12:20:00
XX Yüzillik
2024-12-05 14:20:00
Mag
2024-12-05 15:30:00
Azərbaycan kinosu
2024-12-05 16:20:00
Ürək dalanı
2024-12-05 18:20:00
Sabaha Saxlamayaq
2024-12-05 21:00:00
Bəraət - № 1113
2024-12-05 23:00:00
Heç kim unudulmur, heç nə yaddan çıxmır
2024-12-05 01:35:00
Sabaha Saxlamayaq
2024-12-05 03:30:00
Hacı Qara
2024-12-0

In [None]:
date= datetime.now().date()
query=f"""select e.*,b.name from tv_schedules e
join programs b
on e.program_id=b.id
where e.start_time>'{date}'"""
    
df=db4.select(query)

df=pd.DataFrame(df)
df['start_time'] = pd.to_datetime(df['start_time'])  # Ensure datetime format
df['start_time_str'] = df['start_time'].dt.strftime('%H:%M')  # Extract time as string

# Identify missing entries
missing_entries = {}
for stream_id, programs in daily_programs.items():
    # Filter the DataFrame for the specific stream_id
    stream_df = df[df['stream_id'] == int(stream_id)]
    
    # Compare each program in daily_programs
    missing_programs = [
        (time, name) for time, name in programs.items()
        if not ((stream_df['start_time_str'] == time) & (stream_df['name'] == name)).any()
    ]
    
    if missing_programs:
        missing_entries[stream_id] = missing_programs

# Output missing entries
print("Missing Entries:")
for stream_id, entries in missing_entries.items():
    print(f"Stream ID: {stream_id}")
    for time, name in entries:
        print(f"  Time: {time}, Program: {name}")

Missing Entries:
Stream ID: 25
  Time: 00:00, Program: AzTV Xəbər
Stream ID: 28
  Time: 23:25, Program: La-La Lend
  Time: 04:10, Program: Rakadaroom
Stream ID: 29
  Time: 11:20, Program: Gülüşlə yoğrulan ömür
  Time: 14:10, Program: Primanın irsi
  Time: 15:20, Program: Sərgüzəşti-vəziri-xani-Lənkəran
  Time: 17:20, Program: Ədibin evi
  Time: 17:30, Program: Heykəllər danışsa (Azad qadın heykəli)
  Time: 00:00, Program: Mədəniyyət xəbərləri
Stream ID: 35
  Time: 00:00, Program: Экономика дня
Stream ID: 36
  Time: 00:00, Program: İtaliya A Seriyası, 14-cü turun oyunlarının icmalı
  Time: 13:30, Program: İtaliya A Seriyasının jurnalı (yeni)
  Time: 15:00, Program: Voleybol, Yüksək Liqa, qadınlar
  Time: 18:00, Program: Türkiyədən futbol (canlı)
  Time: 19:00, Program: Basketbol, Azərbaycan Basketbol Liqası
  Time: 21:15, Program: Futbol, İtaliya A Seriyası, 15-ci tur
  Time: 23:45, Program: Futbol, İtaliya A Seriyası, 15-ci tur


In [60]:
start_time=missing_entries["36"][2][0]
program_name=missing_entries["36"][2][1]
stream=36

In [57]:
todays_date = datetime.now().date()
target_date = todays_date.strftime('%Y-%m-%d')
start_datetime = datetime.strptime(f'{target_date} {start_time}', '%Y-%m-%d %H:%M')
start_datetime = datetime.strptime(f'{target_date} {start_time}', '%Y-%m-%d %H:%M')
        
insert_schedules = """
    INSERT INTO tv_schedules (start_time, program_id, stream_id)
    SELECT 
        %s AS start_time, p.id AS program_id, %s AS stream_id
    FROM programs p
    WHERE p.name = %s
    LIMIT 1;
"""

values = (start_datetime, stream, program_name)

In [58]:
stream

'36'

In [59]:
db4.insert(insert_schedules,values)

Connection to the PostgreSQL database


True

In [52]:
start_datetime

datetime.datetime(2024, 12, 6, 15, 0)

In [55]:
program_name

'Voleybol, Yüksək Liqa, qadınlar'